Code Library: Web Scraping + AI (Example: News Extraction & Analysis)

 



1️⃣ Scraping News Websites (Using requests & BeautifulSoup)

import requests
from bs4 import BeautifulSoup

# Target URL for the scrape.
url = "https://www.bbc.com/news"

# Fetch the page content. A timeout keeps the request from hanging forever,
# and raise_for_status() surfaces HTTP errors (404, 500, ...) instead of
# silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract headlines (BBC uses <h3> tags for main news items).
headlines = soup.find_all("h3")

# Print the first 10 headlines, numbered from 1.
for idx, headline in enumerate(headlines[:10], start=1):
    print(f"{idx}. {headline.text.strip()}")

Explanation:

  • requests.get(url): Fetch HTML content.

  • BeautifulSoup(response.text, 'html.parser'): Parse HTML.

  • soup.find_all("h3"): Extract all <h3> elements (usually news titles).


2️⃣ Cleaning News Text

# Strip whitespace and unnecessary characters
cleaned_headlines = [headline.text.strip() for headline in headlines[:10]]
print(cleaned_headlines)

Explanation:

  • strip() removes extra spaces.

  • Results in a clean list of headlines for further analysis.


3️⃣ Sentiment Analysis Using TextBlob

from textblob import TextBlob

# Score each cleaned headline with TextBlob's sentiment analyzer.
for news in cleaned_headlines:
    analysis = TextBlob(news)
    print(f"News: {news}")
    # polarity ranges from -1.0 (most negative) to +1.0 (most positive).
    print(f"Sentiment Polarity: {analysis.sentiment.polarity}")
    print("-" * 50)

Explanation:

  • TextBlob provides polarity (-1 negative, +1 positive) and subjectivity.

  • Allows classification of news sentiment: positive, negative, or neutral.


4️⃣ Keyword Extraction Using RAKE

from rake_nltk import Rake

r = Rake()  # Initialize RAKE once and reuse it for every headline.

# Extract ranked keyword phrases for each cleaned headline.
for news in cleaned_headlines:
    r.extract_keywords_from_text(news)
    keywords = r.get_ranked_phrases()  # phrases, best-ranked first
    print(f"News: {news}")
    print(f"Keywords: {keywords}")
    print("-" * 50)

Explanation:

  • RAKE automatically extracts important words/phrases.

  • Useful for identifying main topics or entities in news headlines.


5️⃣ Summarization Using OpenAI GPT

from openai import OpenAI

# NOTE(security): never commit a real API key — load it from an environment
# variable (e.g. os.environ["OPENAI_API_KEY"]) in production code.
client = OpenAI(api_key="YOUR_API_KEY")

# Ask the model for a one-sentence summary of each headline.
for news in cleaned_headlines:
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a news summarizer."},
            {"role": "user", "content": f"Summarize this news in one sentence: {news}"},
        ],
    )
    summary = response.choices[0].message.content
    print(f"Original: {news}")
    print(f"Summary: {summary}")
    print("-" * 50)

Explanation:

  • Uses GPT API to automatically summarize headlines.

  • Can be combined with sentiment and keyword extraction for full news analysis.


6️⃣ Combining Everything in a Pipeline

# Full pipeline: Scrape -> Clean -> Analyze -> Extract Keywords -> Summarize

def scrape_news(url, max_news=10):
    """Fetch *url* and return up to *max_news* cleaned <h3> headline strings.

    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    response = requests.get(url, timeout=10)  # timeout: don't hang forever
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [h.text.strip() for h in soup.find_all("h3")[:max_news]]


def analyze_news(news_list):
    """Print sentiment polarity, RAKE keywords, and an AI summary per headline."""
    from textblob import TextBlob
    from rake_nltk import Rake
    from openai import OpenAI

    r = Rake()
    # NOTE(security): prefer os.environ["OPENAI_API_KEY"] over a hard-coded key.
    client = OpenAI(api_key="YOUR_API_KEY")
    for news in news_list:
        # Sentiment: polarity in [-1.0, +1.0]
        sentiment = TextBlob(news).sentiment.polarity
        # Keywords: ranked phrases extracted by RAKE
        r.extract_keywords_from_text(news)
        keywords = r.get_ranked_phrases()
        # AI summary via the Chat Completions API
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "Summarize news in one sentence."},
                {"role": "user", "content": news},
            ],
        )
        summary = response.choices[0].message.content
        print(f"News: {news}")
        print(f"Sentiment: {sentiment}")
        print(f"Keywords: {keywords}")
        print(f"Summary: {summary}")
        print("-" * 70)


# Example usage — guarded so importing this module doesn't trigger network calls.
if __name__ == "__main__":
    url = "https://www.bbc.com/news"
    headlines = scrape_news(url)
    analyze_news(headlines)

Explanation:

  • A full modular pipeline to scrape, clean, analyze sentiment, extract keywords, and summarize.

  • Can be extended to multiple websites, store results in CSV, or build a news dashboard.

Post a Comment

0 Comments