# Theme_Analyze-2 / app.py
# (Hugging Face Spaces page header — "Sooteemon's picture / Update app.py /
#  cc35bd9 verified" — scrape residue kept as a comment so the file parses.)
import streamlit as st
import requests
from transformers import pipeline
import pandas as pd
from datetime import datetime, timedelta, timezone
import calendar
# --- Imports for Yahoo Scraper ---
import feedparser
from urllib.parse import quote
from bs4 import BeautifulSoup
# --- 1. Page configuration & API key ---
st.set_page_config(page_title="📰 News Theme Analysis", layout="wide")

import os

# SECURITY NOTE(review): the NewsAPI key was hard-coded here. Prefer the
# NEWS_API_KEY environment variable; the literal is kept only as a
# backward-compatible fallback so existing deployments keep working.
# This key is now public — rotate it and remove the fallback.
API_KEY = os.getenv("NEWS_API_KEY", "88bc396d4eab4be494a4b86ec842db47")
NEWS_API_URL = "https://newsapi.org/v2/everything"
# --- 2. Model loading (upgraded) ---
@st.cache_resource
def load_model():
    """Load and cache the zero-shot theme classifier.

    Returns:
        The transformers zero-shot-classification pipeline on success, or
        ``None`` when loading fails (callers guard on a falsy classifier).
    """
    classifier = None
    try:
        print("Loading model: MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
        classifier = pipeline(
            "zero-shot-classification",
            model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
        )
        print("Model loaded successfully!")
    except Exception as e:
        # Logged to stdout only; the UI reports the None result separately.
        print(f"Error loading model: {e}")
    return classifier
# Instantiate the cached classifier once at import time; the UI shows an
# error and refuses to analyze when this is None.
classifier = load_model()

# Candidate labels for zero-shot classification of headlines.
theme_labels = [
    "Earnings & Finance",
    "Product & Innovation",
    "Legal & Regulation",
    "Partnership & Deals",
    "Stock Price Movement",
    "Executive Changes",
]
# --- 3. News fetcher (method 1: NewsAPI) ---
@st.cache_data(ttl=3600)
def fetch_news_from_api(topic, api_key):
    """Fetch up to 20 recent articles about *topic* from NewsAPI.org.

    Queries a fixed allow-list of trusted sources over the last 7 days.

    Args:
        topic: Search keyword (e.g. a company name or ticker symbol).
        api_key: NewsAPI.org key, sent via the ``X-Api-Key`` header.

    Returns:
        A list of dicts with ``title``, ``summary``, ``link`` and
        ``published`` keys; an empty list on any request/parse error
        (the error is surfaced via ``st.error``).
    """
    headers = {'X-Api-Key': api_key}
    from_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
    trusted_sources = "reuters,bloomberg,the-wall-street-journal,marketwatch,cnbc"
    params = {
        'q': topic,
        'sortBy': 'publishedAt',
        'pageSize': 20,
        'from': from_date,
        'sources': trusted_sources,
    }
    try:
        # Timeout added: without one, a stalled connection hangs the app.
        response = requests.get(
            NEWS_API_URL, headers=headers, params=params, timeout=15
        )
        response.raise_for_status()
        articles = response.json().get('articles', [])
        news_list = []
        for article in articles:
            # NewsAPI marks withdrawn articles with a literal "[Removed]" title.
            if article.get('title') == "[Removed]":
                continue
            news_list.append({
                'title': article.get('title'),
                # NewsAPI returns description as an explicit null for some
                # articles; dict.get's default only covers a *missing* key,
                # so use `or` to catch None as well.
                'summary': article.get('description') or 'No description available.',
                'link': article.get('url'),
                'published': article.get('publishedAt'),
            })
        return news_list
    except Exception as e:
        st.error(f"NewsAPI Error: {e}")
        return []
# --- 4. News fetcher (method 2: Yahoo Finance RSS) ---
def _parse_feed(url, max_articles=10):
    """Parse an RSS feed, returning up to *max_articles* items from the last 7 days.

    Each item is a dict with ``title``, ``link``, ``summary`` (HTML stripped,
    truncated to 300 chars) and ``published`` (ISO-8601, UTC) keys. Entries
    without a parseable publish date are skipped; feed-level errors are
    logged and yield whatever was collected so far.
    """
    collected = []
    try:
        cutoff_utc = datetime.now(timezone.utc) - timedelta(days=7)
        parsed = feedparser.parse(url, agent='Mozilla/5.0')
        for entry in parsed.entries:
            time_struct = entry.get('published_parsed')
            if not time_struct:
                continue
            try:
                # published_parsed is a UTC time.struct_time; timegm keeps it UTC.
                entry_utc = datetime.fromtimestamp(
                    calendar.timegm(time_struct), tz=timezone.utc
                )
            except Exception:
                continue
            if entry_utc < cutoff_utc:
                continue
            summary_text = BeautifulSoup(
                entry.get('summary', ''), 'html.parser'
            ).get_text()
            collected.append({
                'title': entry.get('title', 'No title'),
                # Google News wraps the real target behind '&url='; keep the tail.
                'link': entry.get('link', '').split('&url=')[-1],
                'summary': summary_text[:300],
                'published': entry_utc.isoformat(),
            })
            if len(collected) >= max_articles:
                break
    except Exception as e:
        print(f"Feedparser Error: {e}")
    return collected
@st.cache_data(ttl=3600)
def search_yahoo_news(keyword, max_articles=10):
    """Search Yahoo Finance news for *keyword* via the Google News RSS proxy.

    Args:
        keyword: Search term; URL-quoted before being embedded in the query.
        max_articles: Upper bound on returned items.

    Returns:
        A list of article dicts (see ``_parse_feed``); empty list on error.
    """
    try:
        query = quote(keyword)
        feed_url = (
            "https://news.google.com/rss/search?q="
            f"{query}+site:finance.yahoo.com&hl=en-US&gl=US&ceid=US:en"
        )
        return _parse_feed(feed_url, max_articles)
    except Exception as e:
        st.error(f"Yahoo Search Error: {e}")
        return []
# --- 5. Analysis ---
def analyze_themes(news_list, model, labels):
    """Classify each article title into one of *labels* via zero-shot NLI.

    Args:
        news_list: Iterable of article dicts; items without a truthy
            ``title`` are skipped.
        model: A zero-shot-classification pipeline, callable as
            ``model(text, candidate_labels=...)``; falsy when unavailable.
        labels: Candidate theme labels passed to the model.

    Returns:
        A DataFrame of the classified articles with added ``theme`` (top
        label) and ``confidence`` (its score) columns; an empty DataFrame
        when the model is missing or every title fails to classify.
    """
    if not model:
        st.error("Model failed to load. Cannot analyze themes.")
        return pd.DataFrame()
    results = []
    for news in news_list:
        title = news.get('title', '')
        if not title:
            continue
        try:
            analysis = model(title, candidate_labels=labels)
            # Copy before enriching so the caller's dicts are not mutated
            # as a side effect (the original wrote into the input items).
            enriched = dict(news)
            enriched['theme'] = analysis['labels'][0]
            enriched['confidence'] = analysis['scores'][0]
            results.append(enriched)
        except Exception as e:
            print(f"Error analyzing title '{title}': {e}")
    return pd.DataFrame(results)
# --- 6. UI ---
st.title("📰 News Theme Analyzer")
st.markdown("Analyzes news from trusted sources (Reuters, Bloomberg, Yahoo...)")

topic = st.text_input("Enter a keyword or stock symbol:", "NVIDIA")

if st.button("Analyze News"):
    if not topic:
        st.warning("Please enter a topic.")
    elif not classifier:
        st.error("Model is not available. Please check logs.")
    else:
        with st.spinner(f"Fetching and analyzing news for '{topic}'..."):
            # Merge both sources, then de-duplicate by title keeping the
            # first occurrence (NewsAPI results take precedence).
            news_api = fetch_news_from_api(topic, API_KEY)
            news_yahoo = search_yahoo_news(topic)
            all_news = news_api + news_yahoo

            seen_titles = set()
            unique_news_items = []
            for item in all_news:
                if item['title'] not in seen_titles:
                    unique_news_items.append(item)
                    seen_titles.add(item['title'])

            if unique_news_items:
                df = analyze_themes(unique_news_items, classifier, theme_labels)
                if df.empty:
                    st.warning("Analysis failed, though news was fetched. Check logs.")
                else:
                    # Aggregate view: share of each theme across all articles.
                    st.subheader("📊 Theme Distribution Summary")
                    theme_counts = df['theme'].value_counts()
                    total_articles = len(df)
                    summary_data = [
                        {
                            "Theme": theme,
                            "Percentage": f"{(count / total_articles) * 100:.1f}%",
                            "Articles": f"{count} / {total_articles}",
                        }
                        for theme, count in theme_counts.items()
                    ]
                    st.dataframe(pd.DataFrame(summary_data), use_container_width=True)

                    # Per-article detail view.
                    st.markdown("---")
                    st.subheader(f"📰 Analysis Results (Found {len(df)} unique articles)")
                    for _, row in df.iterrows():
                        st.markdown(f"### [{row['title']}]({row['link']})")
                        try:
                            date_str = pd.to_datetime(row['published']).strftime("%Y-%m-%d %H:%M")
                        except (ValueError, TypeError):
                            # was a bare `except:` — catch only parse failures
                            # so KeyboardInterrupt etc. still propagate.
                            date_str = "N/A"
                        st.markdown(
                            f"**Theme:** {row['theme']} (Confidence: {row['confidence']:.2f}) \n"
                            f"**Published:** {date_str}"
                        )
                        st.markdown(f"**Summary:** {row['summary']}")
                        st.markdown("---")
            else:
                st.error("No news found from any source.")