Upload 4 files
README.md CHANGED
@@ -1,14 +1,20 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: News Summarization App
+emoji: 📰
+colorFrom: blue
+colorTo: green
 sdk: streamlit
-sdk_version: 1.
+sdk_version: 1.36.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: NewsSummarizationTTS
 ---
 
-
+# News Summarization App
+
+This app fetches news articles about a company, summarizes them, analyzes sentiment, and provides a Hindi audio translation of the sentiment analysis.
+
+## Requirements
+- See `requirements.txt` for dependencies.
+
+## Backend
+- The FastAPI backend (`api.py`) runs alongside the Streamlit app to handle news fetching and processing.
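`api.py` itself is not among the uploaded files. A minimal sketch of what such a backend could look like, assuming a single GET route that wraps `utils.fetch_and_save_news`; the route path and response shape are illustrative guesses, not taken from the real `api.py`:

```python
# api.py (sketch): hypothetical FastAPI wrapper around utils.py; not the uploaded file.
from fastapi import FastAPI, HTTPException

import utils

app = FastAPI()

@app.get("/news/{company_name}")
def get_news(company_name: str):
    # Delegate fetching, summarization, and sentiment analysis to utils.py
    file_name = utils.fetch_and_save_news(company_name.strip().lower())
    if file_name is None:
        raise HTTPException(status_code=404, detail="No relevant news articles found.")
    with open(file_name, "r", encoding="utf-8") as f:
        return {"company": company_name, "analysis": f.read()}
```

Such a service would run with `uvicorn api:app`; both fastapi and uvicorn are already pinned in `requirements.txt`.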
app.py ADDED
@@ -0,0 +1,80 @@
import streamlit as st
import utils  # Import functions from utils.py
import os
from gtts import gTTS
import tempfile
import re
from deep_translator import GoogleTranslator

st.title("News Summarization and Text-to-Speech Application")

# User input for company name
company_name = st.text_input("Enter the company name:", "").strip().lower()

if st.button("Fetch News"):
    if company_name:
        # Run news extraction and analysis
        st.write(f"Fetching news for **{company_name}**...")

        # Call the function from utils.py (returns None when nothing was found)
        file_name = utils.fetch_and_save_news(company_name)

        if file_name and os.path.exists(file_name):
            st.success(f"Data saved in **{file_name}**")

            # Read the file once: display its content and reuse it below
            with open(file_name, "r", encoding="utf-8") as file:
                text_content = file.read()
            st.text_area("News Analysis", text_content, height=400)

            # Provide a download button for the text file
            with open(file_name, "rb") as file:
                st.download_button(
                    label="Download Text File",
                    data=file,
                    file_name=file_name,
                    mime="text/plain"
                )

            # Extract only the Final Sentiment Analysis line
            final_sentiment_line = ""
            match = re.search(r'"Final Sentiment Analysis": "([^"]+)"', text_content)
            if match:
                final_sentiment_line = match.group(1)

            if final_sentiment_line:
                st.subheader("Hindi Audio for Final Sentiment Analysis")

                try:
                    # First translate the English text to Hindi using deep_translator
                    translator = GoogleTranslator(source='en', target='hi')
                    hindi_text = translator.translate(final_sentiment_line)

                    # Create Hindi audio from the translated text
                    tts = gTTS(text=hindi_text, lang='hi', slow=False)

                    # Save the audio in a temporary file (close the handle first
                    # so gTTS can write to the path on all platforms)
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
                        temp_audio_path = temp_audio_file.name
                    tts.save(temp_audio_path)

                    # Provide a download button for the audio
                    with open(temp_audio_path, "rb") as audio_file:
                        audio_data = audio_file.read()
                        st.download_button(
                            label="Download Hindi Audio",
                            data=audio_data,
                            file_name=f"{company_name}_sentiment_hindi.mp3",
                            mime="audio/mpeg"
                        )
                except Exception as e:
                    st.error(f"Error generating Hindi audio: {str(e)}")
            else:
                st.warning("Could not find Final Sentiment Analysis in the text.")
        else:
            st.error("No relevant news articles found.")
    else:
        st.warning("Please enter a company name.")
requirements.txt ADDED
@@ -0,0 +1,20 @@
requests==2.32.3
beautifulsoup4==4.13.3
transformers==4.49.0
torch==2.2.2
keybert==0.9.0
spacy==3.8.3
nltk==3.9.1
groq==0.18.0
httpx==0.23.0
sentencepiece==0.2.0
streamlit==1.43.2
fastapi==0.115.11
pydantic==2.10.6
uvicorn==0.34.0
deep-translator==1.11.4
gtts==2.5.4
scikit-learn==1.6.1
protobuf==3.20.3
sentence-transformers==3.4.1
lxml  # needed by BeautifulSoup's "xml" parser in utils.py
utils.py ADDED
@@ -0,0 +1,409 @@
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import threading
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from keybert import KeyBERT
import queue
from collections import defaultdict
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from groq import Groq
import json
import os
import re

nltk.download('vader_lexicon')

# Initialize sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Load models once
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
sentiment_analyzer = pipeline("sentiment-analysis")
kw_model = KeyBERT()

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    print("Downloading 'en_core_web_md' model...")
    import spacy.cli
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

# Initialize Groq client (read the key from the environment; never hard-code API keys)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# RSS Feeds
rss_feeds = [
    # Technology-focused feeds (general tech news that may cover the target company)
    "https://feeds.bbci.co.uk/news/technology/rss.xml",  # BBC Technology
    "https://www.cnbc.com/id/19854910/device/rss/rss.html",  # CNBC Tech
    "https://www.theverge.com/rss/index.xml",  # The Verge
    "https://feeds.arstechnica.com/arstechnica/index",  # Ars Technica
    "https://www.engadget.com/rss.xml",  # Engadget
    "https://techcrunch.com/feed/",  # TechCrunch
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",  # NYT Technology
    "https://www.wired.com/feed/rss",  # Wired
    "https://www.zdnet.com/news/rss.xml",  # ZDNet News
    "https://www.cnet.com/rss/news/",  # CNET News
    "https://www.digitaltrends.com/feed/",  # Digital Trends
    "https://www.techmeme.com/feed.xml",  # Techmeme
    "https://www.technologyreview.com/feed/",  # MIT Technology Review
    "https://www.pcworld.com/feed",  # PCWorld
    "https://venturebeat.com/feed/",  # VentureBeat

    # Business and finance feeds (more likely to cover the target company)
    "https://feeds.bbci.co.uk/news/business/rss.xml",  # BBC Business
    "https://www.cnbc.com/id/10001147/device/rss/rss.html",  # CNBC Business
    "https://www.economist.com/business/rss.xml",  # The Economist Business
    "https://www.ft.com/companies/financials/rss",  # Financial Times Financials
    "https://www.ft.com/rss/companies/technology",  # Financial Times Tech Companies
    "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml",  # Wall Street Journal US Business
    "https://www.forbes.com/money/feed/",  # Forbes Money
    "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml",  # Reuters Business
    "https://www.bloomberg.com/feed/podcasts/markets.xml",  # Bloomberg Markets
    "https://finance.yahoo.com/news/rssindex",  # Yahoo Finance News
    "https://www.nasdaq.com/feed/rssoutbound",  # Nasdaq News
    "https://www.marketwatch.com/rss/topstories",  # MarketWatch Top Stories
    "https://www.investing.com/rss/news.rss",  # Investing.com News

    # General news (reliable sources that may cover the target company)
    "https://feeds.bbci.co.uk/news/rss.xml",  # BBC News
    "https://www.aljazeera.com/xml/rss/all.xml",  # Al Jazeera
    "https://www.theguardian.com/world/rss",  # The Guardian World
    "https://feeds.npr.org/1001/rss.xml",  # NPR News
    "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",  # NYT Home Page
    "https://apnews.com/hub/business?format=rss",  # Associated Press Business
    "https://feeds.washingtonpost.com/rss/business",  # Washington Post Business
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Locks for thread safety
model_lock = threading.Lock()
sentiment_lock = threading.Lock()
keyword_lock = threading.Lock()

def summarize_t5(text, max_length=100, min_length=30):
    with model_lock:
        inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(
            inputs.input_ids,
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def analyze_sentiment(text):
    with sentiment_lock:
        result = sentiment_analyzer(text[:512])[0]
        label = result["label"].lower()
        return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"

def extract_keywords(text):
    with keyword_lock:
        return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])

def process_article_content(article_data):
    # Unpack outside the try block so `title` is defined in the error handler
    title, link, content, company_name = article_data
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            summary_future = executor.submit(summarize_t5, content)
            sentiment_future = executor.submit(analyze_sentiment, content)
            keywords_future = executor.submit(extract_keywords, content)
            summary_text = summary_future.result()
            sentiment = sentiment_future.result()
            keywords = keywords_future.result()
        return {
            "title": title,
            "link": link,
            "summary": summary_text,
            "sentiment": sentiment,
            "keywords": keywords
        }
    except Exception as e:
        print(f"❌ Error processing article {title}: {e}")
        return None

def fetch_article_content(article_info, company_name, article_limit_reached):
    title, link, description = article_info
    try:
        if article_limit_reached.is_set():
            return None
        if company_name.lower() in title.lower() or (description and company_name.lower() in description.lower()):
            article_response = requests.get(link, headers=headers, timeout=10)
            article_response.raise_for_status()
            article_soup = BeautifulSoup(article_response.content, "html.parser")
            content = "\n".join(p.text for p in article_soup.find_all("p"))
            if company_name.lower() in title.lower() or company_name.lower() in content.lower():
                print(f"✅ Found article: {title}")
                return (title, link, content, company_name)
    except requests.RequestException as e:
        print(f"❌ Failed to retrieve content for: {title} - {e}")
    return None

def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
    try:
        if article_limit_reached.is_set():
            return
        response = requests.get(rss_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")  # the "xml" parser requires lxml
        articles = soup.find_all("item")
        article_infos = [(article.title.text if article.title else "",
                          article.link.text if article.link else "",
                          article.description.text if article.description else "")
                         for article in articles if article.title and article.link]
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(fetch_article_content, info, company_name, article_limit_reached)
                       for info in article_infos]
            for future in concurrent.futures.as_completed(futures):
                if article_limit_reached.is_set():
                    return
                result = future.result()
                if result:
                    article_queue.put(result)
    except requests.RequestException as e:
        print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")

def get_coverage_differences(articles, company_name):
    """Fetch coverage differences using the Groq API."""
    articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
                                  for i, a in enumerate(articles)])
    prompt = f"""
Analyze the following articles about {company_name} and generate a comparative coverage analysis:
1. Compare articles based on their main topics.
2. Identify coverage differences between positive and negative articles.
3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.

Articles:
{articles_summary}

Generate a JSON output in the following format:
{{
    "Coverage Differences": [
        {{
            "Comparison": "Summary of key differences between two articles.",
            "Impact": "Explanation of how these differences affect {company_name}'s market perception."
        }}
    ]
}}
"""
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )
        coverage_diff = ""
        for chunk in completion:
            coverage_diff += chunk.choices[0].delta.content or ""

        text = coverage_diff.strip()
        # The model is expected to wrap its answer in a fenced ```json ... ``` block
        pattern = r'```json\s*([\s\S]*?)\s*```'
        match = re.search(pattern, text)

        if match:
            json_str = match.group(1)
            try:
                json_dict = json.loads(json_str)
                return json.dumps(json_dict, indent=4)
            except json.JSONDecodeError as e:
                return f"Error: Invalid JSON format - {str(e)}"
        else:
            return "Error: No JSON content found between ```json and ``` markers"
    except Exception as e:
        return f"Error in Groq API call: {str(e)}"

def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
    keyword_clusters = defaultdict(list)
    for article in processed_articles:
        keywords = article["keywords"].split(", ")
        for keyword in keywords:
            if not nlp(keyword).has_vector:
                continue
            added = False
            for cluster_key in list(keyword_clusters.keys()):
                if nlp(keyword).similarity(nlp(cluster_key)) >= similarity_threshold:
                    keyword_clusters[cluster_key].append(keyword)
                    added = True
                    break
            if not added:
                keyword_clusters[keyword].append(keyword)
    deduplicated_clusters = {min(keywords, key=len): keywords for cluster_key, keywords in keyword_clusters.items()}
    common_topics = []
    article_keyword_sets = [set(a["keywords"].split(", ")) for a in processed_articles]
    for representative, cluster in deduplicated_clusters.items():
        articles_with_cluster = sum(1 for keyword_set in article_keyword_sets
                                    if any(kw in keyword_set for kw in cluster))
        if articles_with_cluster >= min_articles:
            common_topics.append(representative)
    final_common_topics = []
    for topic in common_topics:
        if not nlp(topic).has_vector:
            final_common_topics.append(topic)
            continue
        is_similar = False
        for added_topic in list(final_common_topics):
            if nlp(topic).similarity(nlp(added_topic)) >= similarity_threshold:
                is_similar = True
                # Keep the shorter of two near-duplicate topics
                if len(topic) < len(added_topic):
                    final_common_topics.remove(added_topic)
                    final_common_topics.append(topic)
                break
        if not is_similar:
            final_common_topics.append(topic)
    return final_common_topics

def comparative_analysis(processed_articles, company_name):
    sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
    all_keywords = []
    for idx, article in enumerate(processed_articles):
        sentiment_summary[article["sentiment"]] += 1
        keywords = set(article["keywords"].split(", "))
        all_keywords.append((idx, keywords))
    common_topics = similarity_based_common_topics(processed_articles)
    unique_topics = {}
    for idx, topics in all_keywords:
        unique = topics - set(common_topics)
        deduplicated_unique = set()
        for topic in unique:
            if not nlp(topic).has_vector:
                deduplicated_unique.add(topic)
                continue
            is_similar = False
            for added_topic in list(deduplicated_unique):
                if nlp(topic).similarity(nlp(added_topic)) >= 0.8:
                    is_similar = True
                    if len(topic) < len(added_topic):
                        deduplicated_unique.remove(added_topic)
                        deduplicated_unique.add(topic)
                    break
            if not is_similar:
                deduplicated_unique.add(topic)
        unique_topics[f"Unique Topics in Article {idx+1}"] = deduplicated_unique
    final_sentiment = max(sentiment_summary, key=sentiment_summary.get)

    # Add a stock outlook statement based on the dominant sentiment
    if final_sentiment == "Positive":
        sentiment_statement = (f"{company_name}'s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This positive sentiment suggests potential stock growth as investor confidence may increase.")
    elif final_sentiment == "Negative":
        sentiment_statement = (f"{company_name}'s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This negative sentiment suggests potential stock decline as investor confidence may weaken.")
    else:  # Neutral
        sentiment_statement = (f"{company_name}'s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This neutral sentiment suggests limited immediate impact on stock value, with potential for stability unless new developments shift perceptions.")

    return {
        "Topic Overlap": {"Common Topics": common_topics, **unique_topics},
        "Final Sentiment Analysis": sentiment_statement
    }

def fetch_and_save_news(company_name):
    if not company_name:
        print("❌ Error: Company name is required")
        return None
    file_name = f"{company_name}_news.txt"
    articles = []
    article_count = 0
    article_limit = 10
    print(f"🔍 Starting parallel fetching for company: {company_name}...")
    article_queue = queue.Queue()
    article_limit_reached = threading.Event()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as fetch_executor:
        fetch_futures = [fetch_executor.submit(fetch_articles_from_rss, url, company_name, article_queue, article_limit_reached)
                         for url in rss_feeds]
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as process_executor:
            processing_futures = []
            # Drain the queue while the RSS fetchers are still running
            while article_count < article_limit and (not article_queue.empty() or not all(f.done() for f in fetch_futures)):
                try:
                    article_data = article_queue.get(timeout=0.1)
                    future = process_executor.submit(process_article_content, article_data)
                    processing_futures.append(future)
                except queue.Empty:
                    continue
            for future in concurrent.futures.as_completed(processing_futures):
                if article_count >= article_limit:
                    article_limit_reached.set()
                    break
                result = future.result()
                if result:
                    articles.append(result)
                    article_count += 1
                    print(f"📊 Processed {article_count}/{article_limit} articles")
                    if article_count >= article_limit:
                        article_limit_reached.set()
                        print(f"✅ Reached article limit of {article_limit}. Stopping search.")
                        break
    articles = articles[:article_limit]
    if not articles:
        print(f"❌ No relevant articles found for company: {company_name}")
        return None
    print(f"✅ Saving {len(articles)} articles to {file_name}")
    analysis_result = comparative_analysis(articles, company_name)
    coverage_differences = get_coverage_differences(articles, company_name)
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles:
        sentiment_distribution[article["sentiment"]] += 1
    formatted_articles = [{"Title": article["title"], "Summary": article["summary"],
                           "Sentiment": article["sentiment"], "Topics": article["keywords"].split(", ")}
                          for article in articles]
    output_data = {
        "Company": company_name,
        "Articles": formatted_articles,
        "Comparative Sentiment Score": {"Sentiment Distribution": sentiment_distribution},
        "Coverage Differences": coverage_differences,
        "Topic Overlap": {
            "Common Topics": analysis_result['Topic Overlap']['Common Topics'],
            **{k: list(v) for k, v in analysis_result['Topic Overlap'].items() if k != "Common Topics"}
        },
        "Final Sentiment Analysis": analysis_result['Final Sentiment Analysis']
    }
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(f'"Company": "{output_data["Company"]}",\n')
        file.write('"Articles": [\n')
        for i, article in enumerate(output_data["Articles"]):
            file.write('{\n')
            file.write(f'"Title": "{article["Title"]}",\n')
            file.write(f'"Summary": "{article["Summary"]}",\n')
            file.write(f'"Sentiment": "{article["Sentiment"]}",\n')
            file.write(f'"Topics": {article["Topics"]}\n')
            file.write('}' + (',\n' if i < len(output_data["Articles"]) - 1 else '\n'))
        file.write('],\n')
        file.write('"Comparative Sentiment Score": {\n')
        file.write('"Sentiment Distribution": {\n')
        for i, (sentiment, count) in enumerate(output_data["Comparative Sentiment Score"]["Sentiment Distribution"].items()):
            file.write(f'"{sentiment}": {count}' + (',' if i < 2 else '') + '\n')
        file.write('}\n')
        file.write('},\n')
        file.write(f'{output_data["Coverage Differences"]},\n')
        file.write('"Topic Overlap": {\n')
        file.write(f'"Common Topics": {output_data["Topic Overlap"]["Common Topics"]},\n')
        for i, (key, value) in enumerate([(k, v) for k, v in output_data["Topic Overlap"].items() if k != "Common Topics"]):
            file.write(f'"{key}": {value}' + (',\n' if i < len(output_data["Topic Overlap"]) - 2 else '\n'))
        file.write('},\n')
        file.write(f'"Final Sentiment Analysis": "{output_data["Final Sentiment Analysis"]}"\n')
    print("\nOutput format:")
    with open(file_name, "r", encoding="utf-8") as file:
        print(file.read())
    print("✅ File saved successfully!")
    return file_name

if __name__ == "__main__":
    company_name = input("Enter company name to search for (e.g., Tesla): ")
    fetch_and_save_news(company_name)