Spaces:

Sasmitah
/

NewsSummarizationTTS

Sleeping

App Files Community

Sasmita Harini committed on Mar 23, 2025

Commit

5b0e184

1 Parent(s): 1c7a511

Run FastAPI as subprocess in app.py

Browse files

Files changed (2) hide show

app.py +6 -4
utils.py +25 -25

app.py CHANGED Viewed

@@ -6,12 +6,12 @@ import tempfile
 import re
 from deep_translator import GoogleTranslator
-st.title("News Summarization and Text-to-Speech Application")
-# User input for company name
 company_name = st.text_input("Enter the company name:", "").strip().lower()
-if st.button("Fetch News"):
     if company_name:
         # Run news extraction and analysis
         st.write(f"Fetching news for **{company_name}**...")
@@ -61,6 +61,7 @@ if st.button("Fetch News"):
                     tts.save(temp_audio_file.name)
                     # Provide download button for the audio
                     with open(temp_audio_file.name, "rb") as audio_file:
                         audio_data = audio_file.read()
@@ -77,4 +78,5 @@ if st.button("Fetch News"):
         else:
             st.error("No relevant news articles found.")
     else:
-        st.warning("Please enter a company name.")

 import re
 from deep_translator import GoogleTranslator
+ st.title("News Summarization and Text-to-Speech Application")
+ # User input for company name
 company_name = st.text_input("Enter the company name:", "").strip().lower()
+ if st.button("Fetch News"):
     if company_name:
         # Run news extraction and analysis
         st.write(f"Fetching news for **{company_name}**...")
                     tts.save(temp_audio_file.name)
                     # Provide download button for the audio
                     with open(temp_audio_file.name, "rb") as audio_file:
                         audio_data = audio_file.read()
         else:
             st.error("No relevant news articles found.")
     else:
+        st.warning("Please enter a company name.")
+requirements.txt

utils.py CHANGED Viewed

@@ -14,18 +14,18 @@ from groq import Groq
 import json
 import re
-nltk.download('vader_lexicon')
-# Initialize sentiment analyzer
 sid = SentimentIntensityAnalyzer()
-# Load models once
 tokenizer = T5Tokenizer.from_pretrained("t5-small")
 model = T5ForConditionalGeneration.from_pretrained("t5-small")
 sentiment_analyzer = pipeline("sentiment-analysis")
 kw_model = KeyBERT()
-# Load spaCy model
 try:
     nlp = spacy.load("en_core_web_md")
 except OSError:
@@ -34,10 +34,10 @@ except OSError:
     spacy.cli.download("en_core_web_md")
     nlp = spacy.load("en_core_web_md")
-# Initialize Groq client
 client = Groq(api_key="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")
-# RSS Feeds
 rss_feeds = [
     # Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
     "https://feeds.bbci.co.uk/news/technology/rss.xml",  # BBC Technology
@@ -56,7 +56,7 @@ rss_feeds = [
     "https://www.pcworld.com/feed",  # PCWorld
     "https://venturebeat.com/feed/",  # VentureBeat
-    # Business and Finance feeds (more likely to cover Visa)
     "https://feeds.bbci.co.uk/news/business/rss.xml",  # BBC Business
     "https://www.cnbc.com/id/10001147/device/rss/rss.html",  # CNBC Business
     "https://www.economist.com/business/rss.xml",  # The Economist Business
@@ -71,7 +71,7 @@ rss_feeds = [
     "https://www.marketwatch.com/rss/topstories",  # MarketWatch Top Stories
     "https://www.investing.com/rss/news.rss",  # Investing.com News
-    # General news (reliable sources that may cover Visa)
     "https://feeds.bbci.co.uk/news/rss.xml",  # BBC News
     "https://www.aljazeera.com/xml/rss/all.xml",  # Al Jazeera
     "https://www.theguardian.com/world/rss",  # The Guardian World
@@ -81,16 +81,16 @@ rss_feeds = [
     "https://feeds.washingtonpost.com/rss/business",  # Washington Post Business
 ]
-headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
 }
-# Locks for thread safety
 model_lock = threading.Lock()
 sentiment_lock = threading.Lock()
 keyword_lock = threading.Lock()
-def summarize_t5(text, max_length=100, min_length=30):
     with model_lock:
         inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
         summary_ids = model.generate(
@@ -103,17 +103,17 @@ def summarize_t5(text, max_length=100, min_length=30):
         )
         return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-def analyze_sentiment(text):
     with sentiment_lock:
         result = sentiment_analyzer(text[:512])[0]
         label = result["label"].lower()
         return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
-def extract_keywords(text):
     with keyword_lock:
         return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
-def process_article_content(article_data):
     try:
         title, link, content, company_name = article_data
         with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
@@ -134,7 +134,7 @@ def process_article_content(article_data):
         print(f"❌ Error processing article {title}: {e}")
         return None
-def fetch_article_content(article_info, company_name, article_limit_reached):
     title, link, description = article_info
     try:
         if article_limit_reached.is_set():
@@ -151,7 +151,7 @@ def fetch_article_content(article_info, company_name, article_limit_reached):
         print(f"❌ Failed to retrieve content for: {title} - {e}")
     return None
-def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
     try:
         if article_limit_reached.is_set():
             return
@@ -175,7 +175,7 @@ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_
     except requests.RequestException as e:
         print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
-def get_coverage_differences(articles, company_name):
     """Fetch coverage differences using Groq API."""
     articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
                                  for i, a in enumerate(articles)])
@@ -185,10 +185,10 @@ def get_coverage_differences(articles, company_name):
     2. Identify coverage differences between positive and negative articles.
     3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
-    Articles:
     {articles_summary}
-    Generate a JSON output in the following format:
     {{
       "Coverage Differences": [
         {{
@@ -213,7 +213,7 @@ def get_coverage_differences(articles, company_name):
             coverage_diff += chunk.choices[0].delta.content or ""
         text = coverage_diff.strip()
-        pattern = r'json\s*([\s\S]*?)\s*'
         match = re.search(pattern, text)
         if match:
@@ -225,11 +225,11 @@ def get_coverage_differences(articles, company_name):
             except json.JSONDecodeError as e:
                 return f"Error: Invalid JSON format - {str(e)}"
         else:
-            return "Error: No JSON content found between json and markers"
     except Exception as e:
         return f"Error in Groq API call: {str(e)}"
-def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
     keyword_clusters = defaultdict(list)
     for article in processed_articles:
         keywords = article["keywords"].split(", ")
@@ -269,7 +269,7 @@ def similarity_based_common_topics(processed_articles, similarity_threshold=0.8,
             final_common_topics.append(topic)
     return final_common_topics
-def comparative_analysis(processed_articles, company_name):
     sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
     all_keywords = []
     for idx, article in enumerate(processed_articles):
@@ -314,7 +314,7 @@ def comparative_analysis(processed_articles, company_name):
         "Final Sentiment Analysis": sentiment_statement
     }
-def fetch_and_save_news(company_name):
     if not company_name:
         print("❌ Error: Company name is required")
         return None
@@ -404,6 +404,6 @@ def fetch_and_save_news(company_name):
     print("✅ File saved successfully!")
     return file_name
-if __name__ == "__main__":
     company_name = input("Enter company name to search for (e.g., Tesla): ")
     fetch_and_save_news(company_name)

 import json
 import re
+ nltk.download('vader_lexicon')
+ # Initialize sentiment analyzer
 sid = SentimentIntensityAnalyzer()
+ # Load models once
 tokenizer = T5Tokenizer.from_pretrained("t5-small")
 model = T5ForConditionalGeneration.from_pretrained("t5-small")
 sentiment_analyzer = pipeline("sentiment-analysis")
 kw_model = KeyBERT()
+ # Load spaCy model
 try:
     nlp = spacy.load("en_core_web_md")
 except OSError:
     spacy.cli.download("en_core_web_md")
     nlp = spacy.load("en_core_web_md")
+ # Initialize Groq client
 client = Groq(api_key="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")
+ # RSS Feeds
 rss_feeds = [
     # Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
     "https://feeds.bbci.co.uk/news/technology/rss.xml",  # BBC Technology
     "https://www.pcworld.com/feed",  # PCWorld
     "https://venturebeat.com/feed/",  # VentureBeat
+     # Business and Finance feeds (more likely to cover Visa)
     "https://feeds.bbci.co.uk/news/business/rss.xml",  # BBC Business
     "https://www.cnbc.com/id/10001147/device/rss/rss.html",  # CNBC Business
     "https://www.economist.com/business/rss.xml",  # The Economist Business
     "https://www.marketwatch.com/rss/topstories",  # MarketWatch Top Stories
     "https://www.investing.com/rss/news.rss",  # Investing.com News
+     # General news (reliable sources that may cover Visa)
     "https://feeds.bbci.co.uk/news/rss.xml",  # BBC News
     "https://www.aljazeera.com/xml/rss/all.xml",  # Al Jazeera
     "https://www.theguardian.com/world/rss",  # The Guardian World
     "https://feeds.washingtonpost.com/rss/business",  # Washington Post Business
 ]
+ headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
 }
+ # Locks for thread safety
 model_lock = threading.Lock()
 sentiment_lock = threading.Lock()
 keyword_lock = threading.Lock()
+ def summarize_t5(text, max_length=100, min_length=30):
     with model_lock:
         inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
         summary_ids = model.generate(
         )
         return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+ def analyze_sentiment(text):
     with sentiment_lock:
         result = sentiment_analyzer(text[:512])[0]
         label = result["label"].lower()
         return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
+ def extract_keywords(text):
     with keyword_lock:
         return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
+ def process_article_content(article_data):
     try:
         title, link, content, company_name = article_data
         with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
         print(f"❌ Error processing article {title}: {e}")
         return None
+ def fetch_article_content(article_info, company_name, article_limit_reached):
     title, link, description = article_info
     try:
         if article_limit_reached.is_set():
         print(f"❌ Failed to retrieve content for: {title} - {e}")
     return None
+ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
     try:
         if article_limit_reached.is_set():
             return
     except requests.RequestException as e:
         print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
+ def get_coverage_differences(articles, company_name):
     """Fetch coverage differences using Groq API."""
     articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
                                  for i, a in enumerate(articles)])
     2. Identify coverage differences between positive and negative articles.
     3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
+     Articles:
     {articles_summary}
+     Generate a JSON output in the following format:
     {{
       "Coverage Differences": [
         {{
             coverage_diff += chunk.choices[0].delta.content or ""
         text = coverage_diff.strip()
+        pattern = r'```json\s*([\s\S]*?)\s*```'
         match = re.search(pattern, text)
         if match:
             except json.JSONDecodeError as e:
                 return f"Error: Invalid JSON format - {str(e)}"
         else:
+            return "Error: No JSON content found between ```json and ``` markers"
     except Exception as e:
         return f"Error in Groq API call: {str(e)}"
+ def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
     keyword_clusters = defaultdict(list)
     for article in processed_articles:
         keywords = article["keywords"].split(", ")
             final_common_topics.append(topic)
     return final_common_topics
+ def comparative_analysis(processed_articles, company_name):
     sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
     all_keywords = []
     for idx, article in enumerate(processed_articles):
         "Final Sentiment Analysis": sentiment_statement
     }
+ def fetch_and_save_news(company_name):
     if not company_name:
         print("❌ Error: Company name is required")
         return None
     print("✅ File saved successfully!")
     return file_name
+ if __name__ == "__main__":
     company_name = input("Enter company name to search for (e.g., Tesla): ")
     fetch_and_save_news(company_name)