Sasmita Harini committed
Commit: 3bae43a · Parent(s): 440ee04
Run FastAPI as subprocess in app.py

utils.py CHANGED
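The commit message refers to app.py, which is not part of this diff, so the exact change is not visible here. As a rough sketch of the approach the message names, assuming the FastAPI app is defined in a module called api and served with uvicorn (both hypothetical; app.py's real contents may differ):

# Hypothetical app.py excerpt; not taken from this commit.
import subprocess
import sys

# Launch the FastAPI server as a background subprocess so the Space's
# main process (e.g., a Gradio UI) keeps running alongside the API.
api_process = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
)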
@@ -1,5 +1,3 @@
-# utils.py
-
 import requests
 from bs4 import BeautifulSoup
 import time
@@ -64,10 +62,10 @@ rss_feeds = [
     "https://www.economist.com/business/rss.xml", # The Economist Business
     "https://www.ft.com/companies/financials/rss", # Financial Times Financials (Visa-relevant)
     "https://www.ft.com/rss/companies/technology", # Financial Times Tech Companies
-    "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml", # Wall Street Journal US Business
-    "https://www.forbes.com/money/feed/", # Forbes Money
-    "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml", # Reuters Business
-    "https://www.bloomberg.com/feed/podcasts/markets.xml", # Bloomberg Markets
+    "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml", # Wall Street Journal US Business
+    "https://www.forbes.com/money/feed/", # Forbes Money
+    "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml", # Reuters Business
+    "https://www.bloomberg.com/feed/podcasts/markets.xml", # Bloomberg Markets
     "https://finance.yahoo.com/news/rssindex", # Yahoo Finance News
     "https://www.nasdaq.com/feed/rssoutbound", # Nasdaq News
     "https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
@@ -79,11 +77,10 @@ rss_feeds = [
     "https://www.theguardian.com/world/rss", # The Guardian World
     "https://feeds.npr.org/1001/rss.xml", # NPR News
     "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
-    "https://apnews.com/hub/business?format=rss", # Associated Press Business
-    "https://feeds.washingtonpost.com/rss/business", # Washington Post Business
+    "https://apnews.com/hub/business?format=rss", # Associated Press Business
+    "https://feeds.washingtonpost.com/rss/business", # Washington Post Business
 ]
 
-
 headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
 }
@@ -142,16 +139,12 @@ def fetch_article_content(article_info, company_name, article_limit_reached):
     try:
         if article_limit_reached.is_set():
             return None
-
-        if (company_name.lower() in title.lower() or
-            (description and company_name.lower() in description.lower())):
+        if company_name.lower() in title.lower() or (description and company_name.lower() in description.lower()):
             article_response = requests.get(link, headers=headers, timeout=10)
             article_response.raise_for_status()
             article_soup = BeautifulSoup(article_response.content, "html.parser")
             content = "\n".join(p.text for p in article_soup.find_all("p"))
-
-            if (company_name.lower() in title.lower() or
-                company_name.lower() in content.lower()):
+            if company_name.lower() in title.lower() or company_name.lower() in content.lower():
                 print(f"✅ Found article: {title}")
                 return (title, link, content, company_name)
     except requests.RequestException as e:
@@ -162,17 +155,14 @@ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
     try:
         if article_limit_reached.is_set():
             return
-        print(f"Fetching RSS feed: {rss_url}")  # Debug log
         response = requests.get(rss_url, headers=headers, timeout=10)
         response.raise_for_status()
-        print(f"Successfully fetched RSS feed: {rss_url}")  # Debug log
         soup = BeautifulSoup(response.content, "xml")
         articles = soup.find_all("item")
         article_infos = [(article.title.text if article.title else "",
                           article.link.text if article.link else "",
                           article.description.text if article.description else "")
                          for article in articles if article.title and article.link]
-        print(f"Found {len(article_infos)} articles in {rss_url}")  # Debug log
         with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
             futures = [executor.submit(fetch_article_content, info, company_name, article_limit_reached)
                        for info in article_infos]
@@ -207,7 +197,6 @@ def get_coverage_differences(articles, company_name):
         }}
     ]
 }}
-
 """
     try:
         completion = client.chat.completions.create(
@@ -223,26 +212,25 @@ def get_coverage_differences(articles, company_name):
         for chunk in completion:
             coverage_diff += chunk.choices[0].delta.content or ""
 
-        text = coverage_diff.strip()
+        text = coverage_diff.strip()
         pattern = r'```json\s*([\s\S]*?)\s*```'
         match = re.search(pattern, text)
 
         if match:
-            json_str = match.group(1)
+            json_str = match.group(1)
             try:
-                # Parse the JSON to verify it's valid and return as dictionary
                 json_dict = json.loads(json_str)
                 json_dict = json.dumps(json_dict, indent=4)
                 return json_dict
             except json.JSONDecodeError as e:
                 return f"Error: Invalid JSON format - {str(e)}"
         else:
-            return "Error: No JSON content found between ```json and ``` markers"
+            return "Error: No JSON content found between ```json and ``` markers"
     except Exception as e:
         return f"Error in Groq API call: {str(e)}"
 
-
-
 def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
     keyword_clusters = defaultdict(list)
     for article in processed_articles:
@@ -311,6 +299,7 @@ def comparative_analysis(processed_articles, company_name):
             deduplicated_unique.add(topic)
         unique_topics[f"Unique Topics in Article {idx+1}"] = deduplicated_unique
     final_sentiment = max(sentiment_summary, key=sentiment_summary.get)
+
     # Add stock growth expectation based on sentiment
     if final_sentiment == "Positive":
         sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
@@ -331,76 +320,51 @@ def fetch_and_save_news(company_name):
     if not company_name:
         print("❌ Error: Company name is required")
         return None
-
+    file_name = f"{company_name}_news.txt"
     articles = []
+    article_count = 0
     article_limit = 10
+    print(f"🔍 Starting parallel fetching for company: {company_name}...")
     article_queue = queue.Queue()
     article_limit_reached = threading.Event()
-
-
-
-
-    fetch_futures = [fetch_executor.submit(
-        fetch_articles_from_rss,
-        url,
-        company_name,
-        article_queue,
-        article_limit_reached
-    ) for url in rss_feeds]
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as process_executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as fetch_executor:
+        fetch_futures = [fetch_executor.submit(fetch_articles_from_rss, url, company_name, article_queue, article_limit_reached)
+                         for url in rss_feeds]
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as process_executor:
             processing_futures = []
-
-            while len(articles) < article_limit:
+            while article_count < article_limit and (not article_queue.empty() or not all(f.done() for f in fetch_futures)):
                 try:
-                    article_data = article_queue.get(timeout=
+                    article_data = article_queue.get(timeout=0.1)
                     future = process_executor.submit(process_article_content, article_data)
                     processing_futures.append(future)
-
-                    if len(articles) >= article_limit:
-                        article_limit_reached.set()
-                        print("🔥 Immediate termination triggered")
-                        break
-
                 except queue.Empty:
-
-                    print("⚠️ All feeds processed before reaching article limit")
-                    break
-
-            article_limit_reached.set()
-            for f in futures:
-                f.cancel()
-
+                    continue
             for future in concurrent.futures.as_completed(processing_futures):
+                if article_count >= article_limit:
+                    article_limit_reached.set()
+                    break
                 result = future.result()
-                if result:
+                if result:
                     articles.append(result)
-
-
+                    article_count += 1
+                    print(f"📊 Processed {article_count}/{article_limit} articles")
+                    if article_count >= article_limit:
+                        article_limit_reached.set()
+                        print(f"✅ Reached article limit of {article_limit}. Stopping search.")
+                        break
     articles = articles[:article_limit]
     if not articles:
-        print(f"❌ No relevant articles found for {company_name}")
+        print(f"❌ No relevant articles found for company: {company_name}")
         return None
-
-    print(f"✅ Processing {len(articles)} articles")
+    print(f"✅ Saving {len(articles)} articles to {file_name}")
     analysis_result = comparative_analysis(articles, company_name)
     coverage_differences = get_coverage_differences(articles, company_name)
-
-    if isinstance(coverage_differences, str):
-        try:
-            coverage_differences = json.loads(coverage_differences)
-        except json.JSONDecodeError as e:
-            print(f"❌ Failed to parse Coverage Differences: {e}")
-            coverage_differences = {"Coverage Differences": []}
-
     sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
     for article in articles:
         sentiment_distribution[article["sentiment"]] += 1
-
     formatted_articles = [{"Title": article["title"], "Summary": article["summary"],
                            "Sentiment": article["sentiment"], "Topics": article["keywords"].split(", ")}
                           for article in articles]
-
     output_data = {
         "Company": company_name,
         "Articles": formatted_articles,
@@ -412,9 +376,35 @@ def fetch_and_save_news(company_name):
         },
         "Final Sentiment Analysis": analysis_result['Final Sentiment Analysis']
     }
-
-
-
+    with open(file_name, "w", encoding="utf-8") as file:
+        file.write(f'"Company": "{output_data["Company"]}",\n')
+        file.write('"Articles": [\n')
+        for i, article in enumerate(output_data["Articles"]):
+            file.write('{\n')
+            file.write(f'"Title": "{article["Title"]}",\n')
+            file.write(f'"Summary": "{article["Summary"]}",\n')
+            file.write(f'"Sentiment": "{article["Sentiment"]}",\n')
+            file.write(f'"Topics": {article["Topics"]}\n')
+            file.write('}' + (',\n' if i < len(output_data["Articles"]) - 1 else '\n'))
+        file.write('],\n')
+        file.write('"Comparative Sentiment Score": {\n')
+        file.write('"Sentiment Distribution": {\n')
+        for i, (sentiment, count) in enumerate(output_data["Comparative Sentiment Score"]["Sentiment Distribution"].items()):
+            file.write(f'"{sentiment}": {count}' + (',' if i < 2 else '') + '\n')
+        file.write('}\n')
+        file.write('},\n')
+        file.write(f'{output_data["Coverage Differences"]},\n')
+        file.write('"Topic Overlap": {\n')
+        file.write(f'"Common Topics": {output_data["Topic Overlap"]["Common Topics"]},\n')
+        for i, (key, value) in enumerate([(k, v) for k, v in output_data["Topic Overlap"].items() if k != "Common Topics"]):
+            file.write(f'"{key}": {value}' + (',\n' if i < len(output_data["Topic Overlap"]) - 2 else '\n'))
+        file.write('},\n')
+        file.write(f'"Final Sentiment Analysis": "{output_data["Final Sentiment Analysis"]}"\n')
+    print("\nOutput format:")
+    with open(file_name, "r", encoding="utf-8") as file:
+        print(file.read())
+    print("✅ File saved successfully!")
+    return file_name
 
 if __name__ == "__main__":
     company_name = input("Enter company name to search for (e.g., Tesla): ")
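For reference, the updated utils.py remains runnable on its own. Based on the __main__ block above and the new fetch_and_save_news, a standalone session would look roughly like this (console output abridged; intermediate lines depend on which feeds respond):

$ python utils.py
Enter company name to search for (e.g., Tesla): Tesla
🔍 Starting parallel fetching for company: Tesla...
📊 Processed 1/10 articles
...
✅ File saved successfully!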