Sasmitah committed on
Commit
c7c2ef5
·
verified ·
1 Parent(s): eb79ec9

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +14 -8
  2. app.py +80 -0
  3. requirements.txt +20 -0
  4. utils.py +409 -0
README.md CHANGED
@@ -1,14 +1,20 @@
1
  ---
2
- title: NewsSummarizationTTS1
3
- emoji: 🐨
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: streamlit
7
- sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: NewsSummarizationTTS
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: News Summarization App
3
+ emoji: πŸ“°
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.36.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # News Summarization App
13
+
14
+ This app fetches news articles about a company, summarizes them, analyzes sentiment, and provides a Hindi audio translation of the sentiment analysis.
15
+
16
+ ## Requirements
17
+ - See `requirements.txt` for dependencies.
18
+
19
+ ## Backend
20
+ - The FastAPI backend (`api.py`) runs alongside the Streamlit app to handle news fetching and processing.
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import utils  # News fetching / analysis helpers (utils.py)
import os
from gtts import gTTS
import tempfile
import re
from deep_translator import GoogleTranslator

st.title("News Summarization and Text-to-Speech Application")

# User input for company name (trimmed and lower-cased so the generated
# report file name is predictable).
company_name = st.text_input("Enter the company name:", "").strip().lower()

if st.button("Fetch News"):
    if not company_name:
        st.warning("Please enter a company name.")
    else:
        st.write(f"Fetching news for **{company_name}**...")

        # fetch_and_save_news returns the output file name, or None when no
        # relevant articles were found. Checking for None first fixes the
        # TypeError previously raised by os.path.exists(None).
        file_name = utils.fetch_and_save_news(company_name)

        if file_name and os.path.exists(file_name):
            st.success(f"Data saved in **{file_name}**")

            # Read the report once and reuse it for display, download and
            # sentiment extraction (it was previously opened three times).
            with open(file_name, "r", encoding="utf-8") as file:
                text_content = file.read()
            st.text_area("News Analysis", text_content, height=400)

            st.download_button(
                label="Download Text File",
                data=text_content.encode("utf-8"),
                file_name=file_name,
                mime="text/plain",
            )

            # Extract only the Final Sentiment Analysis line from the report.
            match = re.search(r'"Final Sentiment Analysis": "([^"]+)"', text_content)
            final_sentiment_line = match.group(1) if match else ""

            if final_sentiment_line:
                st.subheader("Hindi Audio for Final Sentiment Analysis")

                try:
                    # First translate the English text to Hindi using
                    # deep_translator.
                    translator = GoogleTranslator(source='en', target='hi')
                    hindi_text = translator.translate(final_sentiment_line)

                    # Create Hindi audio from the translated text. The temp
                    # file only exists long enough to read the bytes back;
                    # it is then removed (previously it leaked on disk).
                    tts = gTTS(text=hindi_text, lang='hi', slow=False)
                    temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
                    try:
                        temp_audio_file.close()
                        tts.save(temp_audio_file.name)
                        with open(temp_audio_file.name, "rb") as audio_file:
                            audio_data = audio_file.read()
                    finally:
                        os.unlink(temp_audio_file.name)

                    # Provide download button for the audio
                    st.download_button(
                        label="Download Hindi Audio",
                        data=audio_data,
                        file_name=f"{company_name}_sentiment_hindi.mp3",
                        mime="audio/mp3",
                    )
                except Exception as e:
                    st.error(f"Error generating Hindi audio: {str(e)}")
            else:
                st.warning("Could not find Final Sentiment Analysis in the text.")
        else:
            st.error("No relevant news articles found.")
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ requests==2.32.3
3
+ beautifulsoup4==4.13.3
4
+ transformers==4.49.0
5
+ torch==2.2.2
6
+ keybert==0.9.0
7
+ spacy==3.8.3
8
+ nltk==3.9.1
9
+ groq==0.18.0
10
+ httpx==0.23.0
11
+ sentencepiece==0.2.0
12
+ streamlit==1.43.2
13
+ fastapi==0.115.11
14
+ pydantic==2.10.6
15
+ uvicorn==0.34.0
16
+ deep-translator==1.11.4
17
+ gtts==2.5.4
18
+ scikit-learn==1.6.1
19
+ protobuf==3.20.3
20
+ sentence-transformers==3.4.1
utils.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import time
4
+ import concurrent.futures
5
+ import threading
6
+ from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
7
+ from keybert import KeyBERT
8
+ import queue
9
+ from collections import defaultdict
10
+ import spacy
11
+ import nltk
12
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
13
+ from groq import Groq
14
+ import json
15
+ import re
16
+
17
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer (no-op when
# already cached locally; runs at import time).
nltk.download('vader_lexicon')

# Initialize sentiment analyzer
# NOTE(review): `sid` is never used in this module — candidate for removal.
sid = SentimentIntensityAnalyzer()

# Load models once
# These are module-level singletons shared by worker threads; access is
# serialized through the locks defined further below.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
sentiment_analyzer = pipeline("sentiment-analysis")
kw_model = KeyBERT()

# Load spaCy model (needs word vectors, hence the "md" model).
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # Model not installed yet: download once, then load.
    print("Downloading 'en_core_web_md' model...")
    import spacy.cli
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")
36
+
37
+ # Initialize Groq client
38
+ client = Groq(api_key="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")
39
+
40
# RSS Feeds
# Feeds scanned in parallel for mentions of the requested company.
# NOTE(review): the author's comments reference "Visa" specifically — the
# list looks tuned for one company but is used generically.
rss_feeds = [
    # Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
    "https://feeds.bbci.co.uk/news/technology/rss.xml",  # BBC Technology
    "https://www.cnbc.com/id/19854910/device/rss/rss.html",  # CNBC Tech
    "https://www.theverge.com/rss/index.xml",  # The Verge
    "https://feeds.arstechnica.com/arstechnica/index",  # Ars Technica
    "https://www.engadget.com/rss.xml",  # Engadget
    "https://techcrunch.com/feed/",  # TechCrunch
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",  # NYT Technology
    "https://www.wired.com/feed/rss",  # Wired
    "https://www.zdnet.com/news/rss.xml",  # ZDNet News
    "https://www.cnet.com/rss/news/",  # CNET News
    "https://www.digitaltrends.com/feed/",  # Digital Trends
    "https://www.techmeme.com/feed.xml",  # Techmeme
    "https://www.technologyreview.com/feed/",  # MIT Technology Review
    "https://www.pcworld.com/feed",  # PCWorld
    "https://venturebeat.com/feed/",  # VentureBeat

    # Business and Finance feeds (more likely to cover Visa)
    "https://feeds.bbci.co.uk/news/business/rss.xml",  # BBC Business
    "https://www.cnbc.com/id/10001147/device/rss/rss.html",  # CNBC Business
    "https://www.economist.com/business/rss.xml",  # The Economist Business
    "https://www.ft.com/companies/financials/rss",  # Financial Times Financials (Visa-relevant)
    "https://www.ft.com/rss/companies/technology",  # Financial Times Tech Companies
    "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml",  # Wall Street Journal US Business
    "https://www.forbes.com/money/feed/",  # Forbes Money
    "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml",  # Reuters Business
    "https://www.bloomberg.com/feed/podcasts/markets.xml",  # Bloomberg Markets
    "https://finance.yahoo.com/news/rssindex",  # Yahoo Finance News
    "https://www.nasdaq.com/feed/rssoutbound",  # Nasdaq News
    "https://www.marketwatch.com/rss/topstories",  # MarketWatch Top Stories
    "https://www.investing.com/rss/news.rss",  # Investing.com News

    # General news (reliable sources that may cover Visa)
    "https://feeds.bbci.co.uk/news/rss.xml",  # BBC News
    "https://www.aljazeera.com/xml/rss/all.xml",  # Al Jazeera
    "https://www.theguardian.com/world/rss",  # The Guardian World
    "https://feeds.npr.org/1001/rss.xml",  # NPR News
    "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",  # NYT Home Page
    "https://apnews.com/hub/business?format=rss",  # Associated Press Business
    "https://feeds.washingtonpost.com/rss/business",  # Washington Post Business
]

# Browser-like User-Agent so feeds/article pages don't reject the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Locks for thread safety: the HF models/KeyBERT are shared singletons; one
# lock per model lets the three analyses still overlap across threads.
model_lock = threading.Lock()
sentiment_lock = threading.Lock()
keyword_lock = threading.Lock()
92
+
93
def summarize_t5(text, max_length=100, min_length=30):
    """Summarize *text* with the shared T5 model.

    Access to the tokenizer/model singletons is serialized via model_lock.
    Input is truncated to 512 tokens; output length is bounded by
    min_length/max_length.
    """
    with model_lock:
        encoded = tokenizer(
            "summarize: " + text, return_tensors="pt", max_length=512, truncation=True
        )
        generated = model.generate(
            encoded.input_ids,
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)
105
+
106
def analyze_sentiment(text):
    """Classify the first 512 chars of *text* as Positive/Negative/Neutral."""
    with sentiment_lock:
        label = sentiment_analyzer(text[:512])[0]["label"].lower()
    if label == "positive":
        return "Positive"
    if label == "negative":
        return "Negative"
    return "Neutral"
111
+
112
def extract_keywords(text):
    """Return the top-5 KeyBERT keywords of *text* as a comma-separated string."""
    with keyword_lock:
        scored = kw_model.extract_keywords(text, top_n=5)
    return ", ".join(word for word, _score in scored)
115
+
116
def process_article_content(article_data):
    """Summarize, sentiment-score and keyword-tag one fetched article.

    article_data is the (title, link, content, company_name) tuple produced
    by fetch_article_content. Returns a result dict with keys
    title/link/summary/sentiment/keywords, or None on any failure.
    """
    # BUG FIX: `title` must exist before the except handler runs; previously
    # a failure while unpacking raised a NameError inside `except`.
    title = link = None
    try:
        title, link, content, _company_name = article_data
        # Run the three independent analyses concurrently; each helper takes
        # its own model lock internally.
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            summary_future = executor.submit(summarize_t5, content)
            sentiment_future = executor.submit(analyze_sentiment, content)
            keywords_future = executor.submit(extract_keywords, content)
            summary_text = summary_future.result()
            sentiment = sentiment_future.result()
            keywords = keywords_future.result()
        return {
            "title": title,
            "link": link,
            "summary": summary_text,
            "sentiment": sentiment,
            "keywords": keywords
        }
    except Exception as e:
        print(f"❌ Error processing article {title}: {e}")
        return None
136
+
137
def fetch_article_content(article_info, company_name, article_limit_reached):
    """Download one article and return (title, link, content, company_name)
    when it mentions *company_name*; otherwise, or on any request error,
    return None. Skips work entirely once *article_limit_reached* is set.
    """
    title, link, description = article_info
    needle = company_name.lower()
    try:
        if article_limit_reached.is_set():
            return None
        # Cheap pre-filter on the RSS title/description before downloading.
        if needle in title.lower() or (description and needle in description.lower()):
            response = requests.get(link, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            content = "\n".join(p.text for p in soup.find_all("p"))
            # Confirm the mention against the full article body.
            if needle in title.lower() or needle in content.lower():
                print(f"βœ… Found article: {title}")
                return (title, link, content, company_name)
    except requests.RequestException as e:
        print(f"❌ Failed to retrieve content for: {title} - {e}")
    return None
153
+
154
def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
    """Fetch one RSS feed and push matching articles onto *article_queue*.

    Runs in a fetcher worker thread. Checks *article_limit_reached* before
    fetching and between completed downloads so work stops soon after the
    consumer (fetch_and_save_news) has collected enough articles.
    """
    try:
        if article_limit_reached.is_set():
            return
        response = requests.get(rss_url, headers=headers, timeout=10)
        response.raise_for_status()
        # NOTE(review): the "xml" parser needs lxml, which is not listed in
        # requirements.txt — confirm it is installed transitively.
        soup = BeautifulSoup(response.content, "xml")
        articles = soup.find_all("item")
        # Only <item>s that carry both a title and a link are considered.
        article_infos = [(article.title.text if article.title else "",
                          article.link.text if article.link else "",
                          article.description.text if article.description else "")
                         for article in articles if article.title and article.link]
        # Download/validate the individual articles concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(fetch_article_content, info, company_name, article_limit_reached)
                       for info in article_infos]
            for future in concurrent.futures.as_completed(futures):
                if article_limit_reached.is_set():
                    # Returning here still joins in-flight downloads, since
                    # leaving the `with` block shuts the executor down.
                    return
                result = future.result()
                if result:
                    article_queue.put(result)
    except requests.RequestException as e:
        print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
177
+
178
def get_coverage_differences(articles, company_name):
    """Fetch coverage differences using the Groq LLM API.

    Builds a one-line-per-article digest, asks the model for a comparative
    analysis, and extracts the JSON from its (streamed) answer. Returns a
    pretty-printed JSON string on success, or an "Error: ..." string —
    errors are reported in-band because callers embed the result directly
    in the report file.
    """
    articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
                                  for i, a in enumerate(articles)])
    prompt = f"""
Analyze the following ten articles about {company_name} and generate a comparative coverage analysis:
1. Compare articles based on their main topics.
2. Identify coverage differences between positive and negative articles.
3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.

Articles:
{articles_summary}

Generate a JSON output in the following format:
{{
    "Coverage Differences": [
        {{
            "Comparison": "Summary of key differences between two articles.",
            "Impact": "Explanation of how these differences affect {company_name}'s market perception."
        }}
    ]
}}
"""
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )
        # Accumulate the streamed chunks into one answer string.
        coverage_diff = ""
        for chunk in completion:
            coverage_diff += chunk.choices[0].delta.content or ""

        text = coverage_diff.strip()
        # BUG FIX: the previous pattern r'json\s*([\s\S]*?)\s*' always
        # captured an empty string (lazy group followed by \s*), so valid
        # answers were never parsed. Look for a fenced ```json block first,
        # then fall back to the outermost {...} span.
        fence = re.search(r"```json\s*([\s\S]*?)\s*```", text)
        if fence:
            json_str = fence.group(1)
        else:
            braces = re.search(r"\{[\s\S]*\}", text)
            json_str = braces.group(0) if braces else None

        if json_str:
            try:
                # Round-trip through json to validate and normalize layout.
                return json.dumps(json.loads(json_str), indent=4)
            except json.JSONDecodeError as e:
                return f"Error: Invalid JSON format - {str(e)}"
        return "Error: No JSON content found in model response"
    except Exception as e:
        return f"Error in Groq API call: {str(e)}"
231
+
232
def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
    """Cluster near-duplicate keywords across articles and return the topics
    shared by at least *min_articles* articles.

    Uses spaCy word-vector similarity (cut-off *similarity_threshold*) to
    merge close keywords into one cluster. Results depend on the iteration
    order of keywords, so they are deterministic but order-sensitive.
    """
    keyword_clusters = defaultdict(list)
    for article in processed_articles:
        keywords = article["keywords"].split(", ")
        for keyword in keywords:
            # Keywords without a word vector cannot be compared; skip them.
            if not nlp(keyword).has_vector:
                continue
            added = False
            # Attach to the first existing cluster that is similar enough.
            for cluster_key in list(keyword_clusters.keys()):
                if nlp(keyword).similarity(nlp(cluster_key)) >= similarity_threshold:
                    keyword_clusters[cluster_key].append(keyword)
                    added = True
                    break
            if not added:
                keyword_clusters[keyword].append(keyword)
    # Re-key each cluster by its shortest member, used as the display label.
    deduplicated_clusters = {min(keywords, key=len): keywords for cluster_key, keywords in keyword_clusters.items()}
    common_topics = []
    article_keyword_sets = [set(a["keywords"].split(", ")) for a in processed_articles]
    for representative, cluster in deduplicated_clusters.items():
        # Count articles mentioning any keyword from this cluster.
        articles_with_cluster = sum(1 for keyword_set in article_keyword_sets
                                    if any(kw in keyword_set for kw in cluster))
        if articles_with_cluster >= min_articles:
            common_topics.append(representative)
    # Second pass: merge representatives that are themselves near-duplicates,
    # keeping the shorter label.
    final_common_topics = []
    for topic in common_topics:
        if not nlp(topic).has_vector:
            final_common_topics.append(topic)
            continue
        is_similar = False
        for added_topic in list(final_common_topics):
            if nlp(topic).similarity(nlp(added_topic)) >= similarity_threshold:
                is_similar = True
                if len(topic) < len(added_topic):
                    final_common_topics.remove(added_topic)
                    final_common_topics.append(topic)
                break
        if not is_similar:
            final_common_topics.append(topic)
    return final_common_topics
271
+
272
def comparative_analysis(processed_articles, company_name):
    """Aggregate per-article results into topic overlap plus an overall
    sentiment statement for *company_name*.

    Returns {"Topic Overlap": {"Common Topics": [...],
                               "Unique Topics in Article N": set, ...},
             "Final Sentiment Analysis": str}.
    """
    sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
    all_keywords = []
    for idx, article in enumerate(processed_articles):
        sentiment_summary[article["sentiment"]] += 1
        keywords = set(article["keywords"].split(", "))
        all_keywords.append((idx, keywords))
    common_topics = similarity_based_common_topics(processed_articles)
    unique_topics = {}
    for idx, topics in all_keywords:
        # Topics of this article not shared across articles.
        unique = topics - set(common_topics)
        # Deduplicate near-synonyms within this article's unique topics,
        # preferring the shorter label (mirrors the clustering helper).
        deduplicated_unique = set()
        for topic in unique:
            if not nlp(topic).has_vector:
                deduplicated_unique.add(topic)
                continue
            is_similar = False
            for added_topic in list(deduplicated_unique):
                if nlp(topic).similarity(nlp(added_topic)) >= 0.8:
                    is_similar = True
                    if len(topic) < len(added_topic):
                        deduplicated_unique.remove(added_topic)
                        deduplicated_unique.add(topic)
                    break
            if not is_similar:
                deduplicated_unique.add(topic)
        unique_topics[f"Unique Topics in Article {idx+1}"] = deduplicated_unique
    # Majority vote over article sentiments; ties resolve to the first key
    # in insertion order (Positive, then Negative, then Neutral).
    final_sentiment = max(sentiment_summary, key=sentiment_summary.get)

    # Add stock growth expectation based on sentiment
    if final_sentiment == "Positive":
        sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This positive sentiment suggests potential stock growth as investor confidence may increase.")
    elif final_sentiment == "Negative":
        sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This negative sentiment suggests potential stock decline as investor confidence may weaken.")
    else:  # Neutral
        sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
                               f"This neutral sentiment suggests limited immediate impact on stock value, with potential for stability unless new developments shift perceptions.")

    return {
        "Topic Overlap": {"Common Topics": common_topics, **unique_topics},
        "Final Sentiment Analysis": sentiment_statement
    }
316
+
317
def fetch_and_save_news(company_name):
    """Run the full pipeline for *company_name*: fetch all RSS feeds in
    parallel, process up to 10 matching articles, run the comparative
    analysis, and write a JSON-like report to "<company>_news.txt".

    Returns the report file name on success, or None when the input is
    empty or no relevant articles were found — callers must handle None.
    """
    if not company_name:
        print("❌ Error: Company name is required")
        return None
    file_name = f"{company_name}_news.txt"
    articles = []
    article_count = 0
    article_limit = 10
    print(f"πŸš€ Starting parallel fetching for company: {company_name}...")
    # Producer/consumer: feed fetchers push raw articles here; this thread
    # drains the queue and hands them to the processing pool.
    article_queue = queue.Queue()
    # Set once enough articles are collected so workers stop early.
    article_limit_reached = threading.Event()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as fetch_executor:
        fetch_futures = [fetch_executor.submit(fetch_articles_from_rss, url, company_name, article_queue, article_limit_reached)
                         for url in rss_feeds]
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as process_executor:
            processing_futures = []
            # NOTE(review): article_count only increments in the loop below,
            # so this while loop actually runs until every fetcher finishes
            # and the queue is empty — the count check never fires here.
            while article_count < article_limit and (not article_queue.empty() or not all(f.done() for f in fetch_futures)):
                try:
                    article_data = article_queue.get(timeout=0.1)
                    future = process_executor.submit(process_article_content, article_data)
                    processing_futures.append(future)
                except queue.Empty:
                    continue
            # Collect processed results until the article limit is hit.
            for future in concurrent.futures.as_completed(processing_futures):
                if article_count >= article_limit:
                    article_limit_reached.set()
                    break
                result = future.result()
                if result:
                    articles.append(result)
                    article_count += 1
                    print(f"πŸ“Š Processed {article_count}/{article_limit} articles")
                    if article_count >= article_limit:
                        article_limit_reached.set()
                        print(f"βœ… Reached article limit of {article_limit}. Stopping search.")
                        break
    articles = articles[:article_limit]
    if not articles:
        print(f"❌ No relevant articles found for company: {company_name}")
        return None
    print(f"βœ… Saving {len(articles)} articles to {file_name}")
    analysis_result = comparative_analysis(articles, company_name)
    coverage_differences = get_coverage_differences(articles, company_name)
    # Recompute the sentiment distribution for the report header.
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles:
        sentiment_distribution[article["sentiment"]] += 1
    formatted_articles = [{"Title": article["title"], "Summary": article["summary"],
                           "Sentiment": article["sentiment"], "Topics": article["keywords"].split(", ")}
                          for article in articles]
    output_data = {
        "Company": company_name,
        "Articles": formatted_articles,
        "Comparative Sentiment Score": {"Sentiment Distribution": sentiment_distribution},
        "Coverage Differences": coverage_differences,
        "Topic Overlap": {
            "Common Topics": analysis_result['Topic Overlap']['Common Topics'],
            **{k: list(v) for k, v in analysis_result['Topic Overlap'].items() if k != "Common Topics"}
        },
        "Final Sentiment Analysis": analysis_result['Final Sentiment Analysis']
    }
    # Hand-rolled JSON-like serialization.
    # NOTE(review): the output is NOT valid JSON (unescaped quotes in
    # summaries, Python-repr lists for Topics). app.py only regex-scans it,
    # so this works today, but json.dump would be safer if consumers change.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(f'"Company": "{output_data["Company"]}",\n')
        file.write('"Articles": [\n')
        for i, article in enumerate(output_data["Articles"]):
            file.write('{\n')
            file.write(f'"Title": "{article["Title"]}",\n')
            file.write(f'"Summary": "{article["Summary"]}",\n')
            file.write(f'"Sentiment": "{article["Sentiment"]}",\n')
            file.write(f'"Topics": {article["Topics"]}\n')
            file.write('}' + (',\n' if i < len(output_data["Articles"]) - 1 else '\n'))
        file.write('],\n')
        file.write('"Comparative Sentiment Score": {\n')
        file.write('"Sentiment Distribution": {\n')
        for i, (sentiment, count) in enumerate(output_data["Comparative Sentiment Score"]["Sentiment Distribution"].items()):
            file.write(f'"{sentiment}": {count}' + (',' if i < 2 else '') + '\n')
        file.write('}\n')
        file.write('},\n')
        file.write(f'{output_data["Coverage Differences"]},\n')
        file.write('"Topic Overlap": {\n')
        file.write(f'"Common Topics": {output_data["Topic Overlap"]["Common Topics"]},\n')
        for i, (key, value) in enumerate([(k, v) for k, v in output_data["Topic Overlap"].items() if k != "Common Topics"]):
            file.write(f'"{key}": {value}' + (',\n' if i < len(output_data["Topic Overlap"]) - 2 else '\n'))
        file.write('},\n')
        file.write(f'"Final Sentiment Analysis": "{output_data["Final Sentiment Analysis"]}"\n')
    print("\nOutput format:")
    with open(file_name, "r", encoding="utf-8") as file:
        print(file.read())
    print("βœ… File saved successfully!")
    return file_name
406
+
407
if __name__ == "__main__":
    # CLI entry point for running the pipeline outside the Streamlit app.
    company_name = input("Enter company name to search for (e.g., Tesla): ")
    fetch_and_save_news(company_name)