Sasmita Harini committed on
Commit
3bae43a
·
1 Parent(s): 440ee04

Run FastAPI as subprocess in app.py

Browse files
Files changed (1) hide show
  1. utils.py +65 -75
utils.py CHANGED
@@ -1,5 +1,3 @@
1
- # utils.py
2
-
3
  import requests
4
  from bs4 import BeautifulSoup
5
  import time
@@ -64,10 +62,10 @@ rss_feeds = [
64
  "https://www.economist.com/business/rss.xml", # The Economist Business
65
  "https://www.ft.com/companies/financials/rss", # Financial Times Financials (Visa-relevant)
66
  "https://www.ft.com/rss/companies/technology", # Financial Times Tech Companies
67
- "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml", # Wall Street Journal US Business (updated URL)
68
- "https://www.forbes.com/money/feed/", # Forbes Money (updated URL)
69
- "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml", # Reuters Business (updated URL)
70
- "https://www.bloomberg.com/feed/podcasts/markets.xml", # Bloomberg Markets (updated URL)
71
  "https://finance.yahoo.com/news/rssindex", # Yahoo Finance News
72
  "https://www.nasdaq.com/feed/rssoutbound", # Nasdaq News
73
  "https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
@@ -79,11 +77,10 @@ rss_feeds = [
79
  "https://www.theguardian.com/world/rss", # The Guardian World
80
  "https://feeds.npr.org/1001/rss.xml", # NPR News
81
  "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
82
- "https://apnews.com/hub/business?format=rss", # Associated Press Business (updated URL)
83
- "https://feeds.washingtonpost.com/rss/business", # Washington Post Business (updated URL)
84
  ]
85
 
86
-
87
  headers = {
88
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
89
  }
@@ -142,16 +139,12 @@ def fetch_article_content(article_info, company_name, article_limit_reached):
142
  try:
143
  if article_limit_reached.is_set():
144
  return None
145
- # Relax the filtering condition
146
- if (company_name.lower() in title.lower() or
147
- (description and company_name.lower() in description.lower())):
148
  article_response = requests.get(link, headers=headers, timeout=10)
149
  article_response.raise_for_status()
150
  article_soup = BeautifulSoup(article_response.content, "html.parser")
151
  content = "\n".join(p.text for p in article_soup.find_all("p"))
152
- # Relax the content filtering condition
153
- if (company_name.lower() in title.lower() or
154
- company_name.lower() in content.lower()):
155
  print(f"βœ… Found article: {title}")
156
  return (title, link, content, company_name)
157
  except requests.RequestException as e:
@@ -162,17 +155,14 @@ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_
162
  try:
163
  if article_limit_reached.is_set():
164
  return
165
- print(f"Fetching RSS feed: {rss_url}") # Debug log
166
  response = requests.get(rss_url, headers=headers, timeout=10)
167
  response.raise_for_status()
168
- print(f"Successfully fetched RSS feed: {rss_url}") # Debug log
169
  soup = BeautifulSoup(response.content, "xml")
170
  articles = soup.find_all("item")
171
  article_infos = [(article.title.text if article.title else "",
172
  article.link.text if article.link else "",
173
  article.description.text if article.description else "")
174
  for article in articles if article.title and article.link]
175
- print(f"Found {len(article_infos)} articles in {rss_url}") # Debug log
176
  with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
177
  futures = [executor.submit(fetch_article_content, info, company_name, article_limit_reached)
178
  for info in article_infos]
@@ -207,7 +197,6 @@ def get_coverage_differences(articles, company_name):
207
  }}
208
  ]
209
  }}
210
-
211
  """
212
  try:
213
  completion = client.chat.completions.create(
@@ -223,26 +212,25 @@ def get_coverage_differences(articles, company_name):
223
  for chunk in completion:
224
  coverage_diff += chunk.choices[0].delta.content or ""
225
 
226
- text = coverage_diff.strip() # Fixed: removed space between 'text' and '='
227
  pattern = r'```json\s*([\s\S]*?)\s*```'
228
  match = re.search(pattern, text)
229
 
230
  if match:
231
- json_str = match.group(1) # Get the content between the markers
232
  try:
233
- # Parse the JSON to verify it's valid and return as dictionary
234
  json_dict = json.loads(json_str)
235
  json_dict = json.dumps(json_dict, indent=4)
236
  return json_dict
237
  except json.JSONDecodeError as e:
238
  return f"Error: Invalid JSON format - {str(e)}"
239
  else:
240
- return "Error: No JSON content found between ```json and ``` markers"
 
 
241
  except Exception as e:
242
  return f"Error in Groq API call: {str(e)}"
243
 
244
-
245
-
246
  def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
247
  keyword_clusters = defaultdict(list)
248
  for article in processed_articles:
@@ -311,6 +299,7 @@ def comparative_analysis(processed_articles, company_name):
311
  deduplicated_unique.add(topic)
312
  unique_topics[f"Unique Topics in Article {idx+1}"] = deduplicated_unique
313
  final_sentiment = max(sentiment_summary, key=sentiment_summary.get)
 
314
  # Add stock growth expectation based on sentiment
315
  if final_sentiment == "Positive":
316
  sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
@@ -331,76 +320,51 @@ def fetch_and_save_news(company_name):
331
  if not company_name:
332
  print("❌ Error: Company name is required")
333
  return None
334
-
335
  articles = []
 
336
  article_limit = 10
 
337
  article_queue = queue.Queue()
338
  article_limit_reached = threading.Event()
339
-
340
- print(f"πŸš€ Starting parallel fetching for {company_name}...")
341
-
342
- with concurrent.futures.ThreadPoolExecutor(max_workers=20) as fetch_executor:
343
- fetch_futures = [fetch_executor.submit(
344
- fetch_articles_from_rss,
345
- url,
346
- company_name,
347
- article_queue,
348
- article_limit_reached
349
- ) for url in rss_feeds]
350
-
351
- with concurrent.futures.ThreadPoolExecutor(max_workers=10) as process_executor:
352
  processing_futures = []
353
-
354
- while len(articles) < article_limit:
355
  try:
356
- article_data = article_queue.get(timeout=2)
357
  future = process_executor.submit(process_article_content, article_data)
358
  processing_futures.append(future)
359
-
360
- if len(articles) >= article_limit:
361
- article_limit_reached.set()
362
- print("πŸ”₯ Immediate termination triggered")
363
- break
364
-
365
  except queue.Empty:
366
- if all(f.done() for f in fetch_futures):
367
- print("⚠️ All feeds processed before reaching article limit")
368
- break
369
-
370
- article_limit_reached.set()
371
- for f in futures:
372
- f.cancel()
373
-
374
  for future in concurrent.futures.as_completed(processing_futures):
 
 
 
375
  result = future.result()
376
- if result and len(articles) < article_limit:
377
  articles.append(result)
378
- print(f"πŸ“Š Collected {len(articles)}/{article_limit} articles")
379
-
 
 
 
 
380
  articles = articles[:article_limit]
381
  if not articles:
382
- print(f"❌ No relevant articles found for {company_name}")
383
  return None
384
-
385
- print(f"βœ… Processing {len(articles)} articles")
386
  analysis_result = comparative_analysis(articles, company_name)
387
  coverage_differences = get_coverage_differences(articles, company_name)
388
-
389
- if isinstance(coverage_differences, str):
390
- try:
391
- coverage_differences = json.loads(coverage_differences)
392
- except json.JSONDecodeError as e:
393
- print(f"❌ Failed to parse Coverage Differences: {e}")
394
- coverage_differences = {"Coverage Differences": []}
395
-
396
  sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
397
  for article in articles:
398
  sentiment_distribution[article["sentiment"]] += 1
399
-
400
  formatted_articles = [{"Title": article["title"], "Summary": article["summary"],
401
  "Sentiment": article["sentiment"], "Topics": article["keywords"].split(", ")}
402
  for article in articles]
403
-
404
  output_data = {
405
  "Company": company_name,
406
  "Articles": formatted_articles,
@@ -412,9 +376,35 @@ def fetch_and_save_news(company_name):
412
  },
413
  "Final Sentiment Analysis": analysis_result['Final Sentiment Analysis']
414
  }
415
-
416
- # Return the data directly instead of saving to a file
417
- return output_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
  if __name__ == "__main__":
420
  company_name = input("Enter company name to search for (e.g., Tesla): ")
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import time
 
62
  "https://www.economist.com/business/rss.xml", # The Economist Business
63
  "https://www.ft.com/companies/financials/rss", # Financial Times Financials (Visa-relevant)
64
  "https://www.ft.com/rss/companies/technology", # Financial Times Tech Companies
65
+ "https://feeds.a.dj.com/rss/WSJcomUSBusiness.xml", # Wall Street Journal US Business
66
+ "https://www.forbes.com/money/feed/", # Forbes Money
67
+ "https://www.reuters.com/arc/outboundfeeds/business/?outputType=xml", # Reuters Business
68
+ "https://www.bloomberg.com/feed/podcasts/markets.xml", # Bloomberg Markets
69
  "https://finance.yahoo.com/news/rssindex", # Yahoo Finance News
70
  "https://www.nasdaq.com/feed/rssoutbound", # Nasdaq News
71
  "https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
 
77
  "https://www.theguardian.com/world/rss", # The Guardian World
78
  "https://feeds.npr.org/1001/rss.xml", # NPR News
79
  "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
80
+ "https://apnews.com/hub/business?format=rss", # Associated Press Business
81
+ "https://feeds.washingtonpost.com/rss/business", # Washington Post Business
82
  ]
83
 
 
84
  headers = {
85
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
86
  }
 
139
  try:
140
  if article_limit_reached.is_set():
141
  return None
142
+ if company_name.lower() in title.lower() or (description and company_name.lower() in description.lower()):
 
 
143
  article_response = requests.get(link, headers=headers, timeout=10)
144
  article_response.raise_for_status()
145
  article_soup = BeautifulSoup(article_response.content, "html.parser")
146
  content = "\n".join(p.text for p in article_soup.find_all("p"))
147
+ if company_name.lower() in title.lower() or company_name.lower() in content.lower():
 
 
148
  print(f"βœ… Found article: {title}")
149
  return (title, link, content, company_name)
150
  except requests.RequestException as e:
 
155
  try:
156
  if article_limit_reached.is_set():
157
  return
 
158
  response = requests.get(rss_url, headers=headers, timeout=10)
159
  response.raise_for_status()
 
160
  soup = BeautifulSoup(response.content, "xml")
161
  articles = soup.find_all("item")
162
  article_infos = [(article.title.text if article.title else "",
163
  article.link.text if article.link else "",
164
  article.description.text if article.description else "")
165
  for article in articles if article.title and article.link]
 
166
  with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
167
  futures = [executor.submit(fetch_article_content, info, company_name, article_limit_reached)
168
  for info in article_infos]
 
197
  }}
198
  ]
199
  }}
 
200
  """
201
  try:
202
  completion = client.chat.completions.create(
 
212
  for chunk in completion:
213
  coverage_diff += chunk.choices[0].delta.content or ""
214
 
215
+ text = coverage_diff.strip()
216
  pattern = r'```json\s*([\s\S]*?)\s*```'
217
  match = re.search(pattern, text)
218
 
219
  if match:
220
+ json_str = match.group(1)
221
  try:
 
222
  json_dict = json.loads(json_str)
223
  json_dict = json.dumps(json_dict, indent=4)
224
  return json_dict
225
  except json.JSONDecodeError as e:
226
  return f"Error: Invalid JSON format - {str(e)}"
227
  else:
228
+ return "Error: No JSON content found between ```json and ``` markers"
231
  except Exception as e:
232
  return f"Error in Groq API call: {str(e)}"
233
 
 
 
234
  def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
235
  keyword_clusters = defaultdict(list)
236
  for article in processed_articles:
 
299
  deduplicated_unique.add(topic)
300
  unique_topics[f"Unique Topics in Article {idx+1}"] = deduplicated_unique
301
  final_sentiment = max(sentiment_summary, key=sentiment_summary.get)
302
+
303
  # Add stock growth expectation based on sentiment
304
  if final_sentiment == "Positive":
305
  sentiment_statement = (f"{company_name}’s latest news coverage is mostly {final_sentiment.lower()}. "
 
320
  if not company_name:
321
  print("❌ Error: Company name is required")
322
  return None
323
+ file_name = f"{company_name}_news.txt"
324
  articles = []
325
+ article_count = 0
326
  article_limit = 10
327
+ print(f"πŸš€ Starting parallel fetching for company: {company_name}...")
328
  article_queue = queue.Queue()
329
  article_limit_reached = threading.Event()
330
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as fetch_executor:
331
+ fetch_futures = [fetch_executor.submit(fetch_articles_from_rss, url, company_name, article_queue, article_limit_reached)
332
+ for url in rss_feeds]
333
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as process_executor:
 
 
 
 
 
 
 
 
 
334
  processing_futures = []
335
+ while article_count < article_limit and (not article_queue.empty() or not all(f.done() for f in fetch_futures)):
 
336
  try:
337
+ article_data = article_queue.get(timeout=0.1)
338
  future = process_executor.submit(process_article_content, article_data)
339
  processing_futures.append(future)
 
 
 
 
 
 
340
  except queue.Empty:
341
+ continue
 
 
 
 
 
 
 
342
  for future in concurrent.futures.as_completed(processing_futures):
343
+ if article_count >= article_limit:
344
+ article_limit_reached.set()
345
+ break
346
  result = future.result()
347
+ if result:
348
  articles.append(result)
349
+ article_count += 1
350
+ print(f"πŸ“Š Processed {article_count}/{article_limit} articles")
351
+ if article_count >= article_limit:
352
+ article_limit_reached.set()
353
+ print(f"βœ… Reached article limit of {article_limit}. Stopping search.")
354
+ break
355
  articles = articles[:article_limit]
356
  if not articles:
357
+ print(f"❌ No relevant articles found for company: {company_name}")
358
  return None
359
+ print(f"βœ… Saving {len(articles)} articles to {file_name}")
 
360
  analysis_result = comparative_analysis(articles, company_name)
361
  coverage_differences = get_coverage_differences(articles, company_name)
 
 
 
 
 
 
 
 
362
  sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
363
  for article in articles:
364
  sentiment_distribution[article["sentiment"]] += 1
 
365
  formatted_articles = [{"Title": article["title"], "Summary": article["summary"],
366
  "Sentiment": article["sentiment"], "Topics": article["keywords"].split(", ")}
367
  for article in articles]
 
368
  output_data = {
369
  "Company": company_name,
370
  "Articles": formatted_articles,
 
376
  },
377
  "Final Sentiment Analysis": analysis_result['Final Sentiment Analysis']
378
  }
379
+ with open(file_name, "w", encoding="utf-8") as file:
380
+ file.write(f'"Company": "{output_data["Company"]}",\n')
381
+ file.write('"Articles": [\n')
382
+ for i, article in enumerate(output_data["Articles"]):
383
+ file.write('{\n')
384
+ file.write(f'"Title": "{article["Title"]}",\n')
385
+ file.write(f'"Summary": "{article["Summary"]}",\n')
386
+ file.write(f'"Sentiment": "{article["Sentiment"]}",\n')
387
+ file.write(f'"Topics": {article["Topics"]}\n')
388
+ file.write('}' + (',\n' if i < len(output_data["Articles"]) - 1 else '\n'))
389
+ file.write('],\n')
390
+ file.write('"Comparative Sentiment Score": {\n')
391
+ file.write('"Sentiment Distribution": {\n')
392
+ for i, (sentiment, count) in enumerate(output_data["Comparative Sentiment Score"]["Sentiment Distribution"].items()):
393
+ file.write(f'"{sentiment}": {count}' + (',' if i < 2 else '') + '\n')
394
+ file.write('}\n')
395
+ file.write('},\n')
396
+ file.write(f'{output_data["Coverage Differences"]},\n')
397
+ file.write('"Topic Overlap": {\n')
398
+ file.write(f'"Common Topics": {output_data["Topic Overlap"]["Common Topics"]},\n')
399
+ for i, (key, value) in enumerate([(k, v) for k, v in output_data["Topic Overlap"].items() if k != "Common Topics"]):
400
+ file.write(f'"{key}": {value}' + (',\n' if i < len(output_data["Topic Overlap"]) - 2 else '\n'))
401
+ file.write('},\n')
402
+ file.write(f'"Final Sentiment Analysis": "{output_data["Final Sentiment Analysis"]}"\n')
403
+ print("\nOutput format:")
404
+ with open(file_name, "r", encoding="utf-8") as file:
405
+ print(file.read())
406
+ print("βœ… File saved successfully!")
407
+ return file_name
408
 
409
  if __name__ == "__main__":
410
  company_name = input("Enter company name to search for (e.g., Tesla): ")