Sasmita Harini committed on
Commit
5b0e184
·
1 Parent(s): 1c7a511

Run FastAPI as subprocess in app.py

Browse files
Files changed (2) hide show
  1. app.py +6 -4
  2. utils.py +25 -25
app.py CHANGED
@@ -6,12 +6,12 @@ import tempfile
6
  import re
7
  from deep_translator import GoogleTranslator
8
 
9
- st.title("News Summarization and Text-to-Speech Application")
10
 
11
- # User input for company name
12
  company_name = st.text_input("Enter the company name:", "").strip().lower()
13
 
14
- if st.button("Fetch News"):
15
  if company_name:
16
  # Run news extraction and analysis
17
  st.write(f"Fetching news for **{company_name}**...")
@@ -61,6 +61,7 @@ if st.button("Fetch News"):
61
  tts.save(temp_audio_file.name)
62
 
63
 
 
64
  # Provide download button for the audio
65
  with open(temp_audio_file.name, "rb") as audio_file:
66
  audio_data = audio_file.read()
@@ -77,4 +78,5 @@ if st.button("Fetch News"):
77
  else:
78
  st.error("No relevant news articles found.")
79
  else:
80
- st.warning("Please enter a company name.")
 
 
6
  import re
7
  from deep_translator import GoogleTranslator
8
 
9
+ st.title("News Summarization and Text-to-Speech Application")
10
 
11
+ # User input for company name
12
  company_name = st.text_input("Enter the company name:", "").strip().lower()
13
 
14
+ if st.button("Fetch News"):
15
  if company_name:
16
  # Run news extraction and analysis
17
  st.write(f"Fetching news for **{company_name}**...")
 
61
  tts.save(temp_audio_file.name)
62
 
63
 
64
+
65
  # Provide download button for the audio
66
  with open(temp_audio_file.name, "rb") as audio_file:
67
  audio_data = audio_file.read()
 
78
  else:
79
  st.error("No relevant news articles found.")
80
  else:
81
+ st.warning("Please enter a company name.")
82
+ requirements.txt
utils.py CHANGED
@@ -14,18 +14,18 @@ from groq import Groq
14
  import json
15
  import re
16
 
17
- nltk.download('vader_lexicon')
18
 
19
- # Initialize sentiment analyzer
20
  sid = SentimentIntensityAnalyzer()
21
 
22
- # Load models once
23
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
24
  model = T5ForConditionalGeneration.from_pretrained("t5-small")
25
  sentiment_analyzer = pipeline("sentiment-analysis")
26
  kw_model = KeyBERT()
27
 
28
- # Load spaCy model
29
  try:
30
  nlp = spacy.load("en_core_web_md")
31
  except OSError:
@@ -34,10 +34,10 @@ except OSError:
34
  spacy.cli.download("en_core_web_md")
35
  nlp = spacy.load("en_core_web_md")
36
 
37
- # Initialize Groq client
38
  client = Groq(api_key="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")
39
 
40
- # RSS Feeds
41
  rss_feeds = [
42
  # Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
43
  "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC Technology
@@ -56,7 +56,7 @@ rss_feeds = [
56
  "https://www.pcworld.com/feed", # PCWorld
57
  "https://venturebeat.com/feed/", # VentureBeat
58
 
59
- # Business and Finance feeds (more likely to cover Visa)
60
  "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC Business
61
  "https://www.cnbc.com/id/10001147/device/rss/rss.html", # CNBC Business
62
  "https://www.economist.com/business/rss.xml", # The Economist Business
@@ -71,7 +71,7 @@ rss_feeds = [
71
  "https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
72
  "https://www.investing.com/rss/news.rss", # Investing.com News
73
 
74
- # General news (reliable sources that may cover Visa)
75
  "https://feeds.bbci.co.uk/news/rss.xml", # BBC News
76
  "https://www.aljazeera.com/xml/rss/all.xml", # Al Jazeera
77
  "https://www.theguardian.com/world/rss", # The Guardian World
@@ -81,16 +81,16 @@ rss_feeds = [
81
  "https://feeds.washingtonpost.com/rss/business", # Washington Post Business
82
  ]
83
 
84
- headers = {
85
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
86
  }
87
 
88
- # Locks for thread safety
89
  model_lock = threading.Lock()
90
  sentiment_lock = threading.Lock()
91
  keyword_lock = threading.Lock()
92
 
93
- def summarize_t5(text, max_length=100, min_length=30):
94
  with model_lock:
95
  inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
96
  summary_ids = model.generate(
@@ -103,17 +103,17 @@ def summarize_t5(text, max_length=100, min_length=30):
103
  )
104
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
105
 
106
- def analyze_sentiment(text):
107
  with sentiment_lock:
108
  result = sentiment_analyzer(text[:512])[0]
109
  label = result["label"].lower()
110
  return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
111
 
112
- def extract_keywords(text):
113
  with keyword_lock:
114
  return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
115
 
116
- def process_article_content(article_data):
117
  try:
118
  title, link, content, company_name = article_data
119
  with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
@@ -134,7 +134,7 @@ def process_article_content(article_data):
134
  print(f"❌ Error processing article {title}: {e}")
135
  return None
136
 
137
- def fetch_article_content(article_info, company_name, article_limit_reached):
138
  title, link, description = article_info
139
  try:
140
  if article_limit_reached.is_set():
@@ -151,7 +151,7 @@ def fetch_article_content(article_info, company_name, article_limit_reached):
151
  print(f"❌ Failed to retrieve content for: {title} - {e}")
152
  return None
153
 
154
- def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
155
  try:
156
  if article_limit_reached.is_set():
157
  return
@@ -175,7 +175,7 @@ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_
175
  except requests.RequestException as e:
176
  print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
177
 
178
- def get_coverage_differences(articles, company_name):
179
  """Fetch coverage differences using Groq API."""
180
  articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
181
  for i, a in enumerate(articles)])
@@ -185,10 +185,10 @@ def get_coverage_differences(articles, company_name):
185
  2. Identify coverage differences between positive and negative articles.
186
  3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
187
 
188
- Articles:
189
  {articles_summary}
190
 
191
- Generate a JSON output in the following format:
192
  {{
193
  "Coverage Differences": [
194
  {{
@@ -213,7 +213,7 @@ def get_coverage_differences(articles, company_name):
213
  coverage_diff += chunk.choices[0].delta.content or ""
214
 
215
  text = coverage_diff.strip()
216
- pattern = r'json\s*([\s\S]*?)\s*'
217
  match = re.search(pattern, text)
218
 
219
  if match:
@@ -225,11 +225,11 @@ def get_coverage_differences(articles, company_name):
225
  except json.JSONDecodeError as e:
226
  return f"Error: Invalid JSON format - {str(e)}"
227
  else:
228
- return "Error: No JSON content found between json and markers"
229
  except Exception as e:
230
  return f"Error in Groq API call: {str(e)}"
231
 
232
- def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
233
  keyword_clusters = defaultdict(list)
234
  for article in processed_articles:
235
  keywords = article["keywords"].split(", ")
@@ -269,7 +269,7 @@ def similarity_based_common_topics(processed_articles, similarity_threshold=0.8,
269
  final_common_topics.append(topic)
270
  return final_common_topics
271
 
272
- def comparative_analysis(processed_articles, company_name):
273
  sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
274
  all_keywords = []
275
  for idx, article in enumerate(processed_articles):
@@ -314,7 +314,7 @@ def comparative_analysis(processed_articles, company_name):
314
  "Final Sentiment Analysis": sentiment_statement
315
  }
316
 
317
- def fetch_and_save_news(company_name):
318
  if not company_name:
319
  print("❌ Error: Company name is required")
320
  return None
@@ -404,6 +404,6 @@ def fetch_and_save_news(company_name):
404
  print("✅ File saved successfully!")
405
  return file_name
406
 
407
- if __name__ == "__main__":
408
  company_name = input("Enter company name to search for (e.g., Tesla): ")
409
  fetch_and_save_news(company_name)
 
14
  import json
15
  import re
16
 
17
+ nltk.download('vader_lexicon')
18
 
19
+ # Initialize sentiment analyzer
20
  sid = SentimentIntensityAnalyzer()
21
 
22
+ # Load models once
23
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
24
  model = T5ForConditionalGeneration.from_pretrained("t5-small")
25
  sentiment_analyzer = pipeline("sentiment-analysis")
26
  kw_model = KeyBERT()
27
 
28
+ # Load spaCy model
29
  try:
30
  nlp = spacy.load("en_core_web_md")
31
  except OSError:
 
34
  spacy.cli.download("en_core_web_md")
35
  nlp = spacy.load("en_core_web_md")
36
 
37
+ # Initialize Groq client
38
  client = Groq(api_key="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")
39
 
40
+ # RSS Feeds
41
  rss_feeds = [
42
  # Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
43
  "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC Technology
 
56
  "https://www.pcworld.com/feed", # PCWorld
57
  "https://venturebeat.com/feed/", # VentureBeat
58
 
59
+ # Business and Finance feeds (more likely to cover Visa)
60
  "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC Business
61
  "https://www.cnbc.com/id/10001147/device/rss/rss.html", # CNBC Business
62
  "https://www.economist.com/business/rss.xml", # The Economist Business
 
71
  "https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
72
  "https://www.investing.com/rss/news.rss", # Investing.com News
73
 
74
+ # General news (reliable sources that may cover Visa)
75
  "https://feeds.bbci.co.uk/news/rss.xml", # BBC News
76
  "https://www.aljazeera.com/xml/rss/all.xml", # Al Jazeera
77
  "https://www.theguardian.com/world/rss", # The Guardian World
 
81
  "https://feeds.washingtonpost.com/rss/business", # Washington Post Business
82
  ]
83
 
84
+ headers = {
85
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
86
  }
87
 
88
+ # Locks for thread safety
89
  model_lock = threading.Lock()
90
  sentiment_lock = threading.Lock()
91
  keyword_lock = threading.Lock()
92
 
93
+ def summarize_t5(text, max_length=100, min_length=30):
94
  with model_lock:
95
  inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
96
  summary_ids = model.generate(
 
103
  )
104
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
105
 
106
+ def analyze_sentiment(text):
107
  with sentiment_lock:
108
  result = sentiment_analyzer(text[:512])[0]
109
  label = result["label"].lower()
110
  return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
111
 
112
+ def extract_keywords(text):
113
  with keyword_lock:
114
  return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
115
 
116
+ def process_article_content(article_data):
117
  try:
118
  title, link, content, company_name = article_data
119
  with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
 
134
  print(f"❌ Error processing article {title}: {e}")
135
  return None
136
 
137
+ def fetch_article_content(article_info, company_name, article_limit_reached):
138
  title, link, description = article_info
139
  try:
140
  if article_limit_reached.is_set():
 
151
  print(f"❌ Failed to retrieve content for: {title} - {e}")
152
  return None
153
 
154
+ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
155
  try:
156
  if article_limit_reached.is_set():
157
  return
 
175
  except requests.RequestException as e:
176
  print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
177
 
178
+ def get_coverage_differences(articles, company_name):
179
  """Fetch coverage differences using Groq API."""
180
  articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
181
  for i, a in enumerate(articles)])
 
185
  2. Identify coverage differences between positive and negative articles.
186
  3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
187
 
188
+ Articles:
189
  {articles_summary}
190
 
191
+ Generate a JSON output in the following format:
192
  {{
193
  "Coverage Differences": [
194
  {{
 
213
  coverage_diff += chunk.choices[0].delta.content or ""
214
 
215
  text = coverage_diff.strip()
216
+ pattern = r'```json\s*([\s\S]*?)\s*```'
217
  match = re.search(pattern, text)
218
 
219
  if match:
 
225
  except json.JSONDecodeError as e:
226
  return f"Error: Invalid JSON format - {str(e)}"
227
  else:
228
+ return "Error: No JSON content found between ```json and ``` markers"
229
  except Exception as e:
230
  return f"Error in Groq API call: {str(e)}"
231
 
232
+ def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
233
  keyword_clusters = defaultdict(list)
234
  for article in processed_articles:
235
  keywords = article["keywords"].split(", ")
 
269
  final_common_topics.append(topic)
270
  return final_common_topics
271
 
272
+ def comparative_analysis(processed_articles, company_name):
273
  sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
274
  all_keywords = []
275
  for idx, article in enumerate(processed_articles):
 
314
  "Final Sentiment Analysis": sentiment_statement
315
  }
316
 
317
+ def fetch_and_save_news(company_name):
318
  if not company_name:
319
  print("❌ Error: Company name is required")
320
  return None
 
404
  print("✅ File saved successfully!")
405
  return file_name
406
 
407
+ if __name__ == "__main__":
408
  company_name = input("Enter company name to search for (e.g., Tesla): ")
409
  fetch_and_save_news(company_name)