Spaces:
Sleeping
Sleeping
Sasmita Harini committed on
Commit ·
5b0e184
1
Parent(s): 1c7a511
Run FastAPI as subprocess in app.py
Browse files
app.py
CHANGED
|
@@ -6,12 +6,12 @@ import tempfile
|
|
| 6 |
import re
|
| 7 |
from deep_translator import GoogleTranslator
|
| 8 |
|
| 9 |
-
st.title("News Summarization and Text-to-Speech Application")
|
| 10 |
|
| 11 |
-
# User input for company name
|
| 12 |
company_name = st.text_input("Enter the company name:", "").strip().lower()
|
| 13 |
|
| 14 |
-
if st.button("Fetch News"):
|
| 15 |
if company_name:
|
| 16 |
# Run news extraction and analysis
|
| 17 |
st.write(f"Fetching news for **{company_name}**...")
|
|
@@ -61,6 +61,7 @@ if st.button("Fetch News"):
|
|
| 61 |
tts.save(temp_audio_file.name)
|
| 62 |
|
| 63 |
|
|
|
|
| 64 |
# Provide download button for the audio
|
| 65 |
with open(temp_audio_file.name, "rb") as audio_file:
|
| 66 |
audio_data = audio_file.read()
|
|
@@ -77,4 +78,5 @@ if st.button("Fetch News"):
|
|
| 77 |
else:
|
| 78 |
st.error("No relevant news articles found.")
|
| 79 |
else:
|
| 80 |
-
st.warning("Please enter a company name.")
|
|
|
|
|
|
| 6 |
import re
|
| 7 |
from deep_translator import GoogleTranslator
|
| 8 |
|
| 9 |
+
st.title("News Summarization and Text-to-Speech Application")
|
| 10 |
|
| 11 |
+
# User input for company name
|
| 12 |
company_name = st.text_input("Enter the company name:", "").strip().lower()
|
| 13 |
|
| 14 |
+
if st.button("Fetch News"):
|
| 15 |
if company_name:
|
| 16 |
# Run news extraction and analysis
|
| 17 |
st.write(f"Fetching news for **{company_name}**...")
|
|
|
|
| 61 |
tts.save(temp_audio_file.name)
|
| 62 |
|
| 63 |
|
| 64 |
+
|
| 65 |
# Provide download button for the audio
|
| 66 |
with open(temp_audio_file.name, "rb") as audio_file:
|
| 67 |
audio_data = audio_file.read()
|
|
|
|
| 78 |
else:
|
| 79 |
st.error("No relevant news articles found.")
|
| 80 |
else:
|
| 81 |
+
st.warning("Please enter a company name.")
|
| 82 |
+
requirements.txt
|
utils.py
CHANGED
|
@@ -14,18 +14,18 @@ from groq import Groq
|
|
| 14 |
import json
|
| 15 |
import re
|
| 16 |
|
| 17 |
-
nltk.download('vader_lexicon')
|
| 18 |
|
| 19 |
-
# Initialize sentiment analyzer
|
| 20 |
sid = SentimentIntensityAnalyzer()
|
| 21 |
|
| 22 |
-
# Load models once
|
| 23 |
tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
| 24 |
model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
| 25 |
sentiment_analyzer = pipeline("sentiment-analysis")
|
| 26 |
kw_model = KeyBERT()
|
| 27 |
|
| 28 |
-
# Load spaCy model
|
| 29 |
try:
|
| 30 |
nlp = spacy.load("en_core_web_md")
|
| 31 |
except OSError:
|
|
@@ -34,10 +34,10 @@ except OSError:
|
|
| 34 |
spacy.cli.download("en_core_web_md")
|
| 35 |
nlp = spacy.load("en_core_web_md")
|
| 36 |
|
| 37 |
-
# Initialize Groq client
|
| 38 |
client = Groq(api_key="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")
|
| 39 |
|
| 40 |
-
# RSS Feeds
|
| 41 |
rss_feeds = [
|
| 42 |
# Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
|
| 43 |
"https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC Technology
|
|
@@ -56,7 +56,7 @@ rss_feeds = [
|
|
| 56 |
"https://www.pcworld.com/feed", # PCWorld
|
| 57 |
"https://venturebeat.com/feed/", # VentureBeat
|
| 58 |
|
| 59 |
-
|
| 60 |
"https://feeds.bbci.co.uk/news/business/rss.xml", # BBC Business
|
| 61 |
"https://www.cnbc.com/id/10001147/device/rss/rss.html", # CNBC Business
|
| 62 |
"https://www.economist.com/business/rss.xml", # The Economist Business
|
|
@@ -71,7 +71,7 @@ rss_feeds = [
|
|
| 71 |
"https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
|
| 72 |
"https://www.investing.com/rss/news.rss", # Investing.com News
|
| 73 |
|
| 74 |
-
|
| 75 |
"https://feeds.bbci.co.uk/news/rss.xml", # BBC News
|
| 76 |
"https://www.aljazeera.com/xml/rss/all.xml", # Al Jazeera
|
| 77 |
"https://www.theguardian.com/world/rss", # The Guardian World
|
|
@@ -81,16 +81,16 @@ rss_feeds = [
|
|
| 81 |
"https://feeds.washingtonpost.com/rss/business", # Washington Post Business
|
| 82 |
]
|
| 83 |
|
| 84 |
-
headers = {
|
| 85 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
|
| 86 |
}
|
| 87 |
|
| 88 |
-
# Locks for thread safety
|
| 89 |
model_lock = threading.Lock()
|
| 90 |
sentiment_lock = threading.Lock()
|
| 91 |
keyword_lock = threading.Lock()
|
| 92 |
|
| 93 |
-
def summarize_t5(text, max_length=100, min_length=30):
|
| 94 |
with model_lock:
|
| 95 |
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
|
| 96 |
summary_ids = model.generate(
|
|
@@ -103,17 +103,17 @@ def summarize_t5(text, max_length=100, min_length=30):
|
|
| 103 |
)
|
| 104 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 105 |
|
| 106 |
-
def analyze_sentiment(text):
|
| 107 |
with sentiment_lock:
|
| 108 |
result = sentiment_analyzer(text[:512])[0]
|
| 109 |
label = result["label"].lower()
|
| 110 |
return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
|
| 111 |
|
| 112 |
-
def extract_keywords(text):
|
| 113 |
with keyword_lock:
|
| 114 |
return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
|
| 115 |
|
| 116 |
-
def process_article_content(article_data):
|
| 117 |
try:
|
| 118 |
title, link, content, company_name = article_data
|
| 119 |
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
|
|
@@ -134,7 +134,7 @@ def process_article_content(article_data):
|
|
| 134 |
print(f"❌ Error processing article {title}: {e}")
|
| 135 |
return None
|
| 136 |
|
| 137 |
-
def fetch_article_content(article_info, company_name, article_limit_reached):
|
| 138 |
title, link, description = article_info
|
| 139 |
try:
|
| 140 |
if article_limit_reached.is_set():
|
|
@@ -151,7 +151,7 @@ def fetch_article_content(article_info, company_name, article_limit_reached):
|
|
| 151 |
print(f"❌ Failed to retrieve content for: {title} - {e}")
|
| 152 |
return None
|
| 153 |
|
| 154 |
-
def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
|
| 155 |
try:
|
| 156 |
if article_limit_reached.is_set():
|
| 157 |
return
|
|
@@ -175,7 +175,7 @@ def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_
|
|
| 175 |
except requests.RequestException as e:
|
| 176 |
print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
|
| 177 |
|
| 178 |
-
def get_coverage_differences(articles, company_name):
|
| 179 |
"""Fetch coverage differences using Groq API."""
|
| 180 |
articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
|
| 181 |
for i, a in enumerate(articles)])
|
|
@@ -185,10 +185,10 @@ def get_coverage_differences(articles, company_name):
|
|
| 185 |
2. Identify coverage differences between positive and negative articles.
|
| 186 |
3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
|
| 187 |
|
| 188 |
-
|
| 189 |
{articles_summary}
|
| 190 |
|
| 191 |
-
|
| 192 |
{{
|
| 193 |
"Coverage Differences": [
|
| 194 |
{{
|
|
@@ -213,7 +213,7 @@ def get_coverage_differences(articles, company_name):
|
|
| 213 |
coverage_diff += chunk.choices[0].delta.content or ""
|
| 214 |
|
| 215 |
text = coverage_diff.strip()
|
| 216 |
-
pattern = r'json\s*([\s\S]*?)\s*'
|
| 217 |
match = re.search(pattern, text)
|
| 218 |
|
| 219 |
if match:
|
|
@@ -225,11 +225,11 @@ def get_coverage_differences(articles, company_name):
|
|
| 225 |
except json.JSONDecodeError as e:
|
| 226 |
return f"Error: Invalid JSON format - {str(e)}"
|
| 227 |
else:
|
| 228 |
-
return "Error: No JSON content found between json and markers"
|
| 229 |
except Exception as e:
|
| 230 |
return f"Error in Groq API call: {str(e)}"
|
| 231 |
|
| 232 |
-
def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
|
| 233 |
keyword_clusters = defaultdict(list)
|
| 234 |
for article in processed_articles:
|
| 235 |
keywords = article["keywords"].split(", ")
|
|
@@ -269,7 +269,7 @@ def similarity_based_common_topics(processed_articles, similarity_threshold=0.8,
|
|
| 269 |
final_common_topics.append(topic)
|
| 270 |
return final_common_topics
|
| 271 |
|
| 272 |
-
def comparative_analysis(processed_articles, company_name):
|
| 273 |
sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
|
| 274 |
all_keywords = []
|
| 275 |
for idx, article in enumerate(processed_articles):
|
|
@@ -314,7 +314,7 @@ def comparative_analysis(processed_articles, company_name):
|
|
| 314 |
"Final Sentiment Analysis": sentiment_statement
|
| 315 |
}
|
| 316 |
|
| 317 |
-
def fetch_and_save_news(company_name):
|
| 318 |
if not company_name:
|
| 319 |
print("❌ Error: Company name is required")
|
| 320 |
return None
|
|
@@ -404,6 +404,6 @@ def fetch_and_save_news(company_name):
|
|
| 404 |
print("✅ File saved successfully!")
|
| 405 |
return file_name
|
| 406 |
|
| 407 |
-
if __name__ == "__main__":
|
| 408 |
company_name = input("Enter company name to search for (e.g., Tesla): ")
|
| 409 |
fetch_and_save_news(company_name)
|
|
|
|
| 14 |
import json
|
| 15 |
import re
|
| 16 |
|
| 17 |
+
nltk.download('vader_lexicon')
|
| 18 |
|
| 19 |
+
# Initialize sentiment analyzer
|
| 20 |
sid = SentimentIntensityAnalyzer()
|
| 21 |
|
| 22 |
+
# Load models once
|
| 23 |
tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
| 24 |
model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
| 25 |
sentiment_analyzer = pipeline("sentiment-analysis")
|
| 26 |
kw_model = KeyBERT()
|
| 27 |
|
| 28 |
+
# Load spaCy model
|
| 29 |
try:
|
| 30 |
nlp = spacy.load("en_core_web_md")
|
| 31 |
except OSError:
|
|
|
|
| 34 |
spacy.cli.download("en_core_web_md")
|
| 35 |
nlp = spacy.load("en_core_web_md")
|
| 36 |
|
| 37 |
+
# Initialize Groq client
|
| 38 |
client = Groq(api_key="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")
|
| 39 |
|
| 40 |
+
# RSS Feeds
|
| 41 |
rss_feeds = [
|
| 42 |
# Technology-focused feeds (general tech news, some may cover Visa tech initiatives)
|
| 43 |
"https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC Technology
|
|
|
|
| 56 |
"https://www.pcworld.com/feed", # PCWorld
|
| 57 |
"https://venturebeat.com/feed/", # VentureBeat
|
| 58 |
|
| 59 |
+
# Business and Finance feeds (more likely to cover Visa)
|
| 60 |
"https://feeds.bbci.co.uk/news/business/rss.xml", # BBC Business
|
| 61 |
"https://www.cnbc.com/id/10001147/device/rss/rss.html", # CNBC Business
|
| 62 |
"https://www.economist.com/business/rss.xml", # The Economist Business
|
|
|
|
| 71 |
"https://www.marketwatch.com/rss/topstories", # MarketWatch Top Stories
|
| 72 |
"https://www.investing.com/rss/news.rss", # Investing.com News
|
| 73 |
|
| 74 |
+
# General news (reliable sources that may cover Visa)
|
| 75 |
"https://feeds.bbci.co.uk/news/rss.xml", # BBC News
|
| 76 |
"https://www.aljazeera.com/xml/rss/all.xml", # Al Jazeera
|
| 77 |
"https://www.theguardian.com/world/rss", # The Guardian World
|
|
|
|
| 81 |
"https://feeds.washingtonpost.com/rss/business", # Washington Post Business
|
| 82 |
]
|
| 83 |
|
| 84 |
+
headers = {
|
| 85 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
|
| 86 |
}
|
| 87 |
|
| 88 |
+
# Locks for thread safety
|
| 89 |
model_lock = threading.Lock()
|
| 90 |
sentiment_lock = threading.Lock()
|
| 91 |
keyword_lock = threading.Lock()
|
| 92 |
|
| 93 |
+
def summarize_t5(text, max_length=100, min_length=30):
|
| 94 |
with model_lock:
|
| 95 |
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
|
| 96 |
summary_ids = model.generate(
|
|
|
|
| 103 |
)
|
| 104 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 105 |
|
| 106 |
+
def analyze_sentiment(text):
|
| 107 |
with sentiment_lock:
|
| 108 |
result = sentiment_analyzer(text[:512])[0]
|
| 109 |
label = result["label"].lower()
|
| 110 |
return "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
|
| 111 |
|
| 112 |
+
def extract_keywords(text):
|
| 113 |
with keyword_lock:
|
| 114 |
return ", ".join([kw[0] for kw in kw_model.extract_keywords(text, top_n=5)])
|
| 115 |
|
| 116 |
+
def process_article_content(article_data):
|
| 117 |
try:
|
| 118 |
title, link, content, company_name = article_data
|
| 119 |
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
|
|
|
|
| 134 |
print(f"❌ Error processing article {title}: {e}")
|
| 135 |
return None
|
| 136 |
|
| 137 |
+
def fetch_article_content(article_info, company_name, article_limit_reached):
|
| 138 |
title, link, description = article_info
|
| 139 |
try:
|
| 140 |
if article_limit_reached.is_set():
|
|
|
|
| 151 |
print(f"❌ Failed to retrieve content for: {title} - {e}")
|
| 152 |
return None
|
| 153 |
|
| 154 |
+
def fetch_articles_from_rss(rss_url, company_name, article_queue, article_limit_reached):
|
| 155 |
try:
|
| 156 |
if article_limit_reached.is_set():
|
| 157 |
return
|
|
|
|
| 175 |
except requests.RequestException as e:
|
| 176 |
print(f"❌ Failed to fetch RSS feed: {rss_url} - {e}")
|
| 177 |
|
| 178 |
+
def get_coverage_differences(articles, company_name):
|
| 179 |
"""Fetch coverage differences using Groq API."""
|
| 180 |
articles_summary = "\n".join([f"Article {i+1}: Title: {a['title']}, Summary: {a['summary']}, Sentiment: {a['sentiment']}, Keywords: {a['keywords']}"
|
| 181 |
for i, a in enumerate(articles)])
|
|
|
|
| 185 |
2. Identify coverage differences between positive and negative articles.
|
| 186 |
3. Provide insights into how these differences impact {company_name}'s market, mentioning article numbers clearly.
|
| 187 |
|
| 188 |
+
Articles:
|
| 189 |
{articles_summary}
|
| 190 |
|
| 191 |
+
Generate a JSON output in the following format:
|
| 192 |
{{
|
| 193 |
"Coverage Differences": [
|
| 194 |
{{
|
|
|
|
| 213 |
coverage_diff += chunk.choices[0].delta.content or ""
|
| 214 |
|
| 215 |
text = coverage_diff.strip()
|
| 216 |
+
pattern = r'```json\s*([\s\S]*?)\s*```'
|
| 217 |
match = re.search(pattern, text)
|
| 218 |
|
| 219 |
if match:
|
|
|
|
| 225 |
except json.JSONDecodeError as e:
|
| 226 |
return f"Error: Invalid JSON format - {str(e)}"
|
| 227 |
else:
|
| 228 |
+
return "Error: No JSON content found between ```json and ``` markers"
|
| 229 |
except Exception as e:
|
| 230 |
return f"Error in Groq API call: {str(e)}"
|
| 231 |
|
| 232 |
+
def similarity_based_common_topics(processed_articles, similarity_threshold=0.8, min_articles=2):
|
| 233 |
keyword_clusters = defaultdict(list)
|
| 234 |
for article in processed_articles:
|
| 235 |
keywords = article["keywords"].split(", ")
|
|
|
|
| 269 |
final_common_topics.append(topic)
|
| 270 |
return final_common_topics
|
| 271 |
|
| 272 |
+
def comparative_analysis(processed_articles, company_name):
|
| 273 |
sentiment_summary = {"Positive": 0, "Negative": 0, "Neutral": 0}
|
| 274 |
all_keywords = []
|
| 275 |
for idx, article in enumerate(processed_articles):
|
|
|
|
| 314 |
"Final Sentiment Analysis": sentiment_statement
|
| 315 |
}
|
| 316 |
|
| 317 |
+
def fetch_and_save_news(company_name):
|
| 318 |
if not company_name:
|
| 319 |
print("❌ Error: Company name is required")
|
| 320 |
return None
|
|
|
|
| 404 |
print("✅ File saved successfully!")
|
| 405 |
return file_name
|
| 406 |
|
| 407 |
+
if __name__ == "__main__":
|
| 408 |
company_name = input("Enter company name to search for (e.g., Tesla): ")
|
| 409 |
fetch_and_save_news(company_name)
|