Sirapatrwan committed on
Commit
abf89f6
·
verified ·
1 Parent(s): 8b2b5cf

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +166 -0
utils.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from transformers import pipeline
4
+ import gtts
5
+ import io
6
+ import os
7
+ from tts import TextToSpeechConverter
8
+ from datetime import datetime
9
+ import xml.etree.ElementTree as ET
10
+ from fake_useragent import UserAgent
11
+ import locale
12
+
13
# Keyword lexicon for naive topic tagging: maps a topic label to the list of
# substrings counted in an article's text by extract_topics(). Matching is
# lowercase substring counting, so short keywords (e.g. "new", "IT") can
# over-match inside longer words — keep that in mind when adding entries.
news_topics = {
    "Technology": ["tech", "digital", "software", "hardware", "IT"],
    "AI": ["artificial intelligence", "machine learning", "deep learning", "neural network"],
    "Business": ["company", "corporate", "firm", "enterprise", "startup", "market"],
    "Finance": ["finance", "investment", "stock", "economy", "trading", "bank"],
    "Partnership": ["partner", "collaboration", "alliance", "merger", "acquisition"],
    "Social Media": ["social", "platform", "tweet", "facebook", "instagram", "linkedin", "post"],
    "Innovation": ["innovate", "new", "advance", "breakthrough", "disruption"],
    "Outage": ["outage", "downtime", "disrupt", "service interruption"],
    "Launch": ["launch", "release", "introduce", "unveil"],
    "Publicity": ["public", "campaign", "promo", "advertisement"],
    "Privacy": ["privacy", "data", "security", "breach"],
    "Entertainment": ["entertain", "media", "show", "movie", "series"],
    "Leadership": ["promotion", "leader", "executive", "ceo", "chairman", "manager"],
    "Mergers & Acquisitions": ["merger", "acquisition", "buyout", "takeover"]
}
29
+
30
def fetch_news(company, language=None, region=None):
    """Fetch up to 10 recent Google News RSS articles mentioning *company*.

    Parameters:
        company:  company name to search for (quoted exactly in the query).
        language: locale-style code such as 'en-US'; defaults to the system
                  locale, falling back to 'en-US'.
        region:   Google News region code; defaults to 'US'.

    Returns:
        A list of dicts with keys 'title', 'summary', 'link' and 'pub_date'.
        Returns [] when the HTTP request fails outright; falls back to
        parse_with_elementtree() when BeautifulSoup parsing yields nothing.
    """
    base_url = "https://news.google.com/rss/search"
    if language is None:
        # BUGFIX: locale.getdefaultlocale() can return (None, None); the
        # original chained .replace() on that and raised AttributeError.
        system_locale = locale.getdefaultlocale()[0]
        language = system_locale.replace('_', '-').lower() if system_locale else 'en-US'
    region = region or 'US'
    params = {
        "q": f'"{company}"',
        "hl": language,
        "gl": region,
        "ceid": f"{region}:{language.split('-')[0]}"
    }
    # A random User-Agent reduces the chance of Google throttling the request.
    headers = {"User-Agent": UserAgent().random, "Accept": "application/xml"}
    print(f"Fetching news for {company} with URL: {base_url}?{'&'.join(f'{k}={v}' for k, v in params.items())}")
    response = None  # BUGFIX: guarantee the name exists in the final except block
    try:
        response = requests.get(base_url, headers=headers, params=params, timeout=15)
        print(f"Response status for {company}: {response.status_code}")
        response.raise_for_status()
        soup = BeautifulSoup(response.content, features="xml")
        if not soup:
            print("Error: BeautifulSoup returned None. Falling back to ElementTree.")
            return parse_with_elementtree(response.content, company)
        items = soup.find_all("item")[:10]
        if not items:
            print(f"No news items found in the RSS feed for {company} with BeautifulSoup.")
            return parse_with_elementtree(response.content, company)
        print(f"Found {len(items)} items with BeautifulSoup.")
        articles = []
        for item in items:
            title = getattr(item.title, 'text', "No title") if item.title else "No title"
            desc = getattr(item.description, 'text', title) if item.description else title
            # Google News RSS places the URL in the text node after <link/>,
            # hence next_sibling rather than .text.
            link = item.link.next_sibling.strip() if item.link and item.link.next_sibling else "No link"
            raw_date = getattr(item.pubDate, 'text', "Date not available") if item.pubDate else "Date not available"
            try:
                pub_date = datetime.strptime(raw_date, "%a, %d %b %Y %H:%M:%S %Z").strftime("%a, %d %b %Y")
            except ValueError:
                pub_date = "Date not available"
            # Descriptions are HTML snippets; strip the markup to plain text.
            desc_soup = BeautifulSoup(desc, "html.parser")
            full_text = desc_soup.get_text(separator=" ").strip()
            summary = full_text.replace(title, "").strip()
            summary_words = summary.split()
            # Google News titles conventionally end with " - <publisher>".
            source = title.split(" - ")[-1].strip() if " - " in title else "Unknown Source"
            final_summary = " ".join(summary_words[:80]) + f" - {source}" if len(summary_words) > 10 else f"{title} - {source}"
            articles.append({
                "title": title,
                "summary": final_summary,
                "link": link,
                "pub_date": pub_date
            })
        print(f"Successfully fetched {len(articles)} articles for {company} with BeautifulSoup")
        return articles
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {company}: {str(e)}")
        return []
    except Exception as e:
        print(f"Error processing news for {company} with BeautifulSoup: {str(e)}. Falling back to ElementTree.")
        # BUGFIX: only fall back when a response was actually received; the
        # original raised NameError here if requests.get itself blew up with
        # a non-RequestException before `response` was bound.
        if response is None:
            return []
        return parse_with_elementtree(response.content, company)
85
+
86
def parse_with_elementtree(content, company):
    """Fallback RSS parser used when BeautifulSoup fails or finds no items.

    Parameters:
        content: raw RSS/XML bytes (or str) as returned by the HTTP request.
        company: company name, used only in log messages.

    Returns:
        A list of article dicts ('title', 'summary', 'link', 'pub_date'),
        or [] when parsing fails or the feed contains no items.
    """
    print("Attempting to parse with ElementTree...")

    def _text(item, tag, default):
        # One find() per tag (the original looked each tag up twice) and a
        # guard for nodes whose .text is None, which the original propagated
        # and later crashed on (e.g. `" - " in None`).
        node = item.find(tag)
        return node.text if node is not None and node.text is not None else default

    try:
        root = ET.fromstring(content)
        items = root.findall(".//item")[:10]
        if not items:
            print(f"No news items found in the RSS feed for {company} with ElementTree")
            return []
        articles = []
        for item in items:
            title = _text(item, "title", "No title")
            desc = _text(item, "description", title)
            link = _text(item, "link", "No link")
            raw_date = _text(item, "pubDate", "Date not available")
            try:
                pub_date = datetime.strptime(raw_date, "%a, %d %b %Y %H:%M:%S %Z").strftime("%a, %d %b %Y")
            except ValueError:
                pub_date = "Date not available"
            # Descriptions are HTML snippets; strip the markup to plain text.
            desc_soup = BeautifulSoup(desc, "html.parser")
            full_text = desc_soup.get_text(separator=" ").strip()
            summary = full_text if full_text else title
            summary_words = summary.split()
            # Google News titles conventionally end with " - <publisher>".
            source = title.split(" - ")[-1].strip() if " - " in title else "Unknown Source"
            final_summary = " ".join(summary_words[:80]) + f" - {source}" if len(summary_words) > 10 else f"{title} - {source}"
            articles.append({
                "title": title,
                "summary": final_summary,
                "link": link,
                "pub_date": pub_date
            })
        print(f"Successfully fetched {len(articles)} articles for {company} with ElementTree")
        return articles
    except Exception as e:
        print(f"Error processing news for {company} with ElementTree: {str(e)}")
        return []
121
+
122
# Module-level Hugging Face pipeline used by analyze_sentiment(); the
# DistilBERT SST-2 model is downloaded/loaded once at import time.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
123
+
124
def analyze_sentiment(text, neutral_threshold=0.7):
    """Classify *text* as "Positive", "Negative" or "Neutral".

    Only the first 512 characters are scored (model input limit). Results
    below *neutral_threshold* confidence, or texts that look like how-to
    guides or reviews, are forced to "Neutral". The threshold was previously
    hard-coded at 0.7; it is now a parameter with the same default, so
    existing callers are unaffected.

    Returns "Neutral" (after logging) if the pipeline raises.
    """
    try:
        result = sentiment_analyzer(text[:512])[0]
        score = result["score"]
        label = result["label"]
        # Heuristic: low-confidence scores and instructional/review content
        # are treated as sentiment-free.
        if score < neutral_threshold or "how to" in text.lower() or "review" in text.lower():
            return "Neutral"
        return "Positive" if label == "POSITIVE" else "Negative"
    except Exception as e:
        # Best-effort: never let a model failure break the caller.
        print(f"Sentiment analysis error: {e}")
        return "Neutral"
135
+
136
def extract_topics(text, max_topics=2, topics=None):
    """Return up to *max_topics* topic labels whose keywords occur in *text*.

    Parameters:
        text:       article text to scan (case-insensitive substring counts).
        max_topics: maximum number of labels to return (highest count first).
        topics:     optional mapping of label -> keyword list; defaults to
                    the module-level news_topics table (new parameter, fully
                    backward-compatible).

    Returns ["General News"] when no keyword matches at all. NOTE: matching
    is plain substring counting, so short keywords can over-match (e.g.
    "new" inside "news").
    """
    if topics is None:
        topics = news_topics
    text_lower = text.lower()
    topic_scores = {}
    for topic, keywords in topics.items():
        count = sum(text_lower.count(keyword.lower()) for keyword in keywords)
        if count > 0:
            topic_scores[topic] = count
    if not topic_scores:
        return ["General News"]
    ranked = sorted(topic_scores.items(), key=lambda kv: kv[1], reverse=True)
    return [topic for topic, _ in ranked[:max_topics]]
145
+
146
# Shared converter instance from the project-local tts module; used by
# generate_tts() for the Hindi path.
tts_converter = TextToSpeechConverter()
147
+
148
def generate_tts(text, language='hi'):
    """Synthesize *text* to speech, returning an in-memory audio buffer.

    Hindi ('hi') goes through the project TextToSpeechConverter; every other
    language code falls through to gTTS. Returns None on any failure.
    """
    try:
        if language != 'hi':
            # NOTE(review): gTTS is hard-coded to English for every non-Hindi
            # language — the `language` argument is ignored here; confirm
            # that is intentional.
            speech = gtts.gTTS(text=text, lang='en', slow=False)
            buffer = io.BytesIO()
            speech.write_to_fp(buffer)
            buffer.seek(0)
            return buffer
        result = tts_converter.generate_speech(text)
        if not result["success"]:
            print(f"Hindi audio error: {result['message']}")
            return None
        print(f"Hindi audio generated in memory")
        return result["audio_buffer"]
    except Exception as e:
        # Best-effort: log and signal failure with None instead of raising.
        print(f"Audio generation error for {language}: {str(e)}")
        return None