jsemrau committed on
Commit
9e5dff8
·
1 Parent(s): 3e15bc5

added connector

Browse files
Files changed (1) hide show
  1. connector.py +445 -0
connector.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import time
4
+ from datetime import date, timedelta, timezone, datetime
5
+ import os
6
+ import pandas as pd
7
+ import numpy as np
8
+ import logging
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ import urllib.parse
12
+ import dateutil.parser
13
+ from dateutil import parser as dateutil_parser
14
+ from tldextract import extract
15
+ from urllib.parse import quote_plus
16
+
17
+ from collections import defaultdict
18
+ from dotenv import load_dotenv
19
+
20
+ from GoogleNews import GoogleNews
21
+ import feedparser
22
+
23
+
24
+ # Your existing functions (unchanged)
25
def get_google_news(query="AI Agents", cutoff=1):
    """Fetch recent Google News results for ``query`` via the GoogleNews package.

    A fixed exclusion list (social networks, a few named sites and some
    off-topic keywords) is appended to the query before searching.

    Args:
        query: Search term(s).
        cutoff: Look-back window in days.

    Returns:
        A list of dicts, one per sufficiently recent article. Each dict holds
        the columns returned by ``GoogleNews.result()`` with ``datetime``
        converted to ``str`` and an extra ``domain`` key. The list is empty
        when the search fails or nothing falls inside the window.
    """
    language = 'en'
    now = datetime.now()
    time_cutoff = now - timedelta(days=cutoff)
    to_day = now.strftime('%m/%d/%Y')
    from_day = time_cutoff.strftime('%m/%d/%Y')
    blackList = (
        ' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com'
        ' -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com'
        ' -site:youtube.com -site:newser.com -site:adexchanger.com'
        ' -india -crypto -blockchain -bitcoin -DeFi'
    )
    tQuery = query + blackList
    str_div = []

    print(f"Assembling news with cutoff {cutoff} for query: {str(tQuery)} ")
    try:
        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
        googlenews.search(tQuery)
        df = pd.DataFrame(googlenews.result())

        for index, row in df.iterrows():
            # Per-row best effort: one unparsable row must not lose the rest.
            try:
                news_time = dateutil.parser.parse(str(row['datetime']))

                # The cutoff is naive local time. An aware timestamp would
                # raise TypeError on comparison, so normalise it first.
                comparable = news_time
                if comparable.tzinfo is not None:
                    comparable = comparable.astimezone().replace(tzinfo=None)

                if comparable < time_cutoff:
                    print(f" Skipping {news_time} < {time_cutoff}")
                    continue

                article = row.to_dict()
                article['datetime'] = str(news_time)
                article['domain'] = extract(row['link']).domain
                str_div.append(article)
            except Exception as inner_e:
                print(f"Error parsing datetime for row {index}: {inner_e}")
                continue

    except Exception as e:
        # Library/network boundary: report and return whatever was collected.
        print("Error aggregating news " + str(e))

    return str_div
67
+
68
def resolve_redirect(url, timeout=5):
    """Follow HTTP redirects for ``url`` and return the final URL.

    This is a best-effort helper: on any failure the input URL is returned
    unchanged so callers always get something usable.

    Args:
        url: The URL to resolve. Empty or ``None`` values are returned as-is
            without issuing a request.
        timeout: Seconds to wait for the HEAD request (default 5).

    Returns:
        The URL after redirects, or ``url`` itself if resolution failed.
    """
    if not url:
        # Nothing to resolve; avoid a request that is guaranteed to fail.
        return url

    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except Exception as e:
        # Deliberately broad: resolution is optional and must never raise.
        print(f"Redirect failed: {e}")
        return url
    return response.url
75
+
76
def _relative_age(age):
    """Render a timedelta as a phrase such as '3 hours ago' or '1 day ago'."""
    # Clamp future-dated items (negative age) to zero instead of wrapping.
    seconds = max(int(age.total_seconds()), 0)
    if seconds >= 86400:
        count, unit = seconds // 86400, "day"
    elif seconds >= 3600:
        count, unit = seconds // 3600, "hour"
    else:
        count, unit = seconds // 60, "minute"
    plural = "" if count == 1 else "s"
    return f"{count} {unit}{plural} ago"


def get_google_news_new(query="AI Agents", cutoff=1):
    """Get Google News articles for ``query`` from the Google News RSS search feed.

    The result dicts mimic the shape produced by the GoogleNews package.

    Args:
        query: Search term(s); URL-encoded before being placed in the feed URL.
        cutoff: Look-back window in days; older entries are skipped.

    Returns:
        A list of dicts with keys ``title``, ``media``, ``domain``, ``date``
        (relative phrase), ``datetime`` (a timezone-aware ``datetime``),
        ``link``, ``desc`` and ``img``. Empty if the feed cannot be read.
    """
    results = []

    print("Assembling news for " + str(query))

    try:
        # quote_plus escapes every reserved character, not only spaces.
        url = f"https://news.google.com/rss/search?q={quote_plus(str(query))}"
        feed = feedparser.parse(url)

        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)

        for entry in feed.entries:
            try:
                stamp = getattr(entry, 'published', None) or getattr(entry, 'updated', None)
                if not stamp:
                    continue

                news_time = dateutil_parser.parse(stamp)
                if news_time.tzinfo is None:
                    # Assume UTC so the aware/naive comparison below cannot raise.
                    news_time = news_time.replace(tzinfo=timezone.utc)

                if news_time < time_cutoff:
                    continue

                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''

                relative_date = _relative_age(datetime.now(timezone.utc) - news_time)

                domain_parts = extract(final_url)
                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"

                # media_content may be missing or an empty list.
                media_content = getattr(entry, 'media_content', None) or [{}]

                article_dict = {
                    'title': getattr(entry, 'title', ''),
                    'media': media,
                    'domain': media,
                    'date': relative_date,
                    'datetime': news_time,
                    'link': final_url,
                    'desc': getattr(entry, 'summary', ''),
                    'img': media_content[0].get('url', ''),
                }

                print(f"{article_dict}\n")

                results.append(article_dict)

            except Exception as inner_e:
                # Per-entry best effort: skip the bad entry, keep the rest.
                print(f"Error parsing entry: {inner_e}")
                continue

    except Exception as e:
        print("Error aggregating news " + str(e))

    print(f"I found {len(results)} items.")
    return results
148
+
149
+ import requests
150
+ import pandas as pd
151
+ from datetime import datetime, timedelta
152
+ from urllib.parse import urlparse
153
+ import time
154
+
155
def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
    """
    Get news articles from NewsAPI.org (Free tier: 1000 requests/month)
    Sign up at: https://newsapi.org/

    Args:
        query: Search term(s).
        cutoff: Look-back window in days.
        api_key: NewsAPI key. When omitted, the ``NEWSAPI`` environment
            variable is used instead.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``date``, ``datetime``,
        ``desc``, ``domain``, ``media`` and ``source``. Empty when no key is
        available or the request fails.
    """
    # The explicit argument wins; the environment variable is only a fallback.
    api_key = api_key or os.getenv('NEWSAPI')
    if not api_key:
        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
        return []

    from_date = (datetime.today() - timedelta(days=cutoff)).strftime('%Y-%m-%d')

    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'from': from_date,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': api_key,
        'pageSize': 50,
    }

    try:
        # A timeout keeps a stalled connection from hanging the caller.
        response = requests.get(url, params=params, timeout=10)
        print(response)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Error fetching from NewsAPI: {e}")
        return []

    raw_articles = data.get('articles') if isinstance(data, dict) else None

    articles = []
    for article in raw_articles or []:
        # Fields may be missing or null; default them so one odd record
        # does not discard the whole response.
        link = article.get('url') or ''
        published = article.get('publishedAt') or ''
        domain = urlparse(link).netloc
        articles.append({
            'title': article.get('title') or '',
            'link': link,
            'date': published[:10],
            'datetime': published,
            'desc': article.get('description') or '',
            'domain': domain,
            'media': domain,
            'source': (article.get('source') or {}).get('name', ''),
        })

    return articles
202
+
203
def get_gnews_articles(query="AI Agents", cutoff=1):
    """
    Get news articles from the GNews search API (https://gnews.io/api/v4/search).

    The API key is read from the ``GNEWSAPI`` environment variable. The raw
    response is also written to ``data_output.json`` in the working directory.

    Args:
        query: Search term(s).
        cutoff: Look-back window in days.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``date``, ``datetime``,
        ``desc``, ``domain``, ``media`` and ``source``. Empty when the key is
        missing or the request fails.
    """
    import json

    api_key = os.getenv('GNEWSAPI')
    if not api_key:
        # Previously a missing variable raised KeyError before any handling.
        print("GNews requires an API key in the GNEWSAPI environment variable.")
        return []

    today = datetime.today()
    from_date = (today - timedelta(days=cutoff)).strftime('%Y-%m-%d')

    url = "https://gnews.io/api/v4/search"
    params = {
        'q': query,
        'apikey': api_key,
        'lang': 'en',
        'max': 25,
        'from': from_date + 'T00:00:00Z',
        'to': today.strftime('%Y-%m-%d') + 'T23:59:59Z',
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Error fetching from GNews: {e}")
        return []

    # The dump is a convenience; failing to write it must not lose the data.
    try:
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)
    except OSError as e:
        print(f"Could not write data_output.json: {e}")

    raw_articles = data.get('articles') if isinstance(data, dict) else None

    rArticles = []
    for article in raw_articles or []:
        link = article.get('url') or ""
        published = article.get('publishedAt') or ""
        domain = urlparse(link).netloc

        rArticles.append({
            'title': article.get('title') or "",
            'link': link,
            'date': published,
            'datetime': published,
            # A null description would break callers that take len() of it.
            'desc': article.get('description') or "",
            'domain': domain,
            'media': domain,
            'source': (article.get('source') or {}).get('name', ""),
        })

    return rArticles
268
+
269
def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
    """
    Get recent papers from Arxiv for a given keyword.
    Uses the Arxiv API (no API key required).

    The collected papers are also written to ``arxiv_output.json`` in the
    working directory.

    Args:
        query (str): Search keyword(s).
        cutoff (int): How many days back to search.
        max_results (int): Maximum number of results to request; values that
            are not a positive int fall back to 25.

    Returns:
        list of dicts with keys ``title``, ``link`` (PDF URL), ``date``,
        ``datetime``, ``desc``, ``authors``, ``source``, ``domain`` and
        ``media``. Empty if the request or XML parsing fails.
    """
    import json
    import requests
    from datetime import datetime, timedelta, timezone
    from urllib.parse import urlencode
    import xml.etree.ElementTree as ET

    if not isinstance(max_results, int) or max_results <= 0:
        max_results = 25  # fallback to safe default

    # Arxiv search syntax: all:keyword; results come back newest first.
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    url = "http://export.arxiv.org/api/query?" + urlencode(params)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except Exception as e:
        print(f"Error fetching from Arxiv: {e}")
        return []

    print(response)

    ns = {"atom": "http://www.w3.org/2005/Atom"}

    def field(node, tag):
        """Return the stripped text of child ``atom:<tag>``, or '' if absent."""
        child = node.find(f"atom:{tag}", ns)
        return (child.text or "").strip() if child is not None else ""

    # 'published' is parsed below as a naive UTC value (trailing 'Z'), so the
    # cutoff is computed as naive UTC as well rather than local time.
    oldest_allowed = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=cutoff)

    papers = []
    for entry in root.findall("atom:entry", ns):
        published = field(entry, "published")
        try:
            published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            # Missing or unexpected timestamp: skip this entry only.
            continue

        if published_dt < oldest_allowed:
            continue

        link = field(entry, "id")
        if not link:
            continue
        # e.g. http://arxiv.org/abs/2509.09656v1 -> http://arxiv.org/pdf/2509.09656v1
        pdf_link = link.replace("/abs/", "/pdf/")

        author_names = (field(author, "name") for author in entry.findall("atom:author", ns))

        papers.append({
            "title": field(entry, "title"),
            "link": pdf_link,
            "date": published,
            "datetime": published_dt.isoformat(),
            "desc": field(entry, "summary"),
            "authors": [name for name in author_names if name],
            "source": "arXiv",
            "domain": "arxiv.org",
            "media": "arxiv.org",
        })

    # The dump is a convenience; failing to write it must not lose the result.
    try:
        with open("arxiv_output.json", "w") as f:
            json.dump(papers, f, indent=2)
    except OSError as e:
        print(f"Could not write arxiv_output.json: {e}")

    return papers
360
+
361
def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
    """
    Parse an RSS feed and return entries that mention ``query``.

    Example RSS feeds:
    - BBC: http://feeds.bbci.co.uk/news/rss.xml
    - Reuters: http://feeds.reuters.com/reuters/topNews
    - AP News: https://rsshub.app/ap/topics/apf-topnews

    Args:
        rss_url: Feed location handed to ``feedparser.parse``.
        query: Case-insensitive substring looked for in the title or summary.
        cutoff: Look-back window in days.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``date``, ``datetime``,
        ``desc`` (summary truncated to 200 characters), ``domain`` and
        ``source``. Empty when feedparser is missing or parsing fails.
    """
    try:
        import feedparser
    except ImportError:
        print("RSS parsing requires feedparser: pip install feedparser")
        return []

    try:
        feed = feedparser.parse(rss_url)
        needle = query.lower()
        source = feed.feed.get('title', 'RSS Feed')
        # NOTE(review): feedparser documents *_parsed dates as UTC, so the
        # cutoff is taken in naive UTC rather than local time. Confirm.
        time_cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=cutoff)

        articles = []
        for entry in feed.entries:
            title = entry.get('title', '') or ''
            summary = entry.get('summary', '') or ''

            # Simple keyword matching
            if needle not in title.lower() and needle not in summary.lower():
                continue

            link = entry.get('link', '') or ''
            parsed = entry.get('published_parsed') or entry.get('updated_parsed')
            if not link or not parsed:
                continue

            try:
                pub_date = datetime(*parsed[:6])
            except (TypeError, ValueError):
                # Malformed date tuple: skip this entry only.
                continue

            if pub_date < time_cutoff:
                continue

            articles.append({
                'title': title,
                'link': link,
                'date': pub_date.strftime('%Y-%m-%d'),
                'datetime': pub_date.isoformat(),
                'desc': summary[:200] + '...' if len(summary) > 200 else summary,
                'domain': urlparse(link).netloc,
                'source': source,
            })

        return articles
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []
403
+
404
+ # Example usage function that mirrors your original structure
405
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
    """
    Main function to get news articles from various sources

    Args:
        query: Search term
        cutoff_days: How many days back to search (the 'arxiv' source uses
            a fixed 90-day window and at most 10 results instead)
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
        api_key: API key, forwarded to the 'newsapi' source only

    Returns:
        A 2-tuple ``(articles, frame)``:
        - on success: the list of article dicts and a display DataFrame with
          columns Index, Title, Link, Date, Description, Domain;
        - when nothing is found: a message string and an empty DataFrame;
        - for an unknown ``api_choice``: an empty list and an empty DataFrame.
    """

    if api_choice == "newsapi":
        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
    elif api_choice == "arxiv":
        news_articles = get_arxiv_papers(query, 90, 10)
    elif api_choice == "gnews":
        news_articles = get_gnews_articles(query, cutoff_days)
    elif api_choice == "rss":
        # Example with BBC RSS feed
        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
    else:
        print("Invalid API choice")
        return [], pd.DataFrame()

    if not news_articles:
        return "No news articles found for the given query and time period.", pd.DataFrame()

    # Create DataFrame for display (matching your original structure)
    display_data = []
    for i, article in enumerate(news_articles):
        # Sources may hand back a null description; treat it as empty so the
        # length check and slice below cannot raise.
        desc = article.get('desc') or ''
        if len(desc) > 100:
            desc = desc[:100] + "..."
        display_data.append({
            'Index': i,
            'Title': article.get('title', ''),
            'Link': article.get('link', ''),
            'Date': article.get('date', ''),
            'Description': desc,
            'Domain': article.get('domain', ''),
        })

    return news_articles, pd.DataFrame(display_data)