Spaces:

jsemrau
/

Postwriter

Runtime error

App Files Files Community

jsemrau committed on Sep 13, 2025

Commit

9e5dff8

1 Parent(s): 3e15bc5

added connector

Browse files

Files changed (1) hide show

connector.py +445 -0

connector.py ADDED Viewed

	@@ -0,0 +1,445 @@

+import gradio as gr
+import time
+from datetime import date, timedelta, timezone, datetime
+import os
+import pandas as pd
+import numpy as np
+import logging
+import requests
+from bs4 import BeautifulSoup
+import urllib.parse
+import dateutil.parser
+from dateutil import parser as dateutil_parser
+from tldextract import extract
+from urllib.parse import quote_plus
+from collections import defaultdict
+from dotenv import load_dotenv
+from GoogleNews import GoogleNews
+import feedparser
+# Your existing functions (unchanged)
+def get_google_news(query="AI Agents", cutoff=1):
+    """Get Google News articles based on query"""
+    days = cutoff
+    language = 'en'
+    to_day = datetime.today().strftime('%m/%d/%Y')
+    from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
+    blackList=' -site:winbuzzer.com  -site:x.com -site:threads.com -site:instagram.com -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com -site:youtube.com -site:newser.com -site:adexchanger.com -india -crypto -blockchain -bitcoin -DeFi'
+    tQuery=query+blackList
+    str_div = []
+    print(f"Assembling news with cutoff {cutoff} for query: {str(tQuery)} ")
+    try:
+        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
+        googlenews.search(tQuery)
+        page1 = googlenews.result()
+        df = pd.DataFrame(page1)
+        time_cutoff = datetime.now() - timedelta(days=cutoff)
+        for index, row in df.iterrows():
+            try:
+                news_time = dateutil.parser.parse(str(row['datetime']))
+                if news_time >= time_cutoff:
+                    domain = extract(row['link']).domain
+                    str_a = row.to_dict()
+                    str_a['datetime'] = str(news_time)
+                    str_a.update({'domain': domain})
+                    str_div.append(str_a)
+                else:
+                    print(f" Skipping {news_time} > {time_cutoff}")
+            except Exception as inner_e:
+                print(f"Error parsing datetime for row {index}: {inner_e}")
+                continue
+    except Exception as e:
+        print("Error aggregating news " + str(e))
+    return str_div
+def resolve_redirect(url):
+    try:
+        response = requests.head(url, allow_redirects=True, timeout=5)
+        return response.url
+    except Exception as e:
+        print(f"Redirect failed: {e}")
+        return url
+def get_google_news_new(query="AI Agents", cutoff=1):
+    """Get Google News articles based on query using RSS feed, output similar to GoogleNews package"""
+    results = []
+    print("Assembling news for " + str(query))
+    try:
+        # Create RSS URL with proper encoding
+        encoded_query = query.replace(' ', '+')
+        url = f"https://news.google.com/rss/search?q={encoded_query}"
+        # Parse the RSS feed
+        feed = feedparser.parse(url)
+        # Set time cutoff
+        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)
+        for entry in feed.entries:
+            try:
+                # Parse the published or updated date
+                if hasattr(entry, 'published'):
+                    news_time = dateutil_parser.parse(entry.published)
+                elif hasattr(entry, 'updated'):
+                    news_time = dateutil_parser.parse(entry.updated)
+                else:
+                    continue
+                # Skip old articles
+                if news_time < time_cutoff:
+                    continue
+                # Resolve final article URL
+                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''
+                # Estimate relative time (like '3 hours ago')
+                time_diff = datetime.now(timezone.utc) - news_time
+                if time_diff.days > 0:
+                    relative_date = f"{time_diff.days} days ago"
+                elif time_diff.seconds >= 3600:
+                    relative_date = f"{time_diff.seconds // 3600} hours ago"
+                else:
+                    relative_date = f"{time_diff.seconds // 60} minutes ago"
+                # Extract domain for media name
+                domain_parts = extract(final_url)
+                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"
+                # Build result dict
+                article_dict = {
+                    'title': entry.title if hasattr(entry, 'title') else '',
+                    'media': media,
+                    'domain': media,
+                    'date': relative_date,
+                    'datetime': news_time,
+                    'link': final_url,
+                    'desc': entry.summary if hasattr(entry, 'summary') else '',
+                    'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
+                }
+                print(f"{article_dict}\n")
+                results.append(article_dict)
+            except Exception as inner_e:
+                print(f"Error parsing entry: {inner_e}")
+                continue
+    except Exception as e:
+        print("Error aggregating news " + str(e))
+    print(f"I found {len(results)} items.")
+    return results
+import requests
+import pandas as pd
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+import time
+def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
+    """
+    Get news articles from NewsAPI.org (Free tier: 1000 requests/month)
+    Sign up at: https://newsapi.org/
+    """
+    if not api_key:
+        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
+        return []
+    days = cutoff
+    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
+    newsapi_key=os.getenv('NEWSAPI')
+    url = "https://newsapi.org/v2/everything"
+    params = {
+        'q': query,
+        'from': from_date,
+        'sortBy': 'publishedAt',
+        'language': 'en',
+        'apiKey': newsapi_key,
+        'pageSize': 50
+    }
+    try:
+        response = requests.get(url, params=params)
+        print(response)
+        response.raise_for_status()
+        data = response.json()
+        articles = []
+        for article in data.get('articles', []):
+            domain = urlparse(article['url']).netloc
+            articles.append({
+                'title': article['title'],
+                'link': article['url'],
+                'date': article['publishedAt'][:10],
+                'datetime': article['publishedAt'],
+                'desc': article['description'] or '',
+                'domain': domain,
+                'source': article['source']['name']
+            })
+        return articles
+    except Exception as e:
+        print(f"Error fetching from NewsAPI: {e}")
+        return []
+def get_gnews_articles(query="AI Agents", cutoff=1):
+    """
+    Get news articles from GNews (No API key required, but has rate limits)
+    Completely free but limited to 100 requests per day
+    """
+    import json
+    days = cutoff
+    api_key=os.environ['GNEWSAPI']
+    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
+    url = "https://gnews.io/api/v4/search"
+    #https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY
+    params = {
+        'q': query,
+        'apikey':api_key,
+        'lang': 'en',
+        'max': 25,
+        'from': from_date + 'T00:00:00Z',
+        'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
+    }
+    try:
+        response = requests.get(url, params=params)
+        response.raise_for_status()
+        data = response.json()
+        with open('data_output.json', 'w') as f:
+            json.dump(response.json(), f, indent=2)
+    except Exception as e:
+        print(f"Error fetching from GNews: {e}")
+        return []
+    articles = data.get('articles', [])
+    rArticles=[]
+    for article in articles:
+            #try:
+            link= article.get('url', "")
+            domain = urlparse(link).netloc
+            rArticles.append({
+                'title': article['title'],
+                'link':  article.get('url', ""),
+                'date': article.get('publishedAt', ""),
+                'datetime': article.get('publishedAt', ""),
+                'desc':  article.get('description', ""),
+                'domain': domain,
+                'media': domain,
+                'source': article['source']['name'],
+            })
+            #except Exception as e:
+            #    print(f"Error preparing from GNews: {e}")
+            #    continue
+    return rArticles
+def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
+    """
+    Get recent papers from Arxiv for a given keyword.
+    Uses the Arxiv API (no API key required).
+    Args:
+        query (str): Search keyword(s).
+        cutoff (int): How many days back to search.
+        max_results (int): Maximum number of results to return.
+    Returns:
+        list of dicts with paper metadata.
+    """
+    import json
+    import requests
+    from datetime import datetime, timedelta
+    from urllib.parse import urlencode
+    import xml.etree.ElementTree as ET
+    # Calculate date range
+    from_date = (datetime.today() - timedelta(days=cutoff)).strftime('%Y%m%d%H%M%S')
+    to_date = datetime.today().strftime('%Y%m%d%H%M%S')
+    # Arxiv API endpoint
+    base_url = "http://export.arxiv.org/api/query?"
+    if not isinstance(max_results, int) or max_results <= 0:
+            max_results = 25  # fallback to safe default
+    # Construct query (Arxiv search syntax: all:keyword)
+    search_query = f"all:{query}"
+    params = {
+        "search_query": search_query,
+        "start": 0,
+        "max_results": max_results,
+        "sortBy": "submittedDate",
+        "sortOrder": "descending",
+    }
+    url = base_url + urlencode(params)
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        root = ET.fromstring(response.text)
+    except Exception as e:
+        print(f"Error fetching from Arxiv: {e}")
+        return []
+    print(response)
+    ns = {"atom": "http://www.w3.org/2005/Atom"}
+    papers = []
+    for entry in root.findall("atom:entry", ns):
+        published = entry.find("atom:published", ns).text
+        published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
+        # Filter by cutoff
+        if published_dt < (datetime.today() - timedelta(days=cutoff)):
+            continue
+        link = entry.find("atom:id", ns).text
+        pdf_link = link.replace("/abs/", "/pdf/")
+        title = entry.find("atom:title", ns).text.strip()
+        summary = entry.find("atom:summary", ns).text.strip()
+        authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]
+        papers.append({
+            "title": title,
+            "link": pdf_link,
+            "date": published,
+            "datetime": published_dt.isoformat(),
+            "desc": summary,
+            "authors": authors,
+            "source": "arXiv",
+            "domain": "arxiv.org",
+            "media": "arxiv.org",
+        })
+        # "link": "http://arxiv.org/abs/2509.09656v1",
+        #https://arxiv.org/pdf/2509.09656v1
+    # Optional: save to JSON
+    with open("arxiv_output.json", "w") as f:
+        json.dump(papers, f, indent=2)
+    return papers
+def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
+    """
+    Parse RSS feeds for news articles (Completely free)
+    Example RSS feeds:
+    - BBC: http://feeds.bbci.co.uk/news/rss.xml
+    - Reuters: http://feeds.reuters.com/reuters/topNews
+    - AP News: https://rsshub.app/ap/topics/apf-topnews
+    """
+    try:
+        import feedparser
+        feed = feedparser.parse(rss_url)
+        articles = []
+        time_cutoff = datetime.now() - timedelta(days=cutoff)
+        for entry in feed.entries:
+            # Simple keyword matching
+            if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
+                try:
+                    # Parse publication date
+                    pub_date = datetime(*entry.published_parsed[:6])
+                    if pub_date >= time_cutoff:
+                        domain = urlparse(entry.link).netloc
+                        articles.append({
+                            'title': entry.title,
+                            'link': entry.link,
+                            'date': pub_date.strftime('%Y-%m-%d'),
+                            'datetime': pub_date.isoformat(),
+                            'desc': entry.get('summary', '')[:200] + '...' if len(entry.get('summary', '')) > 200 else entry.get('summary', ''),
+                            'domain': domain,
+                            'source': feed.feed.get('title', 'RSS Feed')
+                        })
+                except:
+                    continue
+        return articles
+    except ImportError:
+        print("RSS parsing requires feedparser: pip install feedparser")
+        return []
+    except Exception as e:
+        print(f"Error parsing RSS feed: {e}")
+        return []
+# Example usage function that mirrors your original structure
+def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
+    """
+    Main function to get news articles from various sources
+    Args:
+        query: Search term
+        cutoff_days: How many days back to search
+        api_choice: 'newsapi', 'guardian', 'currents', 'gnews', or 'rss'
+        api_key: API key if required
+    """
+    if api_choice == "newsapi":
+        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
+    elif api_choice == "arxiv":
+        news_articles = get_arxiv_papers(query, 90, 10)
+    elif api_choice == "gnews":
+        news_articles = get_gnews_articles(query, cutoff_days)
+    elif api_choice == "rss":
+        # Example with BBC RSS feed
+        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
+        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
+    else:
+        print("Invalid API choice")
+        return [], pd.DataFrame()
+    if not news_articles:
+        return "No news articles found for the given query and time period.", pd.DataFrame()
+    # Create DataFrame for display (matching your original structure)
+    display_data = []
+    for i, article in enumerate(news_articles):
+        display_data.append({
+            'Index': i,
+            'Title': article['title'],
+            'Link': article['link'],
+            'Date': article['date'],
+            'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
+            'Domain': article['domain']
+        })
+    return news_articles, pd.DataFrame(display_data)