Create _data_fetcher.py
_data_fetcher.py  ADDED  (+146, -0)

@@ -0,0 +1,146 @@
"""Data-fetching helpers for the fact checker: SerpApi search, URL safety
checks (Google Safe Browsing, VirusTotal), article extraction, and SERP
aggregation."""

import base64
import json
import logging
from collections import Counter
from functools import lru_cache
from typing import Any, Dict, Tuple

import requests
from bs4 import BeautifulSoup

from ._config import SERPAPI_KEY, SAFE_BROWSING_KEY, VIRUSTOTAL_KEY, SOURCE_TRUST
from ._utils import is_host_public, domain_from_url, sanitize_text

logger = logging.getLogger("fact_checker_fetcher")


@lru_cache(maxsize=256)
def serpapi_search(query: str, type: str, num: int = 6) -> Dict[str, Any]:
    """Unified function for SerpApi web search and reverse image search.

    Results are memoized per (query, type, num); note that error payloads
    are cached too, so a transient failure sticks until the entry rotates out.
    """
    if not SERPAPI_KEY:
        return {"available": False, "note": "SERPAPI_KEY not set"}
    try:
        params = {"engine": "google", "q": query, "num": num, "api_key": SERPAPI_KEY}
        if type == "reverse_image":
            # SerpApi's reverse image search is a dedicated engine that takes
            # an image URL instead of a text query.
            params["engine"] = "google_reverse_image"
            params.pop("q")
            params["image_url"] = query

        r = requests.get("https://serpapi.com/search.json", params=params, timeout=12)
        r.raise_for_status()
        return {"available": True, "result": r.json()}
    except Exception as e:
        logger.exception("SerpApi %s failed", type)
        return {"available": True, "error": str(e)}


def google_safe_browsing_check(url: str) -> dict:
    """Checks a URL against the Google Safe Browsing v4 API."""
    if not SAFE_BROWSING_KEY:
        return {"safe": None, "error": "API Key Missing"}
    try:
        endpoint = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={SAFE_BROWSING_KEY}"
        body = {
            "client": {"clientId": "newsorchestra", "clientVersion": "1.0"},
            "threatInfo": {
                "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
                "platformTypes": ["ANY_PLATFORM"],
                "threatEntryTypes": ["URL"],
                "threatEntries": [{"url": url}],
            },
        }
        r = requests.post(endpoint, json=body, timeout=10)
        r.raise_for_status()
        data = r.json()
        # Safe Browsing returns an empty object when there are no matches.
        return {"safe": "matches" not in data, "matches": data.get("matches", [])}
    except Exception as e:
        return {"safe": None, "error": str(e)}


def virustotal_url_check(url: str) -> dict:
    """Checks a URL against the VirusTotal v3 API."""
    if not VIRUSTOTAL_KEY:
        return {"safe": None, "error": "API Key Missing"}
    try:
        headers = {"x-apikey": VIRUSTOTAL_KEY}
        # VT v3 identifies URLs by their unpadded urlsafe-base64 encoding.
        url_id = base64.urlsafe_b64encode(url.encode()).decode().strip("=")
        vt_url = f"https://www.virustotal.com/api/v3/urls/{url_id}"
        r = requests.get(vt_url, headers=headers, timeout=15)

        # If the URL is unknown to VT, submit it for analysis.
        if r.status_code == 404:
            requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}, timeout=15)
            return {"safe": None, "submitted": True}

        r.raise_for_status()
        stats = r.json().get("data", {}).get("attributes", {}).get("last_analysis_stats", {})
        malicious = stats.get("malicious", 0)
        suspicious = stats.get("suspicious", 0)
        return {"safe": malicious == 0 and suspicious == 0, "malicious_votes": malicious, "suspicious_votes": suspicious}
    except Exception as e:
        return {"safe": None, "error": str(e)}


def phishing_checks(url: str) -> dict:
    """Combines Google Safe Browsing and VirusTotal checks."""
    if not url:
        return {}
    return {
        "url": url,
        "safe_browsing": google_safe_browsing_check(url),
        "virustotal": virustotal_url_check(url),
    }


def fetch_article_text_from_url(url: str) -> Tuple[str, str]:
    """Fetches and extracts the main article text and headline from a URL."""
    if not is_host_public(url):
        logger.warning("Blocked fetch_article_text_from_url for private host: %s", url)
        return "", ""
    try:
        r = requests.get(url, timeout=10, headers={"User-Agent": "newsorchestra/1.0"})
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # 1. Try JSON-LD/Schema.org extraction first.
        for s in soup.find_all("script", type="application/ld+json"):
            try:
                parsed = json.loads(s.string or s.get_text())
                items = parsed if isinstance(parsed, list) else [parsed]
                for item in items:
                    if isinstance(item, dict) and item.get("@type") in ("NewsArticle", "Article"):
                        headline = item.get("headline") or item.get("name") or ""
                        body = item.get("articleBody") or item.get("description") or ""
                        if isinstance(body, list):
                            body = " ".join(str(x) for x in body if x)
                        if body:
                            return sanitize_text(str(body)), sanitize_text(str(headline))
            except Exception:
                continue

        # 2. Fall back to general HTML parsing.
        article_tag = soup.find("article")
        if article_tag:
            paras = [p.get_text(" ", strip=True) for p in article_tag.find_all("p")]
        else:
            main = soup.find("main") or soup.find(id="main") or soup
            paras = [p.get_text(" ", strip=True) for p in main.find_all("p")]
        # Keep only substantial paragraphs to skip nav/boilerplate fragments.
        article_text = "\n\n".join(p for p in paras if len(p) > 40)
        headline = soup.title.get_text(strip=True) if soup.title else ""

        return article_text or "", headline or ""
    except Exception:
        logger.exception("fetch_article_text_from_url failed")
        return "", ""


def aggregate_search_results(serpapi_result: dict) -> Dict[str, Any]:
    """Analyzes SERP results for domain trust and counts."""
    if not serpapi_result or not serpapi_result.get("available") or not serpapi_result.get("result"):
        return {"evidence": [], "consensus": {"top_trust_avg": 0.5, "top_domains": {}}}

    organic = serpapi_result["result"].get("organic_results", []) or []
    evidence = []
    domains = Counter()
    for r in organic[:12]:
        link = r.get("link") or r.get("displayed_link") or ""
        domain = domain_from_url(link)
        trust = SOURCE_TRUST.get(domain, 0.6)  # neutral default for unlisted domains
        evidence.append({
            "title": r.get("title", ""), "snippet": r.get("snippet", ""), "link": link,
            "domain": domain, "trust": round(trust, 2),
        })
        if domain:
            domains[domain] += 1

    # Average trust of the top three hits approximates consensus quality.
    top3 = evidence[:3]
    top_trust_avg = sum(e["trust"] for e in top3) / len(top3) if top3 else 0.5

    return {
        "evidence": evidence,
        "consensus": {"top_trust_avg": round(top_trust_avg, 2), "top_domains": dict(domains.most_common(5))},
    }
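
For orientation, here is a minimal sketch of how a caller might compose these helpers. The package name (`fact_checker`), the example URL, and the 0.7 trust threshold are illustrative assumptions, not part of this commit:

# Hypothetical caller (assumed package name "fact_checker").
from fact_checker._data_fetcher import (
    phishing_checks, fetch_article_text_from_url,
    serpapi_search, aggregate_search_results,
)

url = "https://example.com/some-article"

safety = phishing_checks(url)                # {"url": ..., "safe_browsing": ..., "virustotal": ...}
body, headline = fetch_article_text_from_url(url)

serp = serpapi_search(headline, type="web")  # any type other than "reverse_image" is a web search
summary = aggregate_search_results(serp)

# 0.7 is an arbitrary illustrative threshold, not from the source.
if summary["consensus"]["top_trust_avg"] >= 0.7:
    print("Claim is echoed by relatively trusted domains:", summary["consensus"]["top_domains"])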
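The module also leans on two siblings, `_config.py` and `_utils.py`, which are not part of this commit. A plausible minimal shape for them, purely as an assumption so the sketch above runs; the real files may differ:

# _config.py — assumed shape: keys from the environment, hand-curated trust table.
import os

SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "")
SAFE_BROWSING_KEY = os.environ.get("SAFE_BROWSING_KEY", "")
VIRUSTOTAL_KEY = os.environ.get("VIRUSTOTAL_KEY", "")
# domain -> trust in [0, 1]; the fetcher falls back to 0.6 for unlisted domains.
SOURCE_TRUST = {"reuters.com": 0.95, "apnews.com": 0.95, "bbc.com": 0.9}

# _utils.py — assumed shape.
import ipaddress
import socket
from urllib.parse import urlparse

def domain_from_url(url: str) -> str:
    """Hostname with any leading 'www.' removed."""
    host = (urlparse(url).hostname or "").lower()
    return host[4:] if host.startswith("www.") else host

def is_host_public(url: str) -> bool:
    """Basic SSRF guard: allow only hosts resolving to globally routable addresses."""
    host = urlparse(url).hostname or ""
    try:
        infos = socket.getaddrinfo(host, None)
        return bool(infos) and all(
            ipaddress.ip_address(info[4][0]).is_global for info in infos
        )
    except (OSError, ValueError):
        return False

def sanitize_text(text: str) -> str:
    """Collapse runs of whitespace into single spaces."""
    return " ".join(text.split())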