ANISA09 commited on
Commit
2707d13
·
verified ·
1 Parent(s): 6efdc98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -1391
app.py CHANGED
@@ -11,10 +11,6 @@ from typing import Optional, Dict, Any, Tuple, List
11
  from urllib.parse import urlparse
12
  from functools import lru_cache
13
  from collections import Counter
14
- from urllib.parse import quote_plus
15
- import time
16
-
17
-
18
 
19
  import requests
20
  from bs4 import BeautifulSoup
@@ -72,151 +68,6 @@ SOURCE_TRUST = {
72
  # ------------------------
73
  # Helpers
74
  # ------------------------
75
@lru_cache(maxsize=512)
def build_dork_queries(claim: str) -> Dict[str, str]:
    """
    Build Google-dork style search queries for a claim.

    Returns a mapping of platform name -> query string.  The claim is
    wrapped in double quotes to force phrase matching and reduce noise;
    cached because the same claim is queried per-platform repeatedly.
    """
    core = claim.strip().strip('"')
    phrase = f'"{core}"'
    queries = {
        "web": phrase,
        "x": f'(site:twitter.com OR site:x.com) {phrase}',
        "facebook": f'site:facebook.com {phrase}',
        "instagram": f'(site:instagram.com OR site:instagr.am) {phrase}',
    }
    return queries
89
-
90
def _parse_google_like_results(html: str, max_links: int = 6) -> List[Dict[str, str]]:
    """Try to extract (title, snippet, link) from Google-style HTML. Very best-effort.

    Handles two layouts: Google result blocks (div.g) and the DuckDuckGo
    HTML front-end (div.result).  Returns at most ``max_links`` dicts with
    keys ``title``, ``snippet``, ``link``; on a parse failure the partial
    list collected so far is returned.
    """
    out = []
    try:
        soup = BeautifulSoup(html, "html.parser")
        # Google 'g' blocks
        blocks = soup.find_all("div", class_="g")
        if not blocks:
            # DuckDuckGo's HTML fallback structure
            blocks = soup.find_all("div", class_="result")
        for b in blocks:
            a = b.find("a", href=True)
            title = ""
            snippet = ""
            href = None
            if a:
                href = a.get("href")
                # title may be in <h3> or anchor text
                h3 = b.find("h3")
                if h3:
                    title = h3.get_text(" ", strip=True)
                else:
                    title = a.get_text(" ", strip=True)
                # snippet: Google span, DDG snippet anchor, or any <p> as last resort
                s = b.find("span", class_="aCOpRe") or b.find("a", class_="result__snippet") or b.find("p")
                if s:
                    snippet = s.get_text(" ", strip=True)
            if href and href.startswith("/url?q="):
                # Google sometimes returns redirect format /url?q=<real>&...
                m = re.search(r"/url\?q=(https?[^&]+)", href)
                if m:
                    href = m.group(1)
            if href and href.startswith("http"):
                # Only absolute links are kept; fields truncated to keep payloads small.
                out.append({"title": title[:200], "snippet": snippet[:300], "link": href})
                if len(out) >= max_links:
                    break
    except Exception:
        logger.exception("parse_google_like_results failed")
    return out
129
-
130
def _html_search_free(query: str, num: int = 6, engine: str = "duckduckgo") -> List[Dict[str, str]]:
    """
    Perform a simple GET-based search. Prefer DuckDuckGo HTML front-end as it's friendlier for scraping.
    Returns list of {title, snippet, link}. Best-effort, fragile.

    Any network or HTTP error yields [] (logged at debug level) so callers
    can chain fallbacks.
    """
    # Identify ourselves and request English results; engines vary markup by locale.
    headers = {"User-Agent": "newsorchestra/1.0 (spread-tracker)", "Accept-Language": "en-US,en;q=0.9"}
    try:
        if engine == "duckduckgo":
            url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
            r = requests.get(url, headers=headers, timeout=12)
            r.raise_for_status()
            time.sleep(0.6)  # polite short delay
            return _parse_google_like_results(r.text, max_links=num)
        else:
            # Best-effort Google attempt (more likely to be blocked)
            url = f"https://www.google.com/search?q={quote_plus(query)}&num={num}"
            r = requests.get(url, headers=headers, timeout=12)
            r.raise_for_status()
            time.sleep(0.8)
            return _parse_google_like_results(r.text, max_links=num)
    except Exception as e:
        # Scraping is expected to fail sometimes (blocks, markup changes); stay quiet.
        logger.debug("html_search_free failed for query=%s: %s", query, e)
        return []
153
-
154
def track_spread_for_claim(claim: str, platforms: List[str] = None, max_per_platform: int = 6) -> Dict[str, Any]:
    """
    Track spread for a short claim across platforms using dork queries.
    Returns a dict with per-platform counts and sample links.
    Uses SerpApi if configured; otherwise falls back to free HTML scraping (DuckDuckGo / Google).

    Output keys: claim, checked_at (epoch seconds), platforms (per-platform
    query/count/sample_results/top_domains), spread_fractions (each
    platform's share of total hits, 0..1), total_hits_estimated.
    """
    if not claim:
        return {}
    if platforms is None:
        platforms = ["web", "x", "facebook", "instagram"]

    queries = build_dork_queries(claim)
    out = {"claim": claim, "checked_at": time.time(), "platforms": {}}

    for p in platforms:
        # Fall back to the raw claim when no dork query exists for this platform.
        q = queries.get(p, claim)
        results = []
        # Prefer SerpApi if available
        if SERPAPI_KEY:
            try:
                serp = serpapi_web_search(q, num=max_per_platform)
                if serp.get("available") and serp.get("result"):
                    organic = serp["result"].get("organic_results", []) or []
                    for r in organic[:max_per_platform]:
                        results.append({
                            "title": r.get("title") or "",
                            "snippet": r.get("snippet") or "",
                            "link": r.get("link") or r.get("displayed_link") or ""
                        })
            except Exception:
                logger.exception("SerpApi path failed for spread tracker. Falling back to HTML.")
                results = []
        if not results:
            results = _html_search_free(q, num=max_per_platform, engine="duckduckgo")
            # final fallback: try google
            if not results:
                results = _html_search_free(q, num=max_per_platform, engine="google")

        # De-duplicate by link; everything after the first '&' (tracking params)
        # is ignored for the comparison key.
        unique_links = []
        seen = set()
        for r in results:
            link = (r.get("link") or "").split("&")[0]
            if not link:
                continue
            if link in seen:
                continue
            seen.add(link)
            unique_links.append(r)
        count = len(unique_links)
        top_domains = Counter([_domain_from_url(r["link"]) for r in unique_links if r.get("link")]).most_common(5)
        out["platforms"][p] = {
            "query": q,
            "count": count,
            "sample_results": unique_links[:6],
            "top_domains": top_domains
        }
    # Simple spread score: normalized by sum of counts (0..1) for quick glance
    total = sum([out["platforms"][p]["count"] for p in out["platforms"]])
    spread = {}
    for p in out["platforms"]:
        cnt = out["platforms"][p]["count"]
        spread[p] = round((cnt / total) if total > 0 else 0.0, 3)
    out["spread_fractions"] = spread
    out["total_hits_estimated"] = total
    return out
219
-
220
  def compute_modal_accuracy(verdicts: list, true_labels: list) -> float:
221
  """
222
  verdicts: list of lists of model outputs per claim, e.g. [["True","False"], ["True","True"], ...]
@@ -1380,1245 +1231,4 @@ with gr.Blocks(title=title) as demo2:
1380
  )
1381
 
1382
if __name__ == "__main__":
    # Fix: stray prose ("add a tracking spread which tracks news spread...") had
    # been pasted onto the end of this line, making the file a syntax error.
    # Bind to all interfaces (container hosting); port comes from $PORT, default 7860.
    demo2.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
1384
-
1385
- import os
1386
- import re
1387
- import json
1388
- import logging
1389
- import traceback
1390
- import time
1391
- import io
1392
- import socket
1393
- import ipaddress
1394
- from typing import Optional, Dict, Any, Tuple, List
1395
- from urllib.parse import urlparse
1396
- from functools import lru_cache
1397
- from collections import Counter
1398
-
1399
- import requests
1400
- from bs4 import BeautifulSoup
1401
- import gradio as gr
1402
- from transformers import pipeline
1403
-
1404
- # IMAGE libs
1405
- from PIL import Image, ImageChops, ImageStat, ExifTags
1406
- import imagehash
1407
-
1408
- # --- GEMINI (genai) REQUIRED ---
1409
- try:
1410
- from google import genai
1411
- except Exception:
1412
- raise SystemExit("gemini (genai) Python client not installed. Run: pip install genai")
1413
-
1414
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
1415
- if not GEMINI_API_KEY:
1416
- raise SystemExit("GEMINI_API_KEY env var is required. Export it before running the app.")
1417
-
1418
- # Initialize Gemini client
1419
- try:
1420
- genai_client = genai.Client(api_key=GEMINI_API_KEY)
1421
- except Exception as e:
1422
- raise SystemExit(f"Failed to init genai client: {e}")
1423
-
1424
- # --- optional SerpApi (web evidence) ---
1425
- SERPAPI_KEY = os.getenv("SERPAPI_KEY")
1426
-
1427
- # --- Transformers (HF) auxiliary ML ---
1428
- HF_ZERO_SHOT = os.getenv("HF_ZERO_SHOT", "facebook/bart-large-mnli")
1429
- try:
1430
- zero_shot = pipeline("zero-shot-classification", model=HF_ZERO_SHOT)
1431
- except Exception as e:
1432
- zero_shot = None
1433
- logging.warning("Zero-shot unavailable: %s", e)
1434
-
1435
- # config
1436
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
1437
- logger = logging.getLogger("newsorchestra_gemini")
1438
- SAFE_BROWSING_KEY = os.getenv("SAFE_BROWSING_KEY")
1439
- VIRUSTOTAL_KEY = os.getenv("VIRUSTOTAL_KEY")
1440
-
1441
- CANDIDATE_LABELS = ["True", "False", "Misleading", "Unclear", "Opinionated", "Unsupported"]
1442
-
1443
- SOURCE_TRUST = {
1444
- "reuters.com": 0.95,
1445
- "apnews.com": 0.95,
1446
- "bbc.com": 0.93,
1447
- "theguardian.com": 0.9,
1448
- "nytimes.com": 0.9,
1449
- "washingtonpost.com": 0.9,
1450
- }
1451
-
1452
- # ------------------------
1453
- # Helpers
1454
- # ------------------------
1455
def compute_modal_accuracy(verdicts: list, true_labels: list) -> float:
    """
    Fraction of claims whose modal (most common) model verdict matches truth.

    verdicts: one list of model outputs per claim, e.g. [["True","False"], ...]
    true_labels: ground-truth label per claim.
    Claims with an empty verdict list count as misses; empty or
    length-mismatched inputs yield 0.0.
    """
    if not verdicts or not true_labels or len(verdicts) != len(true_labels):
        return 0.0
    hits = 0
    for outputs, expected in zip(verdicts, true_labels):
        if not outputs:
            continue
        modal_label = Counter(outputs).most_common(1)[0][0]
        hits += int(modal_label == expected)
    return hits / len(true_labels)
1471
-
1472
def google_safe_browsing_check(url: str, api_key: str) -> dict:
    """
    Query the Google Safe Browsing v4 threatMatches.find API for one URL.

    Returns {"safe": False, "matches": [...]} when a threat matches,
    {"safe": True, "matches": []} when clean, and
    {"safe": None, "error": str} when the lookup itself fails.
    """
    try:
        endpoint = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"
        payload = {
            "client": {"clientId": "newsorchestra", "clientVersion": "1.0"},
            "threatInfo": {
                "threatTypes": [
                    "MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"
                ],
                "platformTypes": ["ANY_PLATFORM"],
                "threatEntryTypes": ["URL"],
                "threatEntries": [{"url": url}]
            }
        }
        resp = requests.post(endpoint, json=payload, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        # The API omits "matches" entirely when the URL is clean.
        if "matches" in data:
            return {"safe": False, "matches": data["matches"]}
        return {"safe": True, "matches": []}
    except Exception as e:
        return {"safe": None, "error": str(e)}
1494
-
1495
def virustotal_url_check(url: str, api_key: str) -> dict:
    """
    Look up a URL's reputation in VirusTotal (API v3).

    If VT has never seen the URL (404) it is submitted for scanning and
    {"safe": None, "submitted": True} is returned; otherwise the last
    analysis stats decide safety.  Errors yield {"safe": None, "error": ...}.
    """
    try:
        headers = {"x-apikey": api_key}
        import base64
        # VT v3 addresses URLs by the unpadded urlsafe-base64 of the URL itself.
        url_id = base64.urlsafe_b64encode(url.encode()).decode().strip("=")
        vt_url = f"https://www.virustotal.com/api/v3/urls/{url_id}"
        r = requests.get(vt_url, headers=headers, timeout=15)
        if r.status_code == 404:
            # Unknown URL: queue a scan; no verdict is available this round.
            scan_r = requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}, timeout=15)
            scan_r.raise_for_status()
            return {"safe": None, "submitted": True}
        r.raise_for_status()
        data = r.json()
        stats = data.get("data", {}).get("attributes", {}).get("last_analysis_stats", {})
        malicious = stats.get("malicious", 0)
        suspicious = stats.get("suspicious", 0)
        # Safe only if no engine flagged it at all.
        safe = malicious == 0 and suspicious == 0
        return {"safe": safe, "malicious_votes": malicious, "suspicious_votes": suspicious}
    except Exception as e:
        return {"safe": None, "error": str(e)}
1515
-
1516
def sanitize_text(text: Optional[str]) -> str:
    """Strip HTML-like tags and collapse whitespace runs; None/empty -> ""."""
    if not text:
        return ""
    without_tags = re.sub(r"<[^>]+>", " ", text)
    return re.sub(r"\s+", " ", without_tags).strip()
1522
-
1523
- def _extract_json_from_text(text: str) -> Optional[str]:
1524
- if not text:
1525
- return None
1526
- m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S | re.I)
1527
- if m:
1528
- return m.group(1)
1529
- start = None
1530
- depth = 0
1531
- for i, ch in enumerate(text):
1532
- if ch == "{":
1533
- if start is None:
1534
- start = i
1535
- depth += 1
1536
- elif ch == "}":
1537
- if depth > 0:
1538
- depth -= 1
1539
- if depth == 0 and start is not None:
1540
- return text[start:i+1]
1541
- return None
1542
-
1543
def _safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
    """Extract and parse a JSON object from model output; None if absent or invalid."""
    candidate = _extract_json_from_text(raw_text)
    if not candidate:
        return None
    try:
        return json.loads(candidate)
    except Exception:
        return None
1551
-
1552
- def _domain_from_url(url: str) -> str:
1553
- try:
1554
- host = urlparse(url).hostname or ""
1555
- return host.lower().lstrip("www.")
1556
- except Exception:
1557
- return ""
1558
-
1559
- def _is_host_public(url: str) -> bool:
1560
- """
1561
- Return False if the hostname resolves to private/local/reserved addresses or is an obvious local name.
1562
- Protects against SSRF attempts when downloading arbitrary URLs.
1563
- """
1564
- try:
1565
- parsed = urlparse(url)
1566
- host = parsed.hostname or ""
1567
- if not host:
1568
- return False
1569
- host = host.strip().lower()
1570
- if host in ("localhost", "ip6-localhost", "::1"):
1571
- return False
1572
- # If host is an IP literal
1573
- try:
1574
- ip = ipaddress.ip_address(host)
1575
- except ValueError:
1576
- # resolve DNS once
1577
- try:
1578
- infos = socket.getaddrinfo(host, None)
1579
- addr = infos[0][4][0]
1580
- ip = ipaddress.ip_address(addr)
1581
- except Exception:
1582
- # If we can't resolve, treat as non-public to be conservative
1583
- return False
1584
- if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_multicast or ip.is_reserved:
1585
- return False
1586
- return True
1587
- except Exception:
1588
- return False
1589
-
1590
def phishing_checks(url: str) -> dict:
    """
    Run the configured URL-reputation checks (Safe Browsing, VirusTotal).

    Checks whose API key env var is unset are left as None in the result;
    an empty URL yields {}.
    """
    if not url:
        return {}
    report = {"url": url, "safe_browsing": None, "virustotal": None}
    if SAFE_BROWSING_KEY:
        report["safe_browsing"] = google_safe_browsing_check(url, SAFE_BROWSING_KEY)
    if VIRUSTOTAL_KEY:
        report["virustotal"] = virustotal_url_check(url, VIRUSTOTAL_KEY)
    return report
1599
-
1600
- # ------------------------
1601
- # SerpApi wrappers (optional) with caching
1602
- # ------------------------
1603
@lru_cache(maxsize=256)
def serpapi_web_search(query: str, num: int = 6) -> dict:
    """
    Google search via SerpApi, memoized per (query, num).

    Returns {"available": False, "note": ...} when no key is configured,
    {"available": True, "result": <json>} on success, and
    {"available": True, "error": ...} on failure.
    NOTE(review): lru_cache also memoizes the error dict, so a transient
    failure sticks for the process lifetime — confirm this is intended.
    """
    if not SERPAPI_KEY:
        return {"available": False, "note": "SERPAPI_KEY not set"}
    try:
        r = requests.get("https://serpapi.com/search.json",
                         params={"engine": "google", "q": query, "num": num, "api_key": SERPAPI_KEY},
                         timeout=12)
        r.raise_for_status()
        return {"available": True, "result": r.json()}
    except Exception as e:
        logger.exception("SerpApi search failed")
        return {"available": True, "error": str(e)}
1616
-
1617
@lru_cache(maxsize=256)
def serpapi_reverse_image(image_url: str, num: int = 6) -> dict:
    """
    Best-effort "reverse image" lookup via SerpApi, memoized per URL.

    NOTE(review): this issues a plain Google text search with the image URL
    as the query string (engine="google"), not a true reverse-image search —
    confirm whether engine="google_reverse_image" was intended.
    Return shape mirrors serpapi_web_search (errors are memoized too).
    """
    if not SERPAPI_KEY:
        return {"available": False, "note": "SERPAPI_KEY not set"}
    try:
        r = requests.get("https://serpapi.com/search.json",
                         params={"engine": "google", "q": image_url, "num": num, "api_key": SERPAPI_KEY},
                         timeout=12)
        r.raise_for_status()
        return {"available": True, "result": r.json()}
    except Exception as e:
        logger.exception("SerpApi reverse failed")
        return {"available": True, "error": str(e)}
1630
-
1631
- # ------------------------
1632
- # Image analysis helpers
1633
- # ------------------------
1634
- MAX_BYTES = 6 * 1024 * 1024 # 6MB
1635
- ALLOWED_CONTENT_PREFIXES = ("image/",)
1636
-
1637
def download_image_bytes(url: str, timeout: int = 12) -> Tuple[Optional[bytes], Optional[str]]:
    """
    Stream-download an image with SSRF and size guards.

    Returns (bytes, content_type) on success; (None, content_type) when the
    response is not an image or exceeds MAX_BYTES; (None, None) for blocked
    hosts or network errors.
    NOTE(review): the public-host check runs on the original URL only, and
    requests follows redirects — a redirect to a private address would
    bypass _is_host_public.  Consider allow_redirects=False or re-checking
    the final URL.
    """
    try:
        if not _is_host_public(url):
            logger.warning("Blocked image download for private/local host: %s", url)
            return None, None

        with requests.get(url, timeout=timeout, stream=True, headers={"User-Agent": "newsorchestra/1.0"}) as r:
            r.raise_for_status()
            ct = r.headers.get("Content-Type", "")
            if not any(ct.startswith(p) for p in ALLOWED_CONTENT_PREFIXES):
                logger.warning("Rejected non-image content-type: %s", ct)
                return None, ct
            buf = io.BytesIO()
            total = 0
            # Stream in 8 KiB chunks; enforce the size cap before buffering more.
            for chunk in r.iter_content(8192):
                if not chunk:
                    break
                total += len(chunk)
                if total > MAX_BYTES:
                    logger.warning("Image too large (%d bytes)", total)
                    return None, ct
                buf.write(chunk)
            return buf.getvalue(), ct
    except Exception as e:
        logger.warning("download_image_bytes failed: %s", e)
        return None, None
1663
-
1664
def extract_exif_from_bytes(img_bytes: bytes) -> dict:
    """
    Read EXIF metadata from raw image bytes.

    Returns {"has_exif", "exif", "has_gps"}.  GPS data is flagged via
    has_gps but stripped from the returned tags (privacy).  Any failure
    returns the all-empty default dict.
    """
    out = {"has_exif": False, "exif": {}, "has_gps": False}
    try:
        img = Image.open(io.BytesIO(img_bytes))
        # _getexif only exists on some formats (e.g. JPEG); default to no EXIF.
        exif_raw = getattr(img, "_getexif", lambda: None)()
        if not exif_raw:
            return out
        # Translate numeric EXIF tag ids to readable names where known.
        exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()}
        out["has_exif"] = True
        if "GPSInfo" in exif:
            out["has_gps"] = True
            exif.pop("GPSInfo", None)
        out["exif"] = exif
        return out
    except Exception:
        return out
1680
-
1681
def error_level_analysis_score(img_bytes: bytes, quality: int = 90) -> dict:
    """
    Crude error-level analysis (ELA): re-save the image as JPEG and measure
    the mean per-channel difference against the original.  Regions that
    recompress differently can indicate edits — heuristic only.
    Returns {"available": True, "ela_score": float} or {"available": False}
    when the bytes cannot be processed.
    """
    out = {"available": False}
    try:
        orig = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        buf = io.BytesIO()
        orig.save(buf, "JPEG", quality=quality)
        recompr = Image.open(io.BytesIO(buf.getvalue())).convert("RGB")
        diff = ImageChops.difference(orig, recompr)
        stat = ImageStat.Stat(diff)
        # Collapse per-band means into a single scalar score.
        mean_val = sum(stat.mean)/len(stat.mean)
        out.update({"available": True, "ela_score": round(float(mean_val), 3)})
        return out
    except Exception:
        return out
1695
-
1696
def compute_phash(img_bytes: bytes) -> dict:
    """
    Compute a perceptual hash (pHash) for duplicate / near-duplicate image
    detection.  Returns {"available": True, "phash": "<hex>"} or
    {"available": False} when the bytes cannot be decoded.
    """
    try:
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        ph = imagehash.phash(img)
        return {"available": True, "phash": str(ph)}
    except Exception:
        return {"available": False}
1703
-
1704
def analyze_image_url(image_url: str) -> dict:
    """
    Run the full image pipeline on a URL: download (SSRF/size guarded),
    EXIF extraction, ELA, pHash, and — when a SerpApi key is configured —
    a reverse-image lookup.  On download failure returns a dict with
    "fetched": False and an "error" key.
    """
    result = {"image_url": image_url, "fetched": False}
    b, ct = download_image_bytes(image_url)
    if not b:
        result["error"] = "download failed"
        return result
    result["fetched"] = True
    result["content_type"] = ct
    result["bytes_length"] = len(b)
    result["exif"] = extract_exif_from_bytes(b)
    result["ela"] = error_level_analysis_score(b)
    result["phash"] = compute_phash(b)
    result["serpapi_reverse"] = serpapi_reverse_image(image_url) if SERPAPI_KEY else {"available": False}
    return result
1718
-
1719
- # ------------------------
1720
- # Gemini functions
1721
- # ------------------------
1722
- GENAI_MODEL = os.getenv("GENAI_MODEL", "gemini-2.5-flash")
1723
-
1724
def gemini_generate_claim_from_image(image_url: str) -> Optional[str]:
    """
    Ask Gemini to propose a short, cautious factual claim for an image URL.

    Downloads the image (reusing the SSRF/size guards), prompts the model
    for a fenced-JSON {claim, rationale} answer, and returns the sanitized
    claim (<=400 chars) — or a fallback sentence built from the rationale.
    Returns None when the download, the model call, or parsing fails.
    """
    try:
        img_bytes, _ = download_image_bytes(image_url)
        if not img_bytes:
            return None
        img = Image.open(io.BytesIO(img_bytes))

        prompt = (
            "You are a cautious fact-check assistant.\n"
            "Look at the image and, ONLY IF you can identify a plausible short factual claim about the main subject, "
            "return a JSON object **ONLY** inside triple backticks, with the exact keys: claim, rationale.\n\n"
            "Rules:\n"
            "- If you can propose a factual testable claim, set \"claim\" to a short sentence (<= 140 chars) starting with "
            "\"Auto-generated (unverified):\" and use cautious phrasing like 'appears to show' or 'is claimed to show'.\n"
            "- If you cannot identify a testable factual claim, set \"claim\": null and provide a short rationale.\n"
            "- DO NOT output any prose outside the fenced JSON block.\n\n"
            "Example output:\n"
            "```json\n"
            "{\"claim\": \"Auto-generated (unverified): The photo appears to show the mayor speaking at the flood site.\", "
            "\"rationale\": \"person at podium, banner text, context implies event\"}\n"
            "```\n"
        )
        resp = genai_client.models.generate_content(
            model=GENAI_MODEL,
            contents=[prompt, img]
        )
        # The client may return a structured response; fall back to str(resp).
        raw = getattr(resp, "text", None) or str(resp)
        parsed = _safe_parse_gemini_json(raw)
        if parsed is not None:
            claim = parsed.get("claim")
            rationale = parsed.get("rationale", "")
            if claim:
                return sanitize_text(claim)[:400]
            if rationale:
                # Model explicitly declined to propose a claim; surface its reason.
                return f"Auto-generated (unverified): Image provided; no clear factual claim. Rationale: {sanitize_text(rationale)[:240]}"
        return None
    except Exception:
        logger.exception("Gemini multimodal claim gen failed")
        return None
1763
-
1764
def gemini_extract_claims_from_text(article_text: str, max_claims: int = 3) -> List[Dict[str, str]]:
    """
    Extract up to max_claims testable factual claims from article text.

    Primary path asks Gemini for a fenced-JSON {"claims": [...]} answer;
    if the model call fails, falls back to returning the first long-ish
    sentences of the article as pseudo-claims.  Each entry carries
    "claim" (<=800 chars) and "context" (<=400 chars).
    """
    article_text = sanitize_text(article_text or "")
    if not article_text:
        return []
    prompt = (
        "You are a cautious fact-check assistant. From the following article text, extract up to "
        f"{max_claims} concise, testable factual claims that a fact-checker could verify. "
        "Return ONLY a single fenced JSON block (```json ... ```). The JSON object must have key `claims` "
        "which is a list of objects with `claim` (short sentence <=140 chars) and `context` (short context snippet).\n\n"
        "If the article contains no testable factual claims, return {\"claims\": []}.\n\n"
        "Article:\n"
        "```\n"
        f"{article_text[:12000]}\n"
        "```\n"
    )
    try:
        resp = genai_client.models.generate_content(model=GENAI_MODEL, contents=[prompt])
        raw = getattr(resp, "text", None) or str(resp)
        j = _extract_json_from_text(raw)
        if j:
            parsed = json.loads(j)
            claims = parsed.get("claims") or []
            out = []
            for c in claims[:max_claims]:
                claim_text = sanitize_text(c.get("claim", ""))[:800]
                context = sanitize_text(c.get("context", ""))[:400]
                if claim_text:
                    out.append({"claim": claim_text, "context": context})
            return out
    except Exception:
        logger.exception("Gemini extract claims failed")
        try:
            # Fallback: take the first few reasonably long sentences verbatim.
            sents = re.split(r'(?<=[.!?])\s+', article_text)
            out = []
            for s in sents:
                s_clean = s.strip()
                if len(s_clean) > 30:
                    out.append({"claim": s_clean[:800], "context": s_clean[:400]})
                if len(out) >= max_claims:
                    break
            return out
        except Exception:
            return []
    # NOTE(review): reachable when Gemini answered but produced no parsable
    # JSON — the function then returns None rather than []; confirm intended.
1807
-
1808
def build_evidence_snippet(serpapi_web: dict, image_analysis: dict) -> str:
    """
    Flatten SerpApi web and reverse-image results into a plain-text
    evidence block ("title :: snippet :: link" per line) for LLM prompts.
    Missing or empty inputs contribute nothing; failures are logged and
    the text built so far is returned.
    """
    text = ""
    try:
        if serpapi_web and serpapi_web.get("result"):
            organic = serpapi_web["result"].get("organic_results", []) or []
            lines = [
                f"{item.get('title','')} :: {item.get('snippet','')} :: {item.get('link','')}"
                for item in organic[:8]
            ]
            if lines:
                text += "WEB EVIDENCE:\n" + "\n".join(lines)
        if image_analysis and image_analysis.get("serpapi_reverse", {}).get("result"):
            reverse_hits = image_analysis["serpapi_reverse"]["result"].get("organic_results", []) or []
            lines = [
                f"{item.get('title','')} :: {item.get('snippet','')} :: {item.get('link','')}"
                for item in reverse_hits[:6]
            ]
            if lines:
                text += "\nREVERSE IMAGE EVIDENCE:\n" + "\n".join(lines)
    except Exception:
        logger.exception("Building evidence snippet failed")
    return text
1828
-
1829
def gemini_verify_claim(claim: str, serpapi_web: dict, image_analysis: dict) -> Dict[str, Any]:
    """
    Ask Gemini for a fact-check verdict on a claim given collected evidence.

    Builds a prompt from web / reverse-image evidence, attaches the image
    itself when one was previously fetched, and expects a fenced-JSON
    answer with keys verdict/overall/issues/citations.  Unparsable or
    failed responses degrade to an "Unclear" verdict dict.
    """
    if not claim:
        return {"verdict": "Unclear", "overall": "No claim provided", "issues": [], "citations": []}
    evidence_snippet = build_evidence_snippet(serpapi_web, image_analysis)
    prompt = (
        "You are a cautious fact-checker. Evaluate the claim and available evidence.\n"
        "Return ONLY a single fenced JSON block with keys: verdict, overall, issues, citations.\n"
        "verdict must be one of: True, False, Mixed, Unsupported, Unclear, Misleading.\n"
        "citations should be a list of objects {source, snippet, url} if possible.\n\n"
        f"Claim:\n{claim}\n\n"
        f"Evidence (may be empty):\n{evidence_snippet}\n\n"
        "Be concise. If you cannot reach a conclusion, use 'Unclear' or 'Unsupported'.\n"
    )
    try:
        contents = [prompt]
        if image_analysis and image_analysis.get("fetched"):
            try:
                # Re-download so the multimodal prompt includes the actual pixels.
                img_bytes, _ = download_image_bytes(image_analysis["image_url"])
                if img_bytes:
                    contents.append(Image.open(io.BytesIO(img_bytes)))
            except Exception:
                logger.exception("Attaching image to Gemini verify failed")
        resp = genai_client.models.generate_content(model=GENAI_MODEL, contents=contents)
        raw = getattr(resp, "text", None) or str(resp)
        parsed = _safe_parse_gemini_json(raw)
        if parsed:
            return parsed
        try:
            # Last chance: maybe the whole response body is bare JSON.
            return json.loads(raw)
        except Exception:
            logger.warning("Gemini verify produced unparsable output: %s", raw[:400])
            return {"verdict": "Unclear", "overall": raw[:400], "issues": ["unparsed"], "citations": []}
    except Exception:
        logger.exception("Gemini verify failed")
        return {"verdict": "Unclear", "overall": "Gemini failure", "issues": ["gemini_failure"], "citations": []}
1864
-
1865
- # ------------------------
1866
- # HF zero-shot
1867
- # ------------------------
1868
def hf_zero_shot_classify(claim: str) -> Dict[str, Any]:
    """
    Classify a claim against CANDIDATE_LABELS with the HF zero-shot pipeline.
    Returns the pipeline's result dict, or {"error": ...} when the pipeline
    is unavailable, the claim is empty, or inference fails.
    """
    if not claim or not zero_shot:
        return {"error": "hf-unavailable"}
    try:
        return zero_shot(claim, candidate_labels=CANDIDATE_LABELS, multi_label=False)
    except Exception:
        logger.exception("HF zero-shot failed")
        return {"error": "hf-failed"}
1876
-
1877
- # ------------------------
1878
- # Aggregator & combiner
1879
- # ------------------------
1880
def aggregate_search_results(serpapi_result: dict) -> Dict[str, Any]:
    """
    Normalize a SerpApi payload into evidence entries with per-domain trust.

    Each organic result (first 12) becomes {title, snippet, link, domain,
    trust}; domains absent from SOURCE_TRUST default to 0.6.  Also returns
    a consensus block (average trust of the top 3 hits, domain frequency
    map) and a flattened snippet string capped at 4000 chars.  Unavailable
    or empty input yields a neutral default structure.
    """
    if not serpapi_result or not serpapi_result.get("available") or not serpapi_result.get("result"):
        return {"evidence": [], "consensus": {"contradicts_claim": False, "top_trust_avg": 0.5, "top_domains": {}}, "raw_snippets": ""}
    res = serpapi_result["result"]
    organic = res.get("organic_results", []) or []
    evidence = []
    domains = {}
    for r in organic[:12]:
        title = r.get("title") or ""
        snippet = r.get("snippet") or ""
        link = r.get("link") or r.get("displayed_link") or ""
        domain = _domain_from_url(link)
        trust = SOURCE_TRUST.get(domain, 0.6)  # neutral-ish default for unlisted outlets
        evidence.append({"title": title, "snippet": snippet, "link": link, "domain": domain, "trust": round(trust, 2)})
        if domain:
            domains[domain] = domains.get(domain, 0) + 1
    # Consensus trust is averaged over the top 3 ranked hits only.
    top3 = evidence[:3]
    top_trust_avg = sum([e["trust"] for e in top3]) / len(top3) if top3 else 0.5
    return {"evidence": evidence,
            "consensus": {"contradicts_claim": False, "top_trust_avg": round(top_trust_avg, 2), "top_domains": domains},
            "raw_snippets": " ".join([e["title"] + " " + e["snippet"] for e in evidence])[:4000]}
1901
-
1902
- def _map_gemini_verdict_to_score(v: str) -> float:
1903
- if not v:
1904
- return 0.0
1905
- vv = v.lower()
1906
- if vv == "true":
1907
- return 1.0
1908
- if vv == "false":
1909
- return -1.0
1910
- if vv in ("mixed", "misleading"):
1911
- return -0.2
1912
- if vv in ("unsupported", "unclear"):
1913
- return 0.0
1914
- return 0.0
1915
-
1916
- def _map_hf_label_to_score(hf_result: dict) -> float:
1917
- try:
1918
- if not hf_result or "labels" not in hf_result:
1919
- return 0.0
1920
- top = hf_result["labels"][0].lower()
1921
- if top == "true":
1922
- return 0.6
1923
- if top == "false":
1924
- return -0.6
1925
- if top == "unsupported":
1926
- return -0.7
1927
- if top == "misleading":
1928
- return -0.4
1929
- return 0.0
1930
- except Exception:
1931
- return 0.0
1932
-
1933
def combine_signals(gemini_verdict: dict, hf_result: dict, evidence_agg: dict) -> Dict[str, Any]:
    """
    Revised combiner that maps to exactly: True | False | Misleading | Unclear

    Blends three signals into one score in roughly [-1, 1]: the Gemini
    verdict (weight 0.55), the HF zero-shot label (0.2) and the average
    trust of the top web sources (0.25), then thresholds the blend.
    Ordering matters: the high-trust corroboration override runs after
    thresholding and can flip Unclear/Misleading to True.
    Returns {"final_verdict", "confidence", "reasons", "final_score"}.
    """
    reasons = []
    g_ver = (gemini_verdict or {}).get("verdict", "Unclear")
    g_overall = (gemini_verdict or {}).get("overall", "")
    g_issues = (gemini_verdict or {}).get("issues", []) or []

    top_trust = evidence_agg.get("consensus", {}).get("top_trust_avg", 0.5)
    evidence_count = len(evidence_agg.get("evidence", []))

    g_score = _map_gemini_verdict_to_score(g_ver)
    hf_score = _map_hf_label_to_score(hf_result)
    # Re-center trust from [0,1] to [-1,1] so 0.5 is neutral.
    trust_norm = (top_trust - 0.5) * 2.0

    # weights
    w_g = 0.55
    w_h = 0.2
    w_e = 0.25

    final_score = w_g * g_score + w_h * hf_score + w_e * trust_norm
    # Confidence grows with the magnitude of the blend, clamped to [0.05, 0.99].
    confidence = min(0.99, max(0.05, 0.4 + abs(final_score) * 0.6))

    # Decision thresholds — tuned conservatively.
    # final_score roughly in [-1,1]
    if final_score >= 0.45:
        label = "True"
        reasons.append("Aggregated signals indicate likely truth")
    elif final_score <= -0.45:
        label = "False"
        reasons.append("Aggregated signals indicate likely falsehood")
    else:
        # borderline: detect likely misleading if models point negative and evidence ambiguous
        if evidence_count >= 2 and hf_score < 0 and final_score < 0.2:
            label = "Misleading"
            reasons.append("Evidence / classifier suggest partial inaccuracy or omission")
        else:
            label = "Unclear"
            reasons.append("Insufficient agreement between models and web evidence")

    # If multiple high-trust sources strongly corroborate -> force True
    if evidence_count >= 2 and top_trust >= 0.7:
        if label in ("Unclear", "Misleading"):
            reasons.append("Multiple high-trust outlets corroborate the core event")
            label = "True"

    # Add Gemini notes
    if g_issues:
        reasons.extend(g_issues if isinstance(g_issues, list) else [str(g_issues)])
    if g_overall:
        reasons.append(f"Gemini note: {g_overall[:240]}")

    return {"final_verdict": label, "confidence": round(confidence, 3), "reasons": reasons, "final_score": round(final_score, 3)}
1987
-
1988
- # ------------------------
1989
- # Q/A formatting (user-friendly)
1990
- # ------------------------
1991
- def _trust_score_pct_from_final_score(final_score: float) -> int:
1992
- """Map final_score (-1..1) to 0..100; clamp."""
1993
- try:
1994
- pct = int((final_score + 1.0) * 50.0)
1995
- pct = max(0, min(100, pct))
1996
- return pct
1997
- except Exception:
1998
- return 50
1999
-
2000
def format_user_friendly_explanation(report_entry: dict) -> str:
    """
    Build a user-facing explanation with exactly three Q&A items:
      1) Why did we reach this verdict? -> short reason summary
      2) How was it verified?           -> sources / checks used
      3) What should you do next?       -> actionable next steps

    Robust to ``reasons`` entries being strings, dicts, or other objects.

    Parameters:
        report_entry: one per-claim report dict produced by the Orchestrator
            (keys used: "reasons", "gemini_verdict", "evidence_agg",
            "hf_classifier", "image_analysis", "phishing_analysis").

    Returns:
        A plain-text string with the three Q&A sections separated by blank lines.
    """

    def _reason_to_text(r) -> str:
        # Normalize a single reason entry (string/dict/other) to a short string.
        try:
            if r is None:
                return ""
            if isinstance(r, str):
                return r.strip()
            if isinstance(r, dict):
                # Prefer common keys if present
                for key in ("reason", "message", "detail", "issue", "note"):
                    if key in r and r[key]:
                        return str(r[key])[:300]
                # Fallback: stringify limited JSON
                try:
                    return json.dumps(r, ensure_ascii=False)[:300]
                except Exception:
                    return str(r)[:300]
            # Fallback for other types
            return str(r)[:300]
        except Exception:
            return ""

    # Q1: Why — build a short reason summary from report_entry["reasons"]
    reasons = report_entry.get("reasons", []) or []
    if isinstance(reasons, (str, dict)):
        reasons = [reasons]

    # Convert up to 3 reasons to text
    reason_texts = []
    for r in reasons[:3]:
        t = _reason_to_text(r)
        if t:
            reason_texts.append(t)
    if reason_texts:
        reasons_text = "; ".join(reason_texts)
    else:
        # fallback to Gemini note or generic text
        gem_notes = (report_entry.get("gemini_verdict") or {}).get("overall", "")
        reasons_text = gem_notes[:300] if gem_notes else "No strong model reasons were returned."

    q1 = f"Q1: Why did we reach this verdict?\nA: {reasons_text}"

    # Q2: How was it verified? — list up to 3 top sources and performed checks
    evidence_agg = report_entry.get("evidence_agg", {}) or {}
    evidence = evidence_agg.get("evidence", []) or []
    top_sources = []
    for e in evidence[:3]:
        domain = e.get("domain") or _domain_from_url(e.get("link") or "")
        title = e.get("title") or ""
        link = e.get("link") or ""
        if link:
            top_sources.append(f"{domain}: {title[:120]} ({link})")
        else:
            top_sources.append(f"{domain}: {title[:120]}")

    top_sources_text = "\n- ".join(top_sources) if top_sources else "No strong web sources found."

    checks = []
    if report_entry.get("gemini_verdict"):
        checks.append("Gemini model analysis")
    hf = report_entry.get("hf_classifier")
    if hf and isinstance(hf, dict) and "labels" in hf:
        checks.append("HF zero-shot classifier")
    if report_entry.get("image_analysis") and report_entry["image_analysis"].get("fetched"):
        checks.append("Image analysis (EXIF / ELA / pHash / reverse-image)")
    # Normalize phishing_analysis to a dict and ensure nested fields are dicts
    phish = report_entry.get("phishing_analysis") or {}
    sb = (phish.get("safe_browsing") or {})
    vt = (phish.get("virustotal") or {})
    phish_notes = []
    try:
        if sb and sb.get("safe") is False:
            phish_notes.append("Safe Browsing flagged the site")
        if vt and vt.get("safe") is False:
            phish_notes.append("VirusTotal flagged the site")
        if not phish_notes and (sb or vt):
            phish_notes.append("Phishing checks performed (no clear flags)")
    except Exception:
        pass
    if phish_notes:
        checks.append("; ".join(phish_notes))

    checks_text = ", ".join(checks) if checks else "Model and web-snippet analysis (no special checks detected)."

    q2_lines = [
        "Q2: How was it verified?",
        "A: Verified by:",
        f"- Top web references (up to 3):\n- {top_sources_text}" if top_sources else f"- Top web references: {top_sources_text}",
        f"- Automated checks: {checks_text}"
    ]
    q2 = "\n".join(q2_lines)

    # Q3: What should you do next? — concise, actionable advice
    next_steps = []
    if top_sources:
        next_steps.append("Read the listed sources for full context and check publication dates.")
        next_steps.append("Cross-check with official channels (government, company, or primary source).")
    else:
        next_steps.append("No strong sources found — seek independent confirmation from trusted outlets before sharing.")
        next_steps.append("If this concerns safety or fraud, check official alerts or regulator pages.")

    # If phishing checks flagged the site, emphasize safety first.
    # Use the normalized sb/vt dicts to avoid NoneType.get errors.
    if (sb.get("safe") is False) or (vt.get("safe") is False):
        next_steps.insert(0, "Do NOT click links from this page; treat it as potentially unsafe and report it.")

    # BUGFIX: bullets were previously joined with spaces (" ".join), producing a
    # single run-on line like "A: - x - y". Join with newlines to match Q2's style.
    q3 = "Q3: What should you do next?\nA:\n" + "\n".join(f"- {s}" for s in next_steps)

    return f"{q1}\n\n{q2}\n\n{q3}"
2120
-
2121
- # ------------------------
2122
- # Robust fetch_article_text_from_url (upgraded)
2123
- # - SSRF-protected
2124
- # - tries JSON-LD extraction first (works for MSN and many publishers)
2125
- # - then OpenGraph/meta description
2126
- # - then readability-lxml (optional)
2127
- # - then <article>/<main> paragraphs
2128
- # - returns (article_text, headline)
2129
- # ------------------------
2130
def _extract_jsonld_from_soup(soup: BeautifulSoup) -> Optional[dict]:
    """Return the first JSON-LD Article-like object found in *soup*, else None.

    Scans every ``<script type="application/ld+json">`` tag, tolerating JSON
    parse errors, top-level lists, and ``@graph`` wrappers. A node matches when
    its ``@type`` is one of NewsArticle / Article / Report.
    """
    article_types = ("NewsArticle", "Article", "Report")
    try:
        for tag in soup.find_all("script", type="application/ld+json"):
            try:
                raw = tag.string or tag.get_text()
                if not raw or not raw.strip():
                    continue
                data = json.loads(raw)
                # A script may hold a single object or a list of them.
                candidates = data if isinstance(data, list) else [data]
                for node in candidates:
                    if not isinstance(node, dict):
                        continue
                    if node.get("@type") in article_types:
                        return node
                    # Some publishers nest the article inside an @graph list.
                    graph = node.get("@graph")
                    if isinstance(graph, list):
                        for entry in graph:
                            if isinstance(entry, dict) and entry.get("@type") in article_types:
                                return entry
            except Exception:
                # Malformed JSON in one script must not abort the scan.
                continue
    except Exception:
        pass
    return None
2155
-
2156
def fetch_article_text_from_url(url: str) -> tuple[str, str]:
    """
    Robust article extractor:
      - Blocks private hosts (SSRF protection)
      - Tries JSON-LD extraction (works for MSN and many publishers)
      - Falls back to OpenGraph/meta, readability-lxml, <article>/<main> paragraph extraction,
        and finally meta description.
    Returns: (article_text, headline) — both "" on any failure.
    """
    try:
        # SSRF guard: refuse to fetch URLs resolving to private/internal hosts.
        if not _is_host_public(url):
            logger.warning("Blocked fetch_article_text_from_url for private host: %s", url)
            return "", ""

        headers = {"User-Agent": "newsorchestra/1.0"}
        # small retry loop for transient issues (exactly 2 attempts; the second
        # failure re-raises and is caught by the outer handler)
        html = ""
        for attempt in range(2):
            try:
                r = requests.get(url, timeout=10, headers=headers)
                r.raise_for_status()
                html = r.text
                break
            except requests.RequestException as e:
                logger.debug("fetch attempt %s failed for %s: %s", attempt + 1, url, e)
                html = ""
                if attempt == 1:
                    raise

        if not html:
            return "", ""

        soup = BeautifulSoup(html, "html.parser")

        # 1) JSON-LD extraction (best for MSN and other modern publishers)
        jld = _extract_jsonld_from_soup(soup)
        if jld:
            # Many JSON-LD objects may have articleBody or description and headline
            headline = jld.get("headline") or jld.get("name") or ""
            body = jld.get("articleBody") or jld.get("description") or ""
            # Some MSN entries store text as a list in "articleBody" or in "mainEntityOfPage"
            if isinstance(body, list):
                body = " ".join([str(x) for x in body if x])
            if body:
                return sanitize_text(str(body)), sanitize_text(str(headline) or "")

        # 2) OpenGraph / meta — og:description alone is enough to return early.
        og_title = (soup.find("meta", property="og:title") or {}).get("content")
        og_desc = (soup.find("meta", property="og:description") or {}).get("content")
        if og_desc:
            return sanitize_text(og_desc), sanitize_text(og_title or "")

        # 3) readability fallback if available (optional dependency; import may fail)
        try:
            from readability import Document  # readability-lxml, optional
            doc = Document(html)
            article_html = doc.summary()
            headline = doc.short_title() or ""
            soup2 = BeautifulSoup(article_html, "html.parser")
            paras = [p.get_text(" ", strip=True) for p in soup2.find_all("p")]
            # keep only substantive paragraphs (>30 chars) to drop nav/boilerplate
            article_text = "\n\n".join([p for p in paras if len(p) > 30])
            if article_text:
                return article_text, headline
        except Exception:
            logger.debug("readability extraction not available or failed; using BeautifulSoup fallback")

        # 4) BeautifulSoup fallback: prefer <article> or <main>
        article_tag = soup.find("article")
        if article_tag:
            paras = [p.get_text(" ", strip=True) for p in article_tag.find_all("p")]
        else:
            main = soup.find("main") or soup.find(id="main") or soup.find(class_="article") or soup
            paras = [p.get_text(" ", strip=True) for p in main.find_all("p")]
        # stricter 40-char threshold here than in the readability branch
        article_text = "\n\n".join([p for p in paras if len(p) > 40])
        headline = soup.title.get_text(strip=True) if soup.title else ""

        # 5) fallback to meta description if no body text
        if not article_text:
            meta = soup.find("meta", {"name": "description"}) or soup.find("meta", {"property": "og:description"})
            if meta and meta.get("content"):
                article_text = meta.get("content", "")

        return article_text or "", headline or ""
    except Exception:
        logger.exception("fetch_article_text_from_url failed")
        return "", ""
2242
-
2243
- # ------------------------
2244
- # on_analyze handler (upgraded)
2245
- # - uses SERP fallback for snippets when article extraction fails
2246
- # - surfaces QA fallback note
2247
- # - infers phishing_tag for frontend convenience
2248
- # ------------------------
2249
def on_analyze(text_or_url: str, image_url: str, run_serp: bool, track_spread: bool = False):
    """
    Main analysis entry point used by the UI layer.

    Accepts either raw article text or an http(s) URL in *text_or_url*,
    optionally an image URL, and runs the Orchestrator over the resulting
    content. Returns a 4-tuple:
        (report dict, qa_text str, extracted_claims list, phish dict)
    On any unexpected error, returns ({"error": traceback}, "", [], {}).
    """
    try:
        txt = (text_or_url or "").strip()
        is_url = bool(re.match(r"^https?://", txt))
        article_text, headline, url = "", "", None
        qa_fallback_note = ""

        if is_url:
            url = txt
            article_text, headline = fetch_article_text_from_url(txt)
            # If extraction failed, compose a SERP-snippet fallback (if available)
            if not article_text and headline and run_serp and SERPAPI_KEY:
                serpapi_result = serpapi_web_search(headline, num=8)
                snippets = [res.get("snippet", "") for res in serpapi_result.get("result", {}).get("organic_results", [])]
                serp_text = "\n\n".join([s for s in snippets if s])[:3000]
                if serp_text:
                    article_text = f"(SERP fallback - extracted snippets for headline: {headline})\n\n{serp_text}"
                    qa_fallback_note = (
                        "Note: we couldn't extract full article text from the URL. "
                        "Analysis used SERP snippets as a fallback — verify date/location in original sources."
                    )
                else:
                    # minimal fallback so claim extraction still has content
                    article_text = f"(No extractable article text) Headline: {headline}"
                    qa_fallback_note = (
                        "Note: no article text could be extracted; analysis used the page headline only."
                    )
        else:
            article_text = txt or ""

        claim = ""  # Orchestrator will generate/extract claims
        report = ORCH.run(
            claim_text=claim,
            article_text=article_text,
            url=url,
            image_url=image_url or None,
            run_serpapi=run_serp,
        )

        # optionally run spread-tracker if asked
        spread_report = {}
        if track_spread:
            # pick best text to track: extracted claims, then headline, then article snippet
            candidate = None
            if report.get("reports"):
                first_claim = report["reports"][0].get("claim")
                if first_claim:
                    candidate = first_claim
            if not candidate:
                candidate = headline or (article_text[:300] if article_text else None)
            if candidate:
                try:
                    spread_report = track_spread_for_claim(candidate, platforms=["web", "x", "facebook", "instagram"], max_per_platform=6)
                    # attach it to the report
                    report["spread_tracking"] = spread_report
                except Exception:
                    logger.exception("spread tracker failed")
                    spread_report = {}

        extracted_claims = [r.get("claim") for r in report.get("reports", [])]
        # Get the first report's QA summary for quick display (if multiple claims you can adapt)
        qa_text = ""
        if report.get("reports"):
            # prefer the report QA summary; if missing and we have a serp fallback note, surface it
            qa_text = report["reports"][0].get("qa_summary", "") or ""
        if (not qa_text) and qa_fallback_note:
            qa_text = qa_fallback_note

        # attach a phishing_tag to the first report and to the returned phishing object
        summary_phish_flag = report.get("summary", {}).get("phishing_flag")
        if summary_phish_flag is True:
            phishing_tag = "Unsafe"
        elif summary_phish_flag is False:
            phishing_tag = "Safe"
        else:
            # if phish data exists per-report, try to infer
            # NOTE(review): assumes report["reports"] is non-empty here — confirm
            # Orchestrator always emits at least one report when the flag is None.
            first_phish = report["reports"][0].get("phishing_analysis", {}) or {}
            sb = (first_phish.get("safe_browsing") or {})
            vt = (first_phish.get("virustotal") or {})
            inferred = None
            try:
                if sb and sb.get("safe") is False:
                    inferred = "Unsafe"
                elif vt and vt.get("safe") is False:
                    inferred = "Unsafe"
                elif sb and sb.get("safe") is True and vt and vt.get("safe") is True:
                    inferred = "Safe"
            except Exception:
                inferred = None
            phishing_tag = inferred or "Unknown"

        # set phishing_tag in the report for frontend convenience
        try:
            report["reports"][0]["phishing_tag"] = phishing_tag
        except Exception:
            pass

        phish = report.get("reports", [{}])[0].get("phishing_analysis", {}) or {}
        # ensure returned phish object includes tag
        try:
            phish = dict(phish)
            phish["phishing_tag"] = report.get("reports", [{}])[0].get("phishing_tag", "Unknown")
        except Exception:
            phish = phish

        # include a short note about spread-tracker usage when requested
        if track_spread:
            qa_text = (qa_text or "") + ("\n\nSpread-tracker: results attached to report['spread_tracking']. Use sparingly; scraping is best-effort.")

        return report, qa_text, extracted_claims, phish

    except Exception:
        logger.exception("on_analyze failed")
        return {"error": traceback.format_exc()}, "", [], {}
2363
-
2364
def verdict_to_str(v):
    """Normalize an arbitrary verdict value to a display string.

    Booleans map to "True"/"False", any falsy value to "Unclear",
    and everything else to its stripped ``str`` form.
    """
    if v is True:
        return "True"
    if v is False:
        return "False"
    return str(v).strip() if v else "Unclear"
2370
-
2371
- # ------------------------
2372
- # Orchestrator
2373
- # ------------------------
2374
class Orchestrator:
    """Coordinates claim extraction, web search, model verification, media
    authenticity checks, and phishing checks into one report dict."""

    def run(self, claim_text: str, article_text: Optional[str], url: Optional[str], image_url: Optional[str],
            run_serpapi: bool = True) -> dict:
        """
        Analyze the given claim/article/url/image and return a report dict:
        {"claims_analyzed", "reports", "summary", "url", "timestamp"}.

        Exactly one of claim_text / article_text / image_url is typically
        provided; article text takes precedence for claim extraction.
        """
        claim_text = sanitize_text(claim_text or "")
        article_text = sanitize_text(article_text or "")

        image_analysis = analyze_image_url(image_url) if image_url else None
        # Always normalize phish_report to a dict to avoid NoneType.get errors later
        phish_report = phishing_checks(url) if url else {}
        if phish_report is None:
            phish_report = {}

        # One broad SERP query up front; may be refined per-claim below.
        serpapi_result = {"available": False}
        if run_serpapi and SERPAPI_KEY:
            q = claim_text or article_text or url or image_url
            if q:
                serpapi_result = serpapi_web_search(q, num=8)

        # prepare claims: extract from article text, else use the explicit
        # claim, else synthesize one from the image.
        claims_to_check = []
        if article_text:
            claims_struct = gemini_extract_claims_from_text(article_text, max_claims=3)
            if claims_struct:
                claims_to_check = [c["claim"] for c in claims_struct if c.get("claim")]
            else:
                # model returned nothing — fall back to the first paragraph
                paras = [p for p in article_text.split("\n") if p.strip()]
                if paras:
                    claims_to_check = [paras[0][:800]]
        else:
            if claim_text:
                claims_to_check = [claim_text]
            elif image_url:
                auto = None
                try:
                    auto = gemini_generate_claim_from_image(image_url)
                except Exception:
                    logger.exception("gemini image claim gen failed")
                if auto:
                    claims_to_check = [auto]
                else:
                    # No model-generated claim: build a placeholder from image metrics
                    ia = image_analysis or analyze_image_url(image_url)
                    ela = ia.get("ela", {}).get("ela_score") if ia else None
                    phash = ia.get("phash", {}).get("phash") if ia else None
                    serp_note = ""
                    if ia and ia.get("serpapi_reverse", {}).get("available"):
                        serp_note = " Reverse-image search results available."
                    fallback_claim = f"Auto-generated (unverified): Image provided ({image_url}). Content unclear.{(' ELA=' + str(ela)) if ela else ''}{(' phash=' + str(phash)) if phash else ''}{serp_note}"
                    claims_to_check = [fallback_claim]

        # deduplicate (case-insensitive), preserving first-seen order
        unique_claims = []
        seen = set()
        for c in claims_to_check:
            if not c:
                continue
            key = c.strip().lower()
            if key in seen:
                continue
            seen.add(key)
            unique_claims.append(c)
        # --- NEW: collect verdicts per claim for modal accuracy ---
        verdicts_per_claim = []

        reports = []
        for claim in unique_claims:
            # Refine the SERP query per claim; on failure reuse the broad result.
            serpapi_for_claim = serpapi_result
            if run_serpapi and SERPAPI_KEY:
                try:
                    serpapi_for_claim = serpapi_web_search(claim, num=6)
                except Exception:
                    serpapi_for_claim = serpapi_result

            hf_result = hf_zero_shot_classify(claim)
            gemini_verdict = gemini_verify_claim(claim, serpapi_for_claim, image_analysis)
            evidence_agg = aggregate_search_results(serpapi_for_claim)
            combined = combine_signals(gemini_verdict, hf_result, evidence_agg)

            # --- MEDIA AUTHENTICITY OVERRIDE ---
            # If the image is indicated as AI-generated / fabricated by model issues,
            # verdict text, or reverse image/web evidence, force final verdict to
            # False (image is fake). Keyword-based; best-effort by design.
            try:
                media_flagged_fake = False
                # 1) check gemini_verdict issues (structured)
                g_issues = (gemini_verdict or {}).get("issues", []) or []
                if isinstance(g_issues, (list, tuple)):
                    for it in g_issues:
                        try:
                            if isinstance(it, dict):
                                typ = str(it.get("type", "")).lower()
                                desc = str(it.get("description", "")).lower()
                                if "ai" in typ or "ai_generation" in typ or "ai-generation" in desc or "ai-generated" in desc or "fabricat" in desc or "deepfake" in desc or "generated image" in desc:
                                    media_flagged_fake = True
                                    break
                            elif isinstance(it, str):
                                low = it.lower()
                                if any(k in low for k in ("ai-generated", "fabricat", "fake", "deepfake", "generated image", "computer-generated")):
                                    media_flagged_fake = True
                                    break
                        except Exception:
                            continue

                # 2) check gemini_verdict overall text
                overall_text = str((gemini_verdict or {}).get("overall", "") or "").lower()
                if any(k in overall_text for k in ("ai-generated", "ai generated", "fabricat", "deepfake", "computer-generated", "generated image", "not authentic", "fake")):
                    media_flagged_fake = True

                # 3) check reverse-image search (SERP) snippets/titles for keywords
                if image_analysis and image_analysis.get("serpapi_reverse", {}).get("result"):
                    rlist = image_analysis["serpapi_reverse"]["result"].get("organic_results", []) or []
                    for r in rlist:
                        txt = (str(r.get("title", "") or "") + " " + str(r.get("snippet", "") or "")).lower()
                        if any(k in txt for k in ("ai-generated", "generated image", "computer-generated", "fake image", "deepfake", "ai image", "fabricat")):
                            media_flagged_fake = True
                            break

                # 4) optionally check aggregated evidence titles/snippets
                for e in evidence_agg.get("evidence", [])[:6]:
                    try:
                        txt = (e.get("title", "") or "") + " " + (e.get("snippet", "") or "")
                        low = txt.lower()
                        if any(k in low for k in ("ai-generated", "generated image", "computer-generated", "fake image", "deepfake", "ai image", "fabricat")):
                            media_flagged_fake = True
                            break
                    except Exception:
                        continue

                if media_flagged_fake:
                    # Apply override: force False
                    combined["final_verdict"] = "False"
                    # make confidence stronger (but bounded)
                    combined["confidence"] = max(combined.get("confidence", 0.4), 0.6)
                    # prepend a clear reason dict
                    reasons = combined.get("reasons", []) or []
                    media_reason = {
                        "type": "AI_GENERATION",
                        "description": "Media-authenticity override: image appears to be AI-generated/fabricated per model findings or trusted fact-checks."
                    }
                    # ensure not duplicate
                    try:
                        if not any(isinstance(r, dict) and r.get("type") == "AI_GENERATION" for r in reasons):
                            reasons.insert(0, media_reason)
                    except Exception:
                        reasons.insert(0, media_reason)
                    combined["reasons"] = reasons
            except Exception:
                logger.exception("Media override check failed")
            # --- NEW: append verdicts for modal accuracy ---
            # Order: [gemini verdict, HF top label, combined final verdict]
            verdicts_per_claim.append([
                verdict_to_str(gemini_verdict.get("verdict")) if gemini_verdict else "Unclear",
                verdict_to_str(hf_result.get("labels", ["Unclear"])[0]) if hf_result and isinstance(hf_result, dict) and "labels" in hf_result else "Unclear",
                verdict_to_str(combined.get("final_verdict"))
            ])

            # normalized trust score for display
            trust_pct = _trust_score_pct_from_final_score(combined.get("final_score", 0.0))

            report_entry = {
                "claim": claim,
                "context": article_text[:400] if article_text else "",
                "image_analysis": image_analysis,
                "hf_classifier": hf_result,
                "gemini_verdict": gemini_verdict,
                "serpapi_result": serpapi_for_claim,
                "evidence_agg": evidence_agg,
                "phishing_analysis": phish_report,
                "final_verdict": combined["final_verdict"],
                "confidence": combined["confidence"],
                "reasons": combined.get("reasons", []),
                "final_score": combined.get("final_score"),
                "trust_score_pct": trust_pct
            }

            # user-friendly Q&A summary per claim
            report_entry["qa_summary"] = format_user_friendly_explanation(report_entry)

            reports.append(report_entry)
        # --- NEW: compute modal accuracy if ground truth available ---
        # NOTE(review): report entries as built above never set "truth_label",
        # so modal_acc is 0.0 unless labels are injected elsewhere — confirm.
        ground_truth_labels = [verdict_to_str(r.get("truth_label")) for r in reports if r.get("truth_label")]

        if ground_truth_labels:
            modal_acc = compute_modal_accuracy(verdicts_per_claim, ground_truth_labels)
        else:
            modal_acc = 0.0

        # NOTE(review): this reassignment is unused afterwards in this method.
        if not ground_truth_labels:
            ground_truth_labels = [verdict_to_str(r["final_verdict"]) for r in reports]

        summary = {"counts": {}, "dominant_verdict": "Unclear", "modal_accuracy": modal_acc}

        # Tally verdicts and pick the most frequent as the dominant verdict.
        for r in reports:
            v = r["final_verdict"]
            summary["counts"][v] = summary["counts"].get(v, 0) + 1
        if reports:
            dominant = max(summary["counts"].items(), key=lambda x: x[1])[0]
            summary["dominant_verdict"] = dominant

        # Add phishing flag to summary if any phishing check flagged the site.
        # Use safe normalization so nested None values can't cause AttributeError.
        if phish_report:
            sb = (phish_report or {}).get("safe_browsing") or {}
            vt = (phish_report or {}).get("virustotal") or {}
            sb_safe = sb.get("safe")
            vt_safe = vt.get("safe")
            summary["phishing_flag"] = True if (sb_safe is False or vt_safe is False) else False
        else:
            summary["phishing_flag"] = False

        # NOTE(review): relies on a module-level `import time` — confirm it is
        # still present at the top of the file.
        return {
            "claims_analyzed": len(reports),
            "reports": reports,
            "summary": summary,
            "url": url,
            "timestamp": time.time()
        }
2591
-
2592
# Module-level singleton used by the analyze handlers above.
ORCH = Orchestrator()

# ------------------------
# Gradio UI
# ------------------------
title = "NewsOrchestra — Gemini multimodal verifier (upgraded)"
description = "Gemini required. Set GEMINI_API_KEY. SerpApi optional. SAFE_BROWSING_KEY/VIRUSTOTAL_KEY optional."

with gr.Blocks(title=title) as demo2:
    gr.Markdown(f"# {title}\n\n{description}")

    # Input row: text/URL/image inputs plus toggles for optional services.
    with gr.Row():
        article_text_in = gr.Textbox(lines=6, label="Article text (optional)")
        article_url_in = gr.Textbox(lines=1, label="Article URL (optional)")
        image_url_in = gr.Textbox(lines=1, label="Image URL (optional)")
        # SERP checkbox defaults to on only when an API key is configured.
        run_serp_cb = gr.Checkbox(label="Run SerpApi (requires SERPAPI_KEY)", value=bool(SERPAPI_KEY))
        track_spread_cb = gr.Checkbox(label="Track spread across web/X/Facebook/Instagram (free dork-scrape)", value=False)
        analyze_btn = gr.Button("Analyze")

    # Output row: full JSON report plus user-friendly summaries.
    with gr.Row():
        out_json = gr.JSON(label="Full Report (JSON)", visible=True)
        out_qa = gr.Textbox(label="Q&A Summary (user-friendly)", lines=12)
        out_claims = gr.JSON(label="Extracted Claims")
        out_phish = gr.JSON(label="Phishing Analysis")

    # NOTE(review): `_gradio_on_analyze` is defined elsewhere in this file
    # (presumably a wrapper around `on_analyze`) — confirm it exists.
    analyze_btn.click(
        fn=_gradio_on_analyze,
        inputs=[article_text_in, article_url_in, image_url_in, run_serp_cb, track_spread_cb],
        outputs=[out_json, out_qa, out_claims, out_phish],
        show_progress=True
    )
if __name__ == "__main__":
    # Bind to all interfaces; port is overridable via the PORT env var (default 7860).
    demo2.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
11
  from urllib.parse import urlparse
12
  from functools import lru_cache
13
  from collections import Counter
 
 
 
 
14
 
15
  import requests
16
  from bs4 import BeautifulSoup
 
68
  # ------------------------
69
  # Helpers
70
  # ------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def compute_modal_accuracy(verdicts: list, true_labels: list) -> float:
72
  """
73
  verdicts: list of lists of model outputs per claim, e.g. [["True","False"], ["True","True"], ...]
 
1231
  )
1232
 
1233
  if __name__ == "__main__":
1234
+ demo2.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))