Spaces:

mdAmin313
/

atlas

Paused

App Files Files Community

mdAmin313 committed on Aug 27, 2025

Commit

1b1b755

1 Parent(s): 681fdba

Initial commit: AI misinformation detector

Browse files

Files changed (2) hide show

.env +4 -0
app.py +253 -197

.env ADDED Viewed

	@@ -0,0 +1,4 @@

+GNEWS_API_KEY = "c41717a7b25455cd0937016c539e72d5"
+NEWSORG_API_KEY ="9067f24c056541fd937a455293d9ace3"
+OPENAI_API_KEY = ""
+GEMINI_API_KEY = "AIzaSyBmzG18sh5yMNdDGonfquo5B7-HEkMewro"

app.py CHANGED Viewed

@@ -1,58 +1,50 @@
 import os
 import re
-import requests
 from typing import List, Dict, Any, Optional
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-import gradio as gr
 from bs4 import BeautifulSoup
-# ---------------- ENV ----------------
-GNEWS_API_KEY = "c41717a7b25455cd0937016c539e72d5"
-NEWSORG_API_KEY = "9067f24c056541fd937a455293d9ace3"
-GEMINI_API_KEY = "AIzaSyBmzG18sh5yMNdDGonfquo5B7-HEkMewro"
-GEMINI_CX = "727386fd4ef37425d"
-# ---------------- Lazy-load models ----------------
-SENTE_MODEL = None
-ZS_PIPE = None
-def get_sentence_model():
-    global SENTE_MODEL
-    if SENTE_MODEL is None:
-        from sentence_transformers import SentenceTransformer
-        SENTE_MODEL = SentenceTransformer("all-mpnet-base-v2")
-    return SENTE_MODEL
-def get_zs_pipe():
-    global ZS_PIPE
-    if ZS_PIPE is None:
-        from transformers import pipeline
-        ZS_PIPE = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-    return ZS_PIPE
-# ---------------- FastAPI ----------------
-app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
 class VerifyRequest(BaseModel):
     text: str
-    mode: Optional[str] = "fast"
-# ---------------- Helpers ----------------
-TRUSTED_DOMAINS = {
-    "bbc.co.uk","bbc.com","cnn.com","nytimes.com","reuters.com","apnews.com",
-    "theguardian.com","npr.org","washingtonpost.com","wsj.com","usatoday.com",
-    "bloomberg.com","aljazeera.com","msnbc.com","cnbc.com","foxnews.com"
-}
-UNWANTED_KEYWORDS = [
-    "movie","film","trailer","episode","comic","manga","fan","fandom","imdb",
-    "review","tv series","fiction","novel","fantasy","screenplay","actor","actress"
-]
 def safe_headers():
-    return {"User-Agent": "misinfo-tool/1.0"}
 def domain_from_url(url: str) -> Optional[str]:
     if not url: return None
@@ -68,6 +60,98 @@ def domain_from_url(url: str) -> Optional[str]:
         return None
     return None
 def is_unwanted_snippet(snippet: str) -> bool:
     if not snippet: return False
     s = snippet.lower()
@@ -81,23 +165,28 @@ def filter_sources(sources: List[Dict[str,str]]) -> List[Dict[str,str]]:
         seen.add(url)
         domain = domain_from_url(url)
         s["domain"] = domain or ""
-        if domain in TRUSTED_DOMAINS:
-            kept.append(s)
-            continue
-        if domain and any(d in domain for d in ["imdb.com","youtube.com","wikipedia.org","fandom.com","comicbook.com"]):
-            continue
-        if is_unwanted_snippet(s.get("snippet","")) or is_unwanted_snippet(s.get("title","")):
-            continue
         kept.append(s)
     return kept
-def summarize_text(text: str, max_len=300) -> str:
-    sentences = re.split(r'(?<=[.!?]) +', text.strip())
-    summary = sentences[0] if sentences else text
-    if len(summary) > max_len:
-        summary = summary[:max_len].rsplit(' ',1)[0] + "..."
-    return summary
 def summarize_evidence(sources: List[Dict[str,str]], max_chars=800) -> str:
     if not sources:
         return "No credible news sources found."
@@ -112,170 +201,137 @@ def summarize_evidence(sources: List[Dict[str,str]], max_chars=800) -> str:
         return res[:max_chars].rsplit(" ",1)[0] + "..."
     return res
 def fuse_scores(fast_conf: float, deep_outcome: Optional[str], evidence_count: int) -> Dict[str,Any]:
     base = fast_conf*0.5 + min(evidence_count/5.0,1.0)*0.5
     if deep_outcome and deep_outcome.lower() in ["false","misleading"]:
         base *= 0.7
-    score = int(round(max(0,min(1,base))*100))
-    color = "green" if score>=70 else "yellow" if score>=40 else "red"
     return {"score":score, "color":color}
-# ---------------- Zero-shot classify ----------------
-def classify_text_type(text: str) -> Dict[str, Any]:
-    labels = ["news","rumor","fact","opinion","satire","unverifiable"]
-    try:
-        pipe = get_zs_pipe()
-        res = pipe(text, labels, multi_label=False, truncation=True)
-        label = res["labels"][0]
-        score = float(res["scores"][0])
-        return {"type": label, "score": round(score,3), "scores": dict(zip(res["labels"], res["scores"]))}
-    except Exception:
-        t = text.lower()
-        if any(k in t for k in ["according to","reported","breaking","news","announced"]):
-            return {"type":"news","score":0.65,"scores":{}}
-        if any(k in t for k in ["i think","in my opinion","i believe","should"]):
-            return {"type":"opinion","score":0.7,"scores":{}}
-        if any(k in t for k in ["joke","satire","not real","parody"]):
-            return {"type":"satire","score":0.7,"scores":{}}
-        if any(k in t for k in ["study shows","research","published","peer-reviewed"]):
-            return {"type":"fact","score":0.6,"scores":{}}
-        return {"type":"rumor","score":0.45,"scores":{}}
-# ---------------- Search functions ----------------
-def fetch_gnews(query: str, max_results=6) -> List[Dict[str,str]]:
-    if not GNEWS_API_KEY: return []
-    try:
-        url = "https://gnews.io/api/v4/search"
-        params = {"q": query, "token": GNEWS_API_KEY, "max": max_results, "lang":"en"}
-        r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
-        r.raise_for_status()
-        js = r.json()
-        return [{"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description")} for a in js.get("articles", [])[:max_results]]
-    except: return []
-def fetch_newsapi(query: str, max_results=6) -> List[Dict[str,str]]:
-    if not NEWSORG_API_KEY: return []
     try:
-        url = "https://newsapi.org/v2/everything"
-        params = {"q": query, "pageSize": max_results, "apiKey": NEWSORG_API_KEY, "language":"en"}
         r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
         r.raise_for_status()
         js = r.json()
-        return [{"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description")} for a in js.get("articles", [])[:max_results]]
-    except: return []
-def google_dork_search(query: str, max_results=6) -> List[Dict[str,str]]:
-    """Uses Google Custom Search API (Gemini)"""
-    if not GEMINI_API_KEY or not GEMINI_CX: return []
-    try:
-        url = "https://www.googleapis.com/customsearch/v1"
-        params = {"key": GEMINI_API_KEY, "cx": GEMINI_CX, "q": query, "num": max_results}
-        r = requests.get(url, params=params, timeout=6)
-        r.raise_for_status()
-        js = r.json()
-        items = js.get("items", [])
-        return [{"title": i.get("title"), "url": i.get("link"), "snippet": i.get("snippet"), "source": None} for i in items]
-    except: return []
-def duckduckgo_search(query: str, max_results=8) -> List[Dict[str,str]]:
-    try:
-        url = "https://html.duckduckgo.com/html/"
-        r = requests.post(url, data={"q": query}, headers=safe_headers(), timeout=6)
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, "html.parser")
         results = []
-        for res in soup.select(".result__a")[:max_results]:
-            title = res.get_text()
-            href = res.get("href")
-            snippet_node = res.find_parent().select_one(".result__snippet")
-            snippet = snippet_node.get_text() if snippet_node else ""
-            results.append({"title": title, "url": href, "source":None, "snippet": snippet})
-        return results
-    except: return []
-# ---------------- Main verification ----------------
-def verify_claim(text: str, mode: str="fast") -> Dict[str, Any]:
-    claim = text.strip()
     text_type_res = classify_text_type(claim)
     user_summary = summarize_text(claim)
-    # Step 1: Search all sources
-    all_raw = fetch_gnews(user_summary) + fetch_newsapi(user_summary) + google_dork_search(user_summary) + duckduckgo_search(user_summary)
     filtered = filter_sources(all_raw)
     evidence_summary = summarize_evidence(filtered)
-    # Step 2: Fast classification
-    fast_label, fast_conf = "Unverifiable", 0.4
-    try:
-        pipe = get_zs_pipe()
-        cls = pipe(claim, ["True","False","Misleading","Unverifiable"], multi_label=False, truncation=True)
-        fast_label = cls["labels"][0]
-        fast_conf = float(cls["scores"][0])
-    except: pass
-    # Step 3: Deep reasoning (placeholder)
-# Step 3: Deep reasoning
-deep_result = None
-if mode.lower() in ["deep","hybrid"]:
-    if GEMINI_CLIENT:
         try:
-            prompt = f"""
-            Verify the following claim: "{claim}".
-            Provide a JSON object with keys:
-            outcome (True/False/Unverifiable),
-            explanation,
-            comparison (list of claim-evidence pairs),
-            takeaways (list of advice).
-            """
-            response = GEMINI_CLIENT.models.generate_content(
-                model="gemini-2.5-flash",
-                contents=prompt
-            )
-            # Parse Gemini response as JSON
-            import json
-            deep_result = json.loads(response.text)
-        except Exception as e:
-            deep_result = {
-                "outcome":"Unverifiable",
-                "explanation": f"Gemini API error: {str(e)}",
-                "takeaways":["Search credible sources","Cross-check claims"]
-            }
-    else:
-        deep_result = {
-            "outcome":"Unverifiable",
-            "explanation":"Demo mode: Deep reasoning not configured (API key missing).",
-            "takeaways":["Search credible sources","Cross-check claims","Beware sensational headlines"]
-        }
-    # Step 4: Fuse score
     deep_outcome = deep_result.get("outcome") if deep_result else None
     fuse = fuse_scores(fast_conf, deep_outcome, len(filtered))
     return {
-        "Claim": claim,
-        "Text type": text_type_res["type"],
-        "Text type scores": text_type_res.get("scores", {}),
-        "User summary": user_summary,
-        "Fast classification": f"{fast_label} ({fast_conf:.2f})",
-        "Evidence count raw": len(all_raw),
-        "Evidence count filtered": len(filtered),
-        "Evidence summary": evidence_summary,
-        "Deep result": deep_result or "N/A",
-        "Credibility": fuse
     }
-# ---------------- FastAPI endpoint ----------------
-@app.post("/verify")
-async def verify_endpoint(payload: VerifyRequest):
-    return verify_claim(payload.text, payload.mode)
-# ---------------- Gradio UI ----------------
-iface = gr.Interface(
-    fn=verify_claim,
-    inputs=[gr.Textbox(label="Claim", lines=4), gr.Dropdown(["fast","deep","hybrid"], label="Mode")],
-    outputs=gr.JSON(label="Result"),
-    title="Hybrid Misinformation Detector"
-)
-# Mount Gradio inside FastAPI
-app = gr.mount_gradio_app(app, iface, path="/")  # UI at root

 import os
+import json
 import re
 from typing import List, Dict, Any, Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
+from dotenv import load_dotenv
+import requests
 from bs4 import BeautifulSoup
+# ---------------- Configuration ----------------
+# Load .env FIRST. The Gemini client below reads its API key from the process
+# environment when it is constructed, so calling load_dotenv() after it leaves
+# a key that exists only in .env invisible and GEMINI_CLIENT silently None.
+load_dotenv()
+GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")
+NEWSORG_API_KEY = os.getenv("NEWSORG_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+# NLP / AI
+# Each optional model falls back to None when its package or weights cannot be
+# loaded; classify_text_type, semantic_filter and verify all test for None
+# before using them, so the service still starts in a degraded mode.
+try:
+    from sentence_transformers import SentenceTransformer, util
+    SENTE_MODEL = SentenceTransformer("all-mpnet-base-v2")
+except Exception:
+    SENTE_MODEL = None
+try:
+    from transformers import pipeline
+    ZS_PIPE = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+except Exception:
+    ZS_PIPE = None
+# Gemini AI
+try:
+    from google import genai
+    GEMINI_CLIENT = genai.Client()  # uses GEMINI_API_KEY from environment
+except Exception:
+    GEMINI_CLIENT = None
+app = FastAPI(title="Hybrid Misinformation Detector")
+# ---------------- Models ----------------
 class VerifyRequest(BaseModel):
+    """Request body accepted by POST /verify."""
+    # The claim to check; the endpoint strips it and rejects an empty result with HTTP 400.
     text: str
+    # The endpoint lowercases this value; only "deep" and "hybrid" enable the Gemini step.
+    mode: Optional[str] = "fast"  # fast, deep, hybrid
+# ---------------- Utilities ----------------
 def safe_headers():
+    """Return the header dict (a fixed User-Agent) passed to the outbound requests calls."""
+    return {"User-Agent": "misinfo-tool/1.0 (+https://example.com)"}
 def domain_from_url(url: str) -> Optional[str]:
     if not url: return None
         return None
     return None
+# ---------------- Trusted / Blacklist ----------------
+# Allow-list: filter_sources drops every result whose domain is not in this set.
+TRUSTED_DOMAINS = {
+    # general news
+    "bbc.co.uk", "bbc.com", "cnn.com", "nytimes.com",
+    "reuters.com", "apnews.com", "theguardian.com", "npr.org",
+    "washingtonpost.com", "wsj.com", "usatoday.com", "bloomberg.com",
+    "aljazeera.com", "msnbc.com", "cnbc.com", "foxnews.com",
+    # science
+    "scientificamerican.com", "nature.com", "sciencedaily.com",
+}
+# Deny-list: filter_sources skips results from these domains outright.
+BLACKLISTED_DOMAINS = {
+    "imdb.com", "youtube.com", "wikipedia.org", "fandom.com", "comicbook.com",
+    "rottentomatoes.com", "hulu.com", "netflix.com", "ign.com", "forbes.com",
+}
+# Entertainment/fiction cue words; presumably matched by is_unwanted_snippet
+# (its body is not visible in this diff) - verify against that helper.
+UNWANTED_KEYWORDS = [
+    "movie", "film", "episode", "tv show",
+    "trailer", "comic", "manga", "fan",
+    "fandom", "review", "fiction", "novel",
+    "fantasy", "screenplay", "actor", "actress",
+]
+# ---------------- NLP classify ----------------
+def classify_text_type(text: str) -> Dict[str, Any]:
+    """Label *text* as news, rumor, fact, opinion, satire or unverifiable.
+
+    Uses the zero-shot pipeline when it is loaded; if it is missing or raises,
+    falls back to keyword cues checked in a fixed order, defaulting to "rumor".
+    Returns {"type", "score", "scores"}; "scores" is empty on the keyword path.
+    """
+    candidate_labels = ["news","rumor","fact","opinion","satire","unverifiable"]
+    if ZS_PIPE:
+        try:
+            prediction = ZS_PIPE(text, candidate_labels, multi_label=False, truncation=True)
+            ranked_labels = prediction["labels"]
+            ranked_scores = prediction["scores"]
+            return {
+                "type": ranked_labels[0],
+                "score": round(float(ranked_scores[0]), 3),
+                "scores": dict(zip(ranked_labels, ranked_scores)),
+            }
+        except Exception:
+            pass
+    lowered = text.lower()
+    # (type, score, cue phrases) - order matters, the first matching rule wins.
+    keyword_rules = (
+        ("news", 0.65, ("according to","reported","breaking","news","announced")),
+        ("opinion", 0.7, ("i think","in my opinion","i believe","should")),
+        ("satire", 0.7, ("joke","satire","not real","parody")),
+        ("fact", 0.6, ("study shows","research","published","peer-reviewed")),
+    )
+    for kind, confidence, cues in keyword_rules:
+        for cue in cues:
+            if cue in lowered:
+                return {"type": kind, "score": confidence, "scores": {}}
+    return {"type":"rumor","score":0.45,"scores":{}}
+def summarize_text(text: str, max_len=300) -> str:
+    """Return the first sentence of *text*, clipped at a word boundary when too long.
+
+    A sentence ends at '.', '!' or '?' followed by one or more spaces. If the
+    first sentence exceeds *max_len* characters it is cut to *max_len*, the
+    trailing partial word is dropped, and "..." is appended.
+    """
+    first_sentence = re.split(r'(?<=[.!?]) +', text.strip(), maxsplit=1)[0]
+    if len(first_sentence) <= max_len:
+        return first_sentence
+    clipped = first_sentence[:max_len]
+    return clipped.rsplit(' ', 1)[0] + "..."
+# ---------------- Search ----------------
+def fetch_gnews(query: str, max_results=6) -> List[Dict[str,str]]:
+    """Search GNews for *query* and return up to *max_results* articles.
+
+    Each hit is a dict with "title", "url", "source" and "snippet". Returns an
+    empty list when GNEWS_API_KEY is unset or when the request or parsing fails.
+    """
+    if not GNEWS_API_KEY:
+        return []
+    try:
+        response = requests.get(
+            "https://gnews.io/api/v4/search",
+            params={"q": query, "token": GNEWS_API_KEY, "max": max_results, "lang":"en"},
+            headers=safe_headers(),
+            timeout=6,
+        )
+        response.raise_for_status()
+        articles = response.json().get("articles", [])[:max_results]
+        hits = []
+        for article in articles:
+            hits.append({
+                "title": article.get("title"),
+                "url": article.get("url"),
+                "source": article.get("source",{}).get("name"),
+                "snippet": article.get("description"),
+            })
+        return hits
+    except Exception:
+        return []
+def fetch_newsapi(query: str, max_results=6) -> List[Dict[str,str]]:
+    """Search NewsAPI's /v2/everything endpoint for *query*.
+
+    Returns up to *max_results* dicts with "title", "url", "source" and
+    "snippet". Returns an empty list when NEWSORG_API_KEY is unset or when the
+    request or parsing fails.
+    """
+    if not NEWSORG_API_KEY:
+        return []
+    try:
+        response = requests.get(
+            "https://newsapi.org/v2/everything",
+            params={"q": query, "pageSize": max_results, "apiKey": NEWSORG_API_KEY, "language":"en"},
+            headers=safe_headers(),
+            timeout=6,
+        )
+        response.raise_for_status()
+        articles = response.json().get("articles", [])[:max_results]
+        hits = []
+        for article in articles:
+            hits.append({
+                "title": article.get("title"),
+                "url": article.get("url"),
+                "source": article.get("source",{}).get("name"),
+                "snippet": article.get("description"),
+            })
+        return hits
+    except Exception:
+        return []
+def _resolve_ddg_href(href: Optional[str]) -> Optional[str]:
+    """Return the destination behind a DuckDuckGo redirect link, or *href* unchanged."""
+    from urllib.parse import parse_qs, urlparse  # local import: keeps this block self-contained
+    if not href:
+        return href
+    parsed = urlparse(href)
+    if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
+        # parse_qs percent-decodes the value, yielding the real article URL.
+        target = parse_qs(parsed.query).get("uddg")
+        if target:
+            return target[0]
+    return href
+def duckduckgo_search(query: str, max_results=8) -> List[Dict[str,str]]:
+    """Scrape DuckDuckGo's HTML endpoint for *query*.
+
+    Returns up to *max_results* dicts with "title", "url", "source" (always
+    None) and "snippet". Returns an empty list if the request or parsing fails.
+    """
+    try:
+        url = "https://html.duckduckgo.com/html/"
+        r = requests.post(url, data={"q": query}, headers=safe_headers(), timeout=6)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, "html.parser")
+        results = []
+        for anchor in soup.select(".result__a")[:max_results]:
+            # Result links are typically "//duckduckgo.com/l/?uddg=<encoded url>" redirects.
+            # Left wrapped, every hit resolves to the duckduckgo.com domain and is
+            # rejected by the trusted-domain filter downstream.
+            href = _resolve_ddg_href(anchor.get("href"))
+            # NOTE(review): the snippet appears to sit beside the title heading inside
+            # the enclosing ".result" container, not inside the link's immediate
+            # parent - confirm against live markup. Falls back to the old lookup.
+            container = anchor.find_parent(class_="result") or anchor.find_parent()
+            snippet_node = container.select_one(".result__snippet") if container else None
+            snippet = snippet_node.get_text() if snippet_node else ""
+            results.append({"title": anchor.get_text(), "url": href, "source":None, "snippet": snippet})
+        return results
+    except Exception:
+        return []
+# ---------------- Filtering ----------------
 def is_unwanted_snippet(snippet: str) -> bool:
     if not snippet: return False
     s = snippet.lower()
         seen.add(url)
         domain = domain_from_url(url)
         s["domain"] = domain or ""
+        if not domain: continue
+        if domain in BLACKLISTED_DOMAINS: continue
+        if domain not in TRUSTED_DOMAINS: continue
+        if is_unwanted_snippet(s.get("snippet","")) or is_unwanted_snippet(s.get("title","")): continue
         kept.append(s)
     return kept
+def semantic_filter(claim: str, sources: List[Dict[str,str]], threshold=0.3):
+    """Keep only the sources whose snippet is semantically close to *claim*.
+
+    Returns *sources* unchanged when the sentence-embedding model is not
+    loaded. Otherwise a source is kept when the cosine similarity between the
+    claim embedding and its snippet embedding is at least *threshold*; sources
+    with no snippet are dropped.
+    """
+    if not SENTE_MODEL:
+        return sources
+    claim_vec = SENTE_MODEL.encode(claim, convert_to_tensor=True)
+    def _similarity(snippet_text):
+        snippet_vec = SENTE_MODEL.encode(snippet_text, convert_to_tensor=True)
+        return util.cos_sim(claim_vec, snippet_vec).item()
+    return [
+        src for src in sources
+        if src.get("snippet","") and _similarity(src.get("snippet","")) >= threshold
+    ]
+# ---------------- Evidence summary ----------------
 def summarize_evidence(sources: List[Dict[str,str]], max_chars=800) -> str:
     if not sources:
         return "No credible news sources found."
         return res[:max_chars].rsplit(" ",1)[0] + "..."
     return res
+# ---------------- Fusion ----------------
 def fuse_scores(fast_conf: float, deep_outcome: Optional[str], evidence_count: int) -> Dict[str,Any]:
+    """Blend classifier confidence and evidence volume into a credibility score.
+
+    Returns {"score": int 0-100, "color": "green" | "yellow" | "red"}, where
+    green means score >= 70, yellow means score >= 40, and red is anything lower.
+    """
+    # Equal weights: half from fast_conf, half from evidence_count, which saturates at 5 sources.
     base = fast_conf*0.5 + min(evidence_count/5.0,1.0)*0.5
     if deep_outcome and deep_outcome.lower() in ["false","misleading"]:
+        # A "false"/"misleading" deep verdict scales the blended value down by 30%.
         base *= 0.7
+    # Clamp to [0, 1] before converting to an integer percentage.
+    score = int(round(max(0, min(1, base)) * 100))
+    color = "green" if score >= 70 else "yellow" if score >= 40 else "red"
     return {"score":score, "color":color}
+# ---------------- Fact Check API (placeholder) ----------------
+def factcheck_claim(claim: str) -> Dict[str, Any]:
+    """
+    Query the Google Fact Check Tools API (claims:search) for reviews of *claim*.
+
+    The API key is read from the environment at call time: FACTCHECK_API_KEY if
+    set, otherwise GEMINI_API_KEY. It is never embedded in source.
+
+    Returns {"outcome": <first review's textual rating, or "Unverified">,
+    "source": [one dict per review]}. On a missing key or any request/parsing
+    failure returns {"outcome": "Error", "source": [], "error": <message>}.
+    """
+    # A key committed to the repository is effectively public; keep it in the environment.
+    api_key = os.getenv("FACTCHECK_API_KEY") or os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        return {"outcome": "Error", "source": [], "error": "Fact Check API key is not configured"}
+    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
+    params = {"query": claim, "key": api_key, "languageCode": "en", "pageSize": 5}
     try:
         r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
         r.raise_for_status()
         js = r.json()
+        claims = js.get("claims", [])
         results = []
+        for c in claims:
+            claimant = c.get("claimant", "Unknown")
+            text = c.get("text", "")
+            for review in c.get("claimReview", []):
+                results.append({
+                    "claimant": claimant,
+                    "text": text,
+                    "publisher": review.get("publisher", {}).get("name"),
+                    "url": review.get("url"),
+                    "title": review.get("title"),
+                    "rating": review.get("textualRating")
+                })
+        # A review that carries no rating must not surface as a None outcome.
+        outcome = (results[0].get("rating") if results else None) or "Unverified"
+        return {"outcome": outcome, "source": results}
+    except Exception as e:
+        # requests error messages can embed the full request URL, including "key=...".
+        # This dict is returned to the API client, so redact the key first.
+        return {"outcome": "Error", "source": [], "error": str(e).replace(api_key, "***")}
+# ---------------- API ----------------
+def _parse_gemini_json(raw: str) -> Dict[str, Any]:
+    """Parse the model reply as a JSON object, tolerating a Markdown code fence around it."""
+    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", (raw or "").strip())
+    parsed = json.loads(cleaned)
+    if not isinstance(parsed, dict):
+        raise ValueError("expected a JSON object")
+    outcome = parsed.get("outcome")
+    if outcome is not None and not isinstance(outcome, str):
+        # fuse_scores calls .lower() on the outcome; a JSON true/false would crash it.
+        parsed["outcome"] = str(outcome)
+    return parsed
+@app.post("/verify")
+def verify(req: VerifyRequest):
+    """Run the verification pipeline for one claim and return the combined report.
+
+    Steps: classify the text type, summarize, search news sources, filter the
+    evidence, run a zero-shot verdict, optionally ask Gemini (mode "deep" or
+    "hybrid"), query the fact-check API, then fuse everything into a score.
+    Raises HTTPException(400) when the claim is empty after stripping.
+
+    Declared with plain `def` on purpose: every step is blocking (requests
+    calls, model inference), so FastAPI should run it in its worker threadpool
+    rather than on the event loop.
+    """
+    claim = (req.text or "").strip()
+    mode = (req.mode or "fast").lower()
+    if not claim:
+        raise HTTPException(status_code=400, detail="Empty claim")
+    # Step 1 classify
     text_type_res = classify_text_type(claim)
+    stored_type = text_type_res["type"]
+    # Step 2 summarize
     user_summary = summarize_text(claim)
+    # Step 3 search
+    # `site:` is a web-search operator, so only the DuckDuckGo query carries it.
+    # NOTE(review): GNews and NewsAPI are sent the plain summary on the assumption
+    # that they treat `site:` as literal text or a syntax error - confirm.
+    site_query = f"{user_summary} site:bbc.com OR site:cnn.com OR site:reuters.com OR site:apnews.com"
+    all_raw = fetch_gnews(user_summary) + fetch_newsapi(user_summary) + duckduckgo_search(site_query)
+    # Step 4 filter
     filtered = filter_sources(all_raw)
+    filtered = semantic_filter(claim, filtered)
     evidence_summary = summarize_evidence(filtered)
+    # Step 5 fast guess
+    fast_label = "Unverifiable"
+    fast_conf = 0.4
+    if ZS_PIPE:
         try:
+            cls = ZS_PIPE(claim, ["True","False","Misleading","Unverifiable"], multi_label=False, truncation=True)
+            fast_label = cls["labels"][0]
+            fast_conf = float(cls["scores"][0])
+        except Exception:
+            pass
+    # Step 6 deep (Gemini)
+    deep_result = None
+    if mode in ["deep","hybrid"]:
+        if GEMINI_CLIENT:
+            try:
+                prompt = f"""
+                Verify the following claim: "{claim}".
+                Provide JSON with keys: outcome, explanation, comparison (list), takeaways (list).
+                """
+                response = GEMINI_CLIENT.models.generate_content(
+                    model="gemini-2.5-flash",
+                    contents=prompt
+                )
+                # A fenced or non-object reply is handled here instead of crashing later.
+                deep_result = _parse_gemini_json(response.text)
+            except Exception as e:
+                deep_result = {"outcome":"Unverifiable","explanation":f"Gemini API error: {str(e)}","takeaways":["Check credible sources"]}
+        else:
+            deep_result = {"outcome":"Unverifiable","explanation":"Demo mode: API missing","takeaways":["Check credible sources"]}
+    # Step 7 fact-check API
+    factcheck = factcheck_claim(claim)
+    # Step 8 fusion
+    # NOTE(review): fast_conf is the confidence of the top label whichever label
+    # that is, so a confident "False" raises the credibility score - confirm intent.
     deep_outcome = deep_result.get("outcome") if deep_result else None
     fuse = fuse_scores(fast_conf, deep_outcome, len(filtered))
     return {
+        "claim": claim,
+        "text_type": stored_type,
+        "text_type_scores": text_type_res.get("scores", {}),
+        "user_summary": user_summary,
+        "fast": {"label": fast_label, "confidence": round(fast_conf,3)},
+        "evidence_count_raw": len(all_raw),
+        "evidence_count_filtered": len(filtered),
+        "evidence": filtered,
+        "evidence_summary": evidence_summary,
+        "deep": deep_result or {},
+        "factcheck": factcheck,
+        "credibility": fuse
     }
+# ---------------- Frontend ----------------
+@app.get("/", response_class=HTMLResponse)
+def root():
+    """Serve the frontend page stored at static/index.html next to this module."""
+    # Anchor the path to this file, not the process working directory, so the
+    # page is found regardless of where the server is launched from.
+    index_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static", "index.html")
+    with open(index_path, "r", encoding="utf-8") as f:
+        return f.read()
+if __name__ == "__main__":
+    import uvicorn
+    # Auto-reload is a development aid and should not run on a server bound to
+    # 0.0.0.0. It is off by default; set UVICORN_RELOAD=1 to opt in locally.
+    dev_reload = os.getenv("UVICORN_RELOAD", "").lower() in ("1", "true", "yes")
+    uvicorn.run("app:app", host="0.0.0.0", port=int(os.getenv("PORT","8000")), reload=dev_reload)