Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import pipeline
|
| 3 |
-
import feedparser, requests, re,
|
| 4 |
|
| 5 |
# ---------------------------
|
| 6 |
# Load Models
|
|
@@ -31,46 +31,25 @@ RSS_FEEDS = {
|
|
| 31 |
|
| 32 |
GOOGLE_API_KEY = "AIzaSyAC56onKwR17zd_djUPEfGXQACy9qRjDxw"
|
| 33 |
GOOGLE_CX = "87391aed073954cae"
|
| 34 |
-
GOOGLE_DAILY_LIMIT = 100
|
| 35 |
-
QUOTA_FILE = "google_quota.json"
|
| 36 |
-
|
| 37 |
-
# ---------------------------
|
| 38 |
-
# Quota Management (UTC Reset)
|
| 39 |
-
# ---------------------------
|
| 40 |
-
def load_quota():
|
| 41 |
-
if os.path.exists(QUOTA_FILE):
|
| 42 |
-
with open(QUOTA_FILE, "r") as f:
|
| 43 |
-
data = json.load(f)
|
| 44 |
-
reset_time = datetime.datetime.fromisoformat(data["reset"])
|
| 45 |
-
if datetime.datetime.utcnow() > reset_time:
|
| 46 |
-
return {"count": 0, "reset": (datetime.datetime.utcnow() + datetime.timedelta(days=1)).isoformat()}
|
| 47 |
-
return data
|
| 48 |
-
return {"count": 0, "reset": (datetime.datetime.utcnow() + datetime.timedelta(days=1)).isoformat()}
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def can_use_google():
|
| 55 |
-
quota = load_quota()
|
| 56 |
-
return quota["count"] < GOOGLE_DAILY_LIMIT
|
| 57 |
-
|
| 58 |
-
def increment_google_quota():
|
| 59 |
-
quota = load_quota()
|
| 60 |
-
quota["count"] += 1
|
| 61 |
-
save_quota(quota)
|
| 62 |
|
| 63 |
# ---------------------------
|
| 64 |
# Claim Extraction
|
| 65 |
# ---------------------------
|
| 66 |
def extract_claims(page_text):
|
| 67 |
-
"""Extract top 10 factual claims from page text."""
|
| 68 |
-
|
|
|
|
|
|
|
| 69 |
results = []
|
| 70 |
for s in sentences:
|
| 71 |
out = claim_classifier(s, claim_labels)
|
| 72 |
if out["labels"][0] == "factual claim":
|
| 73 |
results.append(s)
|
|
|
|
| 74 |
return results[:10]
|
| 75 |
|
| 76 |
# ---------------------------
|
|
@@ -83,29 +62,36 @@ def detect_ai(texts):
|
|
| 83 |
results = []
|
| 84 |
for t in texts:
|
| 85 |
out = ai_detector(t)
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
return results
|
| 88 |
|
| 89 |
# ---------------------------
|
| 90 |
# Evidence Gathering
|
| 91 |
# ---------------------------
|
| 92 |
-
def
|
| 93 |
-
|
| 94 |
-
results = []
|
| 95 |
for source, url in RSS_FEEDS.items():
|
| 96 |
try:
|
| 97 |
feed = feedparser.parse(url)
|
| 98 |
-
for entry in feed.entries[:
|
| 99 |
-
|
| 100 |
-
results.append(f"[{source}] {entry.title}: {entry.summary}")
|
| 101 |
-
if len(results) >= 2:
|
| 102 |
-
return results
|
| 103 |
except Exception:
|
| 104 |
continue
|
| 105 |
-
return
|
| 106 |
|
| 107 |
def fetch_wikipedia(claim):
|
| 108 |
-
"""Fetch Wikipedia summary (max 2 results)."""
|
| 109 |
try:
|
| 110 |
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(claim)}"
|
| 111 |
r = requests.get(url).json()
|
|
@@ -115,16 +101,22 @@ def fetch_wikipedia(claim):
|
|
| 115 |
return []
|
| 116 |
return []
|
| 117 |
|
| 118 |
-
def
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
return ["[Google] Daily quota reached (100 queries)."]
|
|
|
|
| 122 |
try:
|
| 123 |
url = f"https://www.googleapis.com/customsearch/v1?q={requests.utils.quote(claim)}&key={GOOGLE_API_KEY}&cx={GOOGLE_CX}"
|
| 124 |
r = requests.get(url).json()
|
| 125 |
-
|
| 126 |
items = r.get("items", [])
|
| 127 |
-
return [f"[Google] {item['title']}: {item['snippet']}" for item in items[:
|
| 128 |
except Exception:
|
| 129 |
return []
|
| 130 |
|
|
@@ -135,19 +127,16 @@ def fact_check(claims, evidence_texts, threshold=0.7):
|
|
| 135 |
results = []
|
| 136 |
for c in claims:
|
| 137 |
for ev in evidence_texts:
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
})
|
| 149 |
-
except Exception:
|
| 150 |
-
continue
|
| 151 |
return results
|
| 152 |
|
| 153 |
# ---------------------------
|
|
@@ -157,32 +146,32 @@ def predict(page_text=""):
|
|
| 157 |
"""
|
| 158 |
1. Extract claims from page_text
|
| 159 |
2. Run AI Detection
|
| 160 |
-
3. Gather evidence (
|
| 161 |
-
4. Fact-check claims against evidence
|
| 162 |
"""
|
|
|
|
| 163 |
claims = extract_claims(page_text) if page_text else []
|
|
|
|
|
|
|
| 164 |
ai_results = detect_ai(claims) if claims else []
|
| 165 |
|
| 166 |
-
|
|
|
|
| 167 |
for c in claims:
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
evidence_map[c].extend(fetch_rss(c))
|
| 172 |
-
|
| 173 |
-
evidence_texts = [ev for evs in evidence_map.values() for ev in evs]
|
| 174 |
-
|
| 175 |
-
fc_results = fact_check(claims, evidence_texts[:20]) if claims and evidence_texts else []
|
| 176 |
|
| 177 |
-
|
|
|
|
| 178 |
|
| 179 |
return {
|
| 180 |
"claims": claims,
|
| 181 |
"ai_detection": ai_results,
|
| 182 |
-
"google_quota_used":
|
| 183 |
-
"google_quota_reset":
|
| 184 |
-
"evidence_samples": {c:
|
| 185 |
-
"fact_checking": fc_results[:
|
| 186 |
}
|
| 187 |
|
| 188 |
# ---------------------------
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import pipeline
|
| 3 |
+
import feedparser, requests, re, time, datetime
|
| 4 |
|
| 5 |
# ---------------------------
|
| 6 |
# Load Models
|
|
|
|
| 31 |
|
| 32 |
GOOGLE_API_KEY = "AIzaSyAC56onKwR17zd_djUPEfGXQACy9qRjDxw"
|
| 33 |
GOOGLE_CX = "87391aed073954cae"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
# Google search quota tracking
|
| 36 |
+
google_quota = {"count": 0, "date": datetime.date.today()}
|
| 37 |
+
GOOGLE_DAILY_LIMIT = 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# ---------------------------
|
| 40 |
# Claim Extraction
|
| 41 |
# ---------------------------
|
| 42 |
def extract_claims(page_text):
    """Return up to 10 sentences from *page_text* that the zero-shot
    classifier labels as factual claims.

    The text is split on '.', ',' and ';' (the original design choice);
    fragments of 5 characters or fewer are discarded as noise.
    """
    chunks = re.split(r"[.,;]", page_text)
    sentences = [s.strip() for s in chunks if len(s.strip()) > 5]

    results = []
    for s in sentences:
        out = claim_classifier(s, claim_labels)
        if out["labels"][0] == "factual claim":
            results.append(s)
            # Classification is the expensive step: stop as soon as the
            # 10-claim cap is reached instead of classifying every
            # remaining sentence only to slice the list afterwards.
            if len(results) == 10:
                break
    return results
|
| 54 |
|
| 55 |
# ---------------------------
|
|
|
|
| 62 |
results = []
|
| 63 |
for t in texts:
|
| 64 |
out = ai_detector(t)
|
| 65 |
+
raw_label = out[0]["label"]
|
| 66 |
+
|
| 67 |
+
# Map labels to friendlier ones
|
| 68 |
+
if raw_label.lower() in ["fake", "ai-generated"]:
|
| 69 |
+
label = "AI-generated"
|
| 70 |
+
else:
|
| 71 |
+
label = "Human"
|
| 72 |
+
|
| 73 |
+
results.append({
|
| 74 |
+
"text": t,
|
| 75 |
+
"label": label,
|
| 76 |
+
"score": round(out[0]["score"], 3)
|
| 77 |
+
})
|
| 78 |
return results
|
| 79 |
|
| 80 |
# ---------------------------
|
| 81 |
# Evidence Gathering
|
| 82 |
# ---------------------------
|
| 83 |
+
def fetch_rss_articles():
    """Collect the 5 most recent entries from every configured RSS feed.

    Each entry is rendered as "[source] title: summary". Feeds that fail
    to parse are skipped silently (best-effort gathering).
    """
    articles = []
    for source, url in RSS_FEEDS.items():
        try:
            feed = feedparser.parse(url)
            articles.extend(
                f"[{source}] {entry.title}: {entry.summary}"
                for entry in feed.entries[:5]
            )
        except Exception:
            continue
    return articles
|
| 93 |
|
| 94 |
def fetch_wikipedia(claim):
|
|
|
|
| 95 |
try:
|
| 96 |
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(claim)}"
|
| 97 |
r = requests.get(url).json()
|
|
|
|
| 101 |
return []
|
| 102 |
return []
|
| 103 |
|
| 104 |
+
def fetch_google_search(claim):
    """Query Google Custom Search for *claim* and return up to 3 formatted
    "[Google] title: snippet" strings.

    Enforces a GOOGLE_DAILY_LIMIT query budget tracked in the module-level
    ``google_quota`` dict, reset on the first call of each new day.
    Returns a quota notice when the budget is exhausted and [] on any
    request/parse failure.

    NOTE(security): GOOGLE_API_KEY / GOOGLE_CX are committed in source —
    the key should be rotated and loaded from an environment variable.
    """
    global google_quota
    today = datetime.date.today()
    # Reset the counter when the stored date rolls over to a new day.
    if google_quota["date"] != today:
        google_quota = {"count": 0, "date": today}

    if google_quota["count"] >= GOOGLE_DAILY_LIMIT:
        return ["[Google] Daily quota reached (100 queries)."]

    try:
        url = f"https://www.googleapis.com/customsearch/v1?q={requests.utils.quote(claim)}&key={GOOGLE_API_KEY}&cx={GOOGLE_CX}"
        # timeout added: an unbounded requests.get can hang the whole app
        # if the API stalls (no timeout existed before).
        r = requests.get(url, timeout=10).json()
        google_quota["count"] += 1
        items = r.get("items", [])
        return [f"[Google] {item['title']}: {item['snippet']}" for item in items[:3]]
    except Exception:
        return []
|
| 122 |
|
|
|
|
| 127 |
def fact_check(claims, evidence_texts, threshold=0.7):
    """Score every (claim, evidence) pair with the NLI pipeline and keep
    pairs whose confidence is at least *threshold*.

    Returns a list of {"claim", "evidence", "label", "score"} dicts.
    A failure on a single pair (model error, malformed text) skips that
    pair instead of aborting the entire run — the previous revision had
    this per-pair tolerance and its removal made one bad pair fatal.
    """
    results = []
    for c in claims:
        for ev in evidence_texts:
            try:
                out = nli_pipeline(hypothesis=c, premise=ev)
                label = out[0]["label"]
                score = round(out[0]["score"], 3)
                if score >= threshold:
                    results.append({
                        "claim": c,
                        "evidence": ev,
                        "label": label,
                        "score": score,
                    })
            except Exception:
                continue
    return results
|
| 141 |
|
| 142 |
# ---------------------------
|
|
|
|
| 146 |
def predict(page_text=""):
    """
    1. Extract claims from page_text
    2. Run AI Detection
    3. Gather evidence (RSS + Wikipedia + Google, with quota)
    4. Fact-check claims against evidence (only strong matches kept)
    """
    # Step 1: Extract claims
    claims = extract_claims(page_text) if page_text else []

    # Step 2: AI detection
    ai_results = detect_ai(claims) if claims else []

    # Step 3: Evidence gathering
    evidence_texts = []
    for c in claims:
        evidence_texts.extend(fetch_wikipedia(c))
        evidence_texts.extend(fetch_google_search(c))
    # RSS headlines are claim-independent, so fetch them once rather than
    # re-parsing every feed inside the per-claim loop.
    if claims:
        evidence_texts.extend(fetch_rss_articles())

    # Step 4: Fact-checking (evidence capped at 15 items to bound NLI cost)
    fc_results = fact_check(claims, evidence_texts[:15]) if claims and evidence_texts else []

    return {
        "claims": claims,
        "ai_detection": ai_results,
        "google_quota_used": google_quota["count"],
        # Quota resets at the next UTC-naive midnight after the stored date.
        "google_quota_reset": str(datetime.datetime.combine(
            google_quota["date"] + datetime.timedelta(days=1), datetime.time.min
        )),
        "evidence_samples": {c: evidence_texts[:2] for c in claims[:2]},  # sample evidence
        "fact_checking": fc_results[:10],
    }
|
| 176 |
|
| 177 |
# ---------------------------
|