Update app.py
app.py CHANGED
@@ -4,11 +4,17 @@ import requests, feedparser, time, threading, re, json, os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from concurrent.futures import ThreadPoolExecutor
+import nltk
 
 # ---------------------------
-# Load Models
+# NLTK setup for keyword extraction
 # ---------------------------
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
 
+# ---------------------------
+# Load Models
+# ---------------------------
 claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
 claim_classifier = pipeline("zero-shot-classification", model=claim_model_name)
 claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
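A note on the two nltk.download calls added above: on NLTK 3.9+ the tokenizer and tagger data were repackaged as punkt_tab and averaged_perceptron_tagger_eng, so word_tokenize/pos_tag can still raise LookupError after this setup. A minimal defensive sketch (not part of the commit):

import nltk

# Try both the classic and the renamed data packages; nltk.download()
# returns False instead of raising when a package name is unknown.
for pkg in ("punkt", "punkt_tab",
            "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(pkg, quiet=True)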
@@ -22,7 +28,6 @@ nli_pipeline = pipeline("text-classification", model=nli_model_name, tokenizer=n
 # ---------------------------
 # Evidence Sources
 # ---------------------------
-
 RSS_FEEDS = [
     "https://www.snopes.com/feed/",
     "https://www.politifact.com/rss/factchecks/",
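For reference, pulling one of these feeds with feedparser (already imported at the top of app.py) looks like the sketch below; each entry's summary is HTML, which is presumably why clean_text() strips tags later on.

import feedparser

feed = feedparser.parse("https://www.snopes.com/feed/")
for entry in feed.entries[:3]:
    # most fact-check feeds expose a title and an HTML summary per entry
    print(entry.title, "-", entry.summary[:80])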
@@ -38,7 +43,6 @@ CACHE_TTL = 60 * 60 * 3 # 3 hours
 # ---------------------------
 # Google Fact-Check API Setup
 # ---------------------------
-
 GOOGLE_API_KEY = "AIzaSyAC56onKwR17zd_djUPEfGXQACy9qRjDxw"
 GOOGLE_QUERY_LIMIT = 95
 COUNTER_FILE = "/tmp/google_fc_counter.json"
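reset_daily_google_counter and save_json_cache sit outside the changed hunks. A plausible shape for the daily counter they maintain, with hypothetical field names (an assumption, not code from app.py):

import json
import datetime

COUNTER_FILE = "/tmp/google_fc_counter.json"

def save_json_cache(path, obj):
    # persist a small JSON blob (counter or response cache) to disk
    with open(path, "w") as f:
        json.dump(obj, f)

# one counter per calendar day; reset it when the stored date goes stale
google_counter = {"date": datetime.date.today().isoformat(), "count": 0}
save_json_cache(COUNTER_FILE, google_counter)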
@@ -71,28 +75,31 @@ reset_daily_google_counter()
 def google_fact_check(claim):
     reset_daily_google_counter()
     if claim in google_cache:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        hits = google_cache[claim]
+    elif google_counter["count"] >= GOOGLE_QUERY_LIMIT:
+        hits = []
+    else:
+        hits = []
+        try:
+            url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?query={claim}&key={GOOGLE_API_KEY}"
+            resp = requests.get(url, timeout=5)
+            google_counter["count"] += 1
+            save_json_cache(COUNTER_FILE, google_counter)
+            if resp.status_code == 200:
+                results = resp.json().get("claims", [])
+                hits = [c.get("text", "")[:250]+"..." if len(c.get("text",""))>250 else c.get("text","") for c in results]
+        except Exception as e:
+            print(f"Google Fact-Check API error: {e}")
+
+    google_cache[claim] = hits
+    save_json_cache(GOOGLE_CACHE_FILE, google_cache)
+
+    print(f"\nClaim: {claim}\nGoogle Fact-Check Hits: {hits if hits else 'None'}")
+    return hits
 
 # ---------------------------
 # Helpers
 # ---------------------------
-
 def clean_text(text):
     text = re.sub(r'<img.*?>', '', text)
     text = re.sub(r'<.*?>', '', text)
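One caveat in the rewritten google_fact_check: the claim is interpolated into the URL unencoded, so text containing &, #, or % can break the request. A sketch of the same call using params=, which lets requests percent-encode the query (an assumed variant, not what the commit ships):

import requests

def google_fact_check_encoded(claim, api_key):
    # requests URL-encodes each params value, so arbitrary claim text is safe
    resp = requests.get(
        "https://factchecktools.googleapis.com/v1alpha1/claims:search",
        params={"query": claim, "key": api_key},
        timeout=5,
    )
    resp.raise_for_status()
    return resp.json().get("claims", [])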
@@ -132,9 +139,40 @@ def start_rss_refresher():
     t.start()
 
 # ---------------------------
-# RSS Semantic Matching
+# Keyword Extraction
+# ---------------------------
+def extract_keywords(sentence):
+    words = nltk.word_tokenize(sentence)
+    pos_tags = nltk.pos_tag(words)
+    keywords = [w for w, pos in pos_tags if pos.startswith('NN') or pos.startswith('JJ')]
+    return keywords[:5] if keywords else words[:5]
+
+# ---------------------------
+# Wikipedia Summary
 # ---------------------------
+def get_wikipedia_summary(query):
+    summary = ""
+    keywords = extract_keywords(query)
+    search_variants = ['_'.join(keywords), query.replace(' ', '_')]
+    for variant in search_variants:
+        try:
+            url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{variant}"
+            resp = requests.get(url, timeout=5)
+            if resp.status_code == 200:
+                summary = clean_text(resp.json().get("extract", ""))
+                if summary:
+                    break
+        except Exception:
+            continue
+    if summary:
+        print(f"\nClaim: {query}\nWikipedia Summary: {summary[:300]}...")
+    else:
+        print(f"\nClaim: {query}\nNo Wikipedia summary found.")
+    return summary
 
+# ---------------------------
+# RSS Semantic + Keyword Matching
+# ---------------------------
 def match_rss_semantic(claim, top_k=2):
     if not RSS_CACHE:
         return []
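To see what extract_keywords feeds into the Wikipedia lookup, here is the same tokenize/POS-tag pattern run standalone (output is illustrative; tags vary slightly across tagger versions):

import nltk

sentence = "The Great Barrier Reef lost half its coral cover after 1995."
words = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(words)
# keep nouns (NN*) and adjectives (JJ*), as the new helper does
keywords = [w for w, pos in tags if pos.startswith('NN') or pos.startswith('JJ')]
print(keywords[:5])
# typically something like ['Great', 'Barrier', 'Reef', 'coral', 'cover']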
@@ -142,6 +180,9 @@ def match_rss_semantic(claim, top_k=2):
     texts = [a["summary"] for a in RSS_CACHE]
     titles = [a["title"] for a in RSS_CACHE]
 
+    claim_keywords = extract_keywords(claim)
+    keyword_pattern = '|'.join(claim_keywords).lower()
+
     vectorizer = TfidfVectorizer(stop_words='english')
     tfidf_matrix = vectorizer.fit_transform([claim] + texts)
     cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
@@ -150,7 +191,7 @@ def match_rss_semantic(claim, top_k=2):
     matched = []
     matched_titles = []
     for i in top_indices:
-        if cosine_scores[i] > 0.1:
+        if cosine_scores[i] > 0.1 or any(k in texts[i].lower() for k in claim_keywords):
             matched.append(texts[i])
             matched_titles.append(titles[i])
 
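The relaxed filter now keeps an article when either the TF-IDF cosine clears 0.1 or any claim keyword appears verbatim in the summary. A standalone sketch of the cosine half, with the same sklearn calls:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

claim = "vaccines cause autism"
texts = ["Fact check: vaccines do not cause autism",
         "City council approves new budget"]
# row 0 is the claim; rows 1+ are the candidate article summaries
tfidf = TfidfVectorizer(stop_words='english').fit_transform([claim] + texts)
scores = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()
print(scores)  # the related article scores well above 0.1, the other near 0.0

Note that keyword_pattern is computed above but never used; the membership test iterates claim_keywords directly.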
@@ -163,31 +204,9 @@ def match_rss_semantic(claim, top_k=2):
 
     return matched
 
-# ---------------------------
-# Wikipedia Summary
-# ---------------------------
-
-def get_wikipedia_summary(query):
-    summary = ""
-    try:
-        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
-        resp = requests.get(url, timeout=5)
-        if resp.status_code == 200:
-            summary = clean_text(resp.json().get("extract", ""))
-    except Exception:
-        pass
-
-    if summary:
-        print(f"\nClaim: {query}\nWikipedia Summary: {summary[:300]}...")
-    else:
-        print(f"\nClaim: {query}\nNo Wikipedia summary found.")
-
-    return summary
-
 # ---------------------------
 # Claim Extraction
 # ---------------------------
-
 def extract_claims(page_text):
     sentences = re.split(r'(?<=[.!?;\n])\s+', page_text) if page_text else []
     results, seen = [], set()
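The removed helper and its replacement hit the same Wikipedia REST endpoint; only the title construction changed (keyword-joined variants are now tried before the raw query). The endpoint expects an underscore-joined title:

import requests

resp = requests.get(
    "https://en.wikipedia.org/api/rest_v1/page/summary/Great_Barrier_Reef",
    timeout=5,
)
if resp.status_code == 200:
    # the "extract" field holds the plain-text lead section
    print(resp.json().get("extract", "")[:200])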
@@ -207,7 +226,6 @@ def extract_claims(page_text):
 # ---------------------------
 # AI Detection
 # ---------------------------
-
 def detect_ai(texts):
     if isinstance(texts, str):
         texts = [texts]
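The isinstance guard lets detect_ai take a single string or a batch; transformers pipelines accept lists directly. For example, with the zero-shot classifier and labels defined at the top of app.py (detect_ai's own model is outside this diff):

outs = claim_classifier(
    ["The moon landing happened in 1969.", "I love this song."],
    candidate_labels=claim_labels,
)
for out in outs:
    # highest-scoring label first
    print(out["labels"][0], round(out["scores"][0], 3))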
@@ -222,7 +240,6 @@ def detect_ai(texts):
 # ---------------------------
 # Fact-Checking with Threaded NLI + Google
 # ---------------------------
-
 def process_evidence_pair(claim, evidence):
     key = f"{claim}||{evidence}"
     if key in nli_cache:
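process_evidence_pair is the per-pair unit that the section title says runs threaded. The fan-out itself is not shown in this diff; a sketch of how it presumably looks, using the ThreadPoolExecutor imported at the top (score_evidence is a hypothetical name):

from concurrent.futures import ThreadPoolExecutor

def score_evidence(claim, evidence_list, max_workers=4):
    # one NLI call per (claim, evidence) pair, run on worker threads
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(lambda ev: process_evidence_pair(claim, ev),
                             evidence_list))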
@@ -292,7 +309,6 @@ def fact_check_with_sources(claims):
 # ---------------------------
 # Predict
 # ---------------------------
-
 def predict(page_text=""):
     claims = extract_claims(page_text)
     ai_results = detect_ai(claims) if claims else []
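A minimal smoke test for the predict entry point (illustrative input; apart from the removed blank line, predict is unchanged by this commit):

sample = ("NASA confirmed the presence of water on the Moon in 2020. "
          "Personally, I find space research fascinating.")
print(predict(sample))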
@@ -306,7 +322,6 @@ def predict(page_text=""):
 # ---------------------------
 # Gradio UI
 # ---------------------------
-
 with gr.Blocks() as demo:
     gr.Markdown("## EduShield AI Backend - Predict API & UI")
     with gr.Tab("Predict"):
@@ -328,7 +343,6 @@ with gr.Blocks() as demo:
 # ---------------------------
 # Launch
 # ---------------------------
-
 if __name__ == "__main__":
     refresh_rss_cache(force=True)
     start_rss_refresher()
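Once the Space is running, the Predict tab can also be driven programmatically. A sketch with gradio_client; the api_name is an assumption, since the endpoint wiring is outside this diff:

from gradio_client import Client

client = Client("http://127.0.0.1:7860")
result = client.predict("The Eiffel Tower is 330 metres tall.",
                        api_name="/predict")
print(result)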