Update app.py
app.py
CHANGED

Old version (removed lines are prefixed with "-"; unprefixed lines are unchanged context; "…" marks removed lines whose content was not captured in this view):

@@ -1,14 +1,14 @@
import gradio as gr
-from transformers import pipeline
-…
from concurrent.futures import ThreadPoolExecutor
-import torch.nn.functional as F

# ---------------------------
# Load Models
# ---------------------------

-# Claim Extraction → Zero-Shot Classifier
claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
claim_classifier = pipeline(
    "zero-shot-classification",

@@ -17,7 +17,7 @@ claim_classifier = pipeline(
)
claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]

-# AI Text Detection
ai_detect_model_name = "roberta-base-openai-detector"
ai_detector = pipeline(
    "text-classification",

@@ -25,28 +25,9 @@ ai_detector = pipeline(
    device=-1
)

-# …
-# ✅ Semantic Model (EmbeddingGemma-300M)
-# ---------------------------
SEM_MODEL_NAME = "google/embeddinggemma-300m"
-
-sem_tokenizer = AutoTokenizer.from_pretrained(SEM_MODEL_NAME)
-sem_model = AutoModel.from_pretrained(SEM_MODEL_NAME)
-sem_model.eval()
-
-def embed_texts(texts):
-    """Generate normalized sentence embeddings"""
-    with torch.no_grad():
-        inputs = sem_tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
-        outputs = sem_model(**inputs)
-        embeddings = outputs.last_hidden_state.mean(dim=1)
-        embeddings = F.normalize(embeddings, p=2, dim=1)
-        return embeddings

# ---------------------------
# Google Search Config

@@ -58,190 +39,113 @@ google_quota = {"count": 0, "date": datetime.date.today()}
GOOGLE_DAILY_LIMIT = 100

# ---------------------------
-# …
# ---------------------------
def safe_split_text(text):
-    …
-    but do NOT split when between numbers (e.g., 1.41, 1,200).
-    """
-    pattern = r'(?<!\d)[.](?!\d)|(?<![\d\$]),(?!\d)|;'
-    return [
-        s.strip()
-        for s in re.split(pattern, text)
-        if len(s.strip().split()) > 4
-    ]

# ---------------------------
# Claim Extraction
# ---------------------------
-def extract_claims(…
-    sentences = safe_split_text(…

-    def …
        out = claim_classifier(s, claim_labels)
-        …
-            )
-        }
-        return None
-
-    results = []
-    with ThreadPoolExecutor() as executor:
-        for r in executor.map(classify_sentence, sentences):
-            if r:
-                results.append(r)
-
-    results = sorted(results, key=lambda x: -len(x["text"]))[:max_claims]
-    return results

# ---------------------------
-# AI …
# ---------------------------
def detect_ai(texts):
    if isinstance(texts, str):
        texts = [texts]
-    …
    for t in texts:
-        …
-            "text": t,
-            "label": label,
-            "score": round(out[0]["score"], 3)
-        })
-    return results

# ---------------------------
-# Google …
-# (Keyword + Semantic Ranking)
# ---------------------------
-def …
    global google_quota
-    …
-            "semantic_results": ["[Google] Daily quota reached."]
-        }
-
-    try:
-        url = (
-            "https://www.googleapis.com/customsearch/v1"
-            f"?q={requests.utils.quote(claim)}"
-            f"&key={GOOGLE_API_KEY}"
-            f"&cx={GOOGLE_CX}"
-            f"&num={num_results}"
-        )
-        r = requests.get(url).json()
-        google_quota["count"] += 1
-
-        items = r.get("items", [])
-        snippets = [
-            f"{item['title']}: {item['snippet']}"
-            for item in items
-        ]
-
-        # Keyword results (original behavior)
-        keyword_results = snippets[:3]
-
-        # Semantic ranking
-        if snippets:
-            claim_emb = embed_texts([claim])
-            snippet_embs = embed_texts(snippets)
-            sims = torch.matmul(claim_emb, snippet_embs.T)[0]
-            top_idx = torch.argsort(sims, descending=True)[:3]
-            semantic_results = [snippets[i] for i in top_idx]
-        else:
-            semantic_results = []
-
-        return {
-            "keyword_results": keyword_results,
-            "semantic_results": semantic_results
-        }
-
-    except Exception:
-        return {
-            "keyword_results": [],
-            "semantic_results": []
-        }
-
-# ---------------------------
-# Unified Predict Function
-# ---------------------------
-def predict(user_text=""):
-    if not user_text.strip():
-        return {"error": "No text provided."}
-
-    # --- Full text analysis ---
-    full_ai_result = detect_ai(user_text)
-    dot_sentences = [
-        s.strip() for s in user_text.split('.') if s.strip()
-    ]
-    full_fact_checking = {
-        s: fetch_google_search(s) for s in dot_sentences
-    }

-    …

    return {
-        …
-            "ai_detection": full_ai_result,
-            "fact_checking": full_fact_checking
-        },
-        "claims": claims_data,
-        "claims_ai_detection": claims_ai_results,
-        "claims_fact_checking": claims_fact_checking,
-        "google_quota_used": google_quota["count"],
-        "google_quota_reset": str(
-            datetime.datetime.combine(
-                google_quota["date"] + datetime.timedelta(days=1),
-                datetime.time.min
-            )
-        )
    }

# ---------------------------
-# …
# ---------------------------
-…
-        placeholder="Paste text here..."
-    )
-    predict_btn = gr.Button("Run Predict")
-    output_json = gr.JSON(label="Predict Results")

-    …

# ---------------------------
-# …
# ---------------------------
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")

New version (added lines are prefixed with "+"; unprefixed lines are unchanged context; "…" marks unchanged lines collapsed by the diff view):

import gradio as gr
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer, util
+import requests, re, datetime
from concurrent.futures import ThreadPoolExecutor

# ---------------------------
# Load Models
# ---------------------------

+# Claim Extraction → Zero-Shot Classifier
claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
claim_classifier = pipeline(
    "zero-shot-classification",
    …
)
claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]

+# AI Text Detection
ai_detect_model_name = "roberta-base-openai-detector"
ai_detector = pipeline(
    "text-classification",
    …
    device=-1
)

+# ✅ Semantic Model (CORRECT way for EmbeddingGemma)
SEM_MODEL_NAME = "google/embeddinggemma-300m"
+sem_model = SentenceTransformer(SEM_MODEL_NAME)

# ---------------------------
# Google Search Config
…
GOOGLE_DAILY_LIMIT = 100

# ---------------------------
+# Helpers
# ---------------------------
def safe_split_text(text):
+    pattern = r'(?<!\d)[.](?!\d)'
+    return [s.strip() for s in re.split(pattern, text) if len(s.strip()) > 10]

# ---------------------------
# Claim Extraction
# ---------------------------
+def extract_claims(text, max_claims=20):
+    sentences = safe_split_text(text)

+    def classify(s):
        out = claim_classifier(s, claim_labels)
+        lbl = out["labels"][0]
+        score = round(out["scores"][0], 3)
+        return {"text": s, "label": lbl, "score": score}
+
+    with ThreadPoolExecutor() as ex:
+        results = list(ex.map(classify, sentences))
+
+    return results[:max_claims]

# ---------------------------
+# AI Detection
# ---------------------------
def detect_ai(texts):
    if isinstance(texts, str):
        texts = [texts]
+    out = []
    for t in texts:
+        r = ai_detector(t)[0]
+        label = "AI-generated" if r["label"].lower() in ["fake", "ai-generated"] else "Human"
+        out.append({"text": t, "label": label, "score": round(r["score"], 3)})
+    return out

# ---------------------------
+# Google + Semantic Fact Check
# ---------------------------
+def fetch_google_search_semantic(claim, k=3):
    global google_quota
+    if google_quota["count"] >= GOOGLE_DAILY_LIMIT:
+        return {"keyword": [], "semantic": []}

+    url = (
+        "https://www.googleapis.com/customsearch/v1"
+        f"?q={requests.utils.quote(claim)}"
+        f"&key={GOOGLE_API_KEY}&cx={GOOGLE_CX}&num=10"
+    )

+    r = requests.get(url).json()
+    google_quota["count"] += 1
+    items = r.get("items", [])

+    snippets = [f"{i['title']}: {i['snippet']}" for i in items]
+    keyword_results = snippets[:k]
+
+    if not snippets:
+        return {"keyword": keyword_results, "semantic": []}
+
+    q_emb = sem_model.encode(claim, normalize_embeddings=True)
+    s_emb = sem_model.encode(snippets, normalize_embeddings=True)
+    sims = util.cos_sim(q_emb, s_emb)[0]
+
+    top_idx = sims.argsort(descending=True)[:k]
+    semantic_results = [snippets[i] for i in top_idx]

    return {
+        "keyword": keyword_results,
+        "semantic": semantic_results
    }

# ---------------------------
+# Predict
# ---------------------------
+def predict(text=""):
+    if not text.strip():
+        return {"error": "No input"}

+    full_ai = detect_ai(text)
+    sentences = safe_split_text(text)
+    full_fc = {s: fetch_google_search_semantic(s) for s in sentences}

+    claims = extract_claims(text)
+    claim_ai = detect_ai([c["text"] for c in claims])
+    claim_fc = {c["text"]: fetch_google_search_semantic(c["text"]) for c in claims}
+
+    return {
+        "full_text": {
+            "input": text,
+            "ai_detection": full_ai,
+            "fact_checking": full_fc
+        },
+        "claims": claims,
+        "claims_ai_detection": claim_ai,
+        "claims_fact_checking": claim_fc
+    }

# ---------------------------
+# UI
# ---------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## EduShield AI Backend – Keyword + Semantic Fact Check")
+    inp = gr.Textbox(lines=8, label="Input Text")
+    btn = gr.Button("Run Analysis")
+    out = gr.JSON()
+    btn.click(predict, inp, out)
+
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
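
For reference, the sentence-splitting regex was simplified in this commit; both the old and the new pattern avoid splitting inside numbers such as 1.41 or 1,200, but the old one also split on commas and semicolons. A quick standalone check (a sketch using re directly, with an invented input string):

import re

text = "GDP grew by 1.41 percent. Analysts expected 1,200 new jobs; markets rose."

old_pattern = r'(?<!\d)[.](?!\d)|(?<![\d\$]),(?!\d)|;'  # old: split on . , ; outside numbers
new_pattern = r'(?<!\d)[.](?!\d)'                        # new: split only on periods outside numbers

print(re.split(old_pattern, text))
# ['GDP grew by 1.41 percent', ' Analysts expected 1,200 new jobs', ' markets rose', '']
print(re.split(new_pattern, text))
# ['GDP grew by 1.41 percent', ' Analysts expected 1,200 new jobs; markets rose', '']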
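The semantic re-ranking that fetch_google_search_semantic now performs can also be exercised on its own. A minimal sketch, assuming the google/embeddinggemma-300m checkpoint is accessible (it may require accepting the model terms on Hugging Face) and using invented snippet strings in place of Google CSE results:

from sentence_transformers import SentenceTransformer, util

# Same model the app loads at startup.
sem_model = SentenceTransformer("google/embeddinggemma-300m")

claim = "The Eiffel Tower is 330 metres tall."
snippets = [  # stand-ins for the "title: snippet" strings built from search results
    "Eiffel Tower: The tower is 330 metres (1,083 ft) tall.",
    "Paris tourism: Top ten restaurants near the Seine.",
    "Gustave Eiffel: His company designed and built the tower.",
]

# Encode claim and snippets as normalized vectors, then rank snippets by cosine similarity.
q_emb = sem_model.encode(claim, normalize_embeddings=True)
s_emb = sem_model.encode(snippets, normalize_embeddings=True)
sims = util.cos_sim(q_emb, s_emb)[0]

top_idx = sims.argsort(descending=True)[:2]
print([snippets[int(i)] for i in top_idx])  # most claim-relevant snippets first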
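Once the app is running (for example via python app.py on the default port), the predict endpoint can be called programmatically. A hedged sketch with gradio_client, assuming a local server at localhost:7860 and the default endpoint name derived from the predict function; adjust the URL or api_name if the deployment differs:

from gradio_client import Client

# Assumes the app from this commit is serving locally; a Space URL or ID would also work here.
client = Client("http://localhost:7860")

result = client.predict(
    "NASA's Artemis program aims to return humans to the Moon.",
    api_name="/predict",  # assumed default name; check client.view_api() if it differs
)
# result should be the JSON payload returned by predict():
# full_text, claims, claims_ai_detection, claims_fact_checking
print(result)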