Spaces:

Mohammedmarzuk17
/

Edushield-AI-Backend

Sleeping

App Files Files Community

Mohammedmarzuk17 committed on Oct 16, 2025

Commit

5e7b159

verified ·

1 Parent(s): d043e29

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -41

app.py CHANGED Viewed

@@ -6,10 +6,12 @@ from concurrent.futures import ThreadPoolExecutor
 # ---------------------------
 # Load Models
 # ---------------------------
 claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
 claim_classifier = pipeline("zero-shot-classification", model=claim_model_name, device=-1)
 claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
 ai_detect_model_name = "roberta-base-openai-detector"
 ai_detector = pipeline("text-classification", model=ai_detect_model_name, device=-1)
@@ -23,67 +25,66 @@ google_quota = {"count": 0, "date": datetime.date.today()}
 GOOGLE_DAILY_LIMIT = 100
 # ---------------------------
-# Text Splitting (Sentence-based)
 # ---------------------------
-def split_sentences(text):
-    """Split text by period into complete sentences."""
-    sentences = [s.strip() for s in text.split('.') if s.strip()]
-    return [s + '.' for s in sentences]
 # ---------------------------
 # Claim Extraction
 # ---------------------------
-def extract_claims(page_text, max_claims=20):
     """
-    Extract sentences as claims (not filtered).
-    - Each sentence from user input is treated as a claim.
-    - Classify each sentence with DeBERTa model for factuality.
     """
-    sentences = split_sentences(page_text)
     def classify_sentence(s):
-        try:
-            out = claim_classifier(s, claim_labels)
-            label_priority = ["factual claim", "opinion", "personal anecdote"]
-            for lbl in label_priority:
-                if lbl in out["labels"]:
-                    return {"text": s, "label": lbl, "score": round(out["scores"][out["labels"].index(lbl)], 3)}
-        except Exception:
-            pass
-        return {"text": s, "label": "unknown", "score": 0.0}
     results = []
     with ThreadPoolExecutor() as executor:
         for r in executor.map(classify_sentence, sentences):
-            results.append(r)
-    # Keep full sentences, limit only if too long
-    results = results[:max_claims]
     return results
 # ---------------------------
 # AI Text Detection
 # ---------------------------
 def detect_ai(texts):
-    """Detect AI-generated or human-written content."""
     if isinstance(texts, str):
         texts = [texts]
     results = []
     for t in texts:
-        try:
-            out = ai_detector(t)
-            raw_label = out[0]["label"]
-            label = "AI-generated" if raw_label.lower() in ["fake", "ai-generated"] else "Human"
-            results.append({"text": t, "label": label, "score": round(out[0]["score"], 3)})
-        except Exception:
-            results.append({"text": t, "label": "error", "score": 0.0})
     return results
 # ---------------------------
-# Google Search Fact Checking
 # ---------------------------
 def fetch_google_search(claim):
-    """Fetch top 3 Google results for a claim."""
     global google_quota
     today = datetime.date.today()
     if google_quota["date"] != today:
@@ -97,7 +98,7 @@ def fetch_google_search(claim):
         r = requests.get(url).json()
         google_quota["count"] += 1
         items = r.get("items", [])
-        return [f"{item['title']}: {item['snippet']}" for item in items[:3]]
     except Exception:
         return []
@@ -107,19 +108,20 @@ def fetch_google_search(claim):
 def predict(user_text=""):
     """
     Runs both:
-    1. Full-text analysis (AI detection + single fact-check)
-    2. Claim-based analysis (sentence split + AI detection + fact-check)
     """
     if not user_text.strip():
         return {"error": "No text provided."}
-    # --- Full Text Analysis ---
     full_ai_result = detect_ai(user_text)
-    # ✅ FIX: Now fact-check entire input once
-    full_fact_checking = {"Full text": fetch_google_search(user_text)}
-    # --- Claim-based Analysis ---
     claims_data = extract_claims(user_text)
     claims_texts = [c["text"] for c in claims_data]
     claims_ai_results = detect_ai(claims_texts) if claims_texts else []
@@ -129,7 +131,7 @@ def predict(user_text=""):
         "full_text": {
             "input": user_text,
             "ai_detection": full_ai_result,
-            "fact_checking": full_fact_checking  # full input fact-checked once
         },
         "claims": claims_data,
         "claims_ai_detection": claims_ai_results,

 # ---------------------------
 # Load Models
 # ---------------------------
+# Claim Extraction → Zero-Shot Classifier (DeBERTa MNLI)
 claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
 claim_classifier = pipeline("zero-shot-classification", model=claim_model_name, device=-1)
 claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
+# AI Text Detection → OpenAI Detector (Roberta-based)
 ai_detect_model_name = "roberta-base-openai-detector"
 ai_detector = pipeline("text-classification", model=ai_detect_model_name, device=-1)
 GOOGLE_DAILY_LIMIT = 100
 # ---------------------------
+# Safe Split Helpers
 # ---------------------------
def safe_split_text(text):
    """
    Split text into candidate fragments on '.', ',' or ';'.

    Separators that sit inside a numeric token are left alone, so values
    such as 1.41, 1,200 and $1,200 stay in one piece. A '.' or ',' that
    follows a digit IS still treated as a separator when it is followed by
    whitespace or the end of the text, so a sentence that merely ends in a
    number ("... founded in 1998. It ...", "In 2020, the ...") is split.

    Returns the stripped fragments, in order, keeping only those with more
    than four whitespace-separated words.
    """
    pattern = (
        r'(?<!\d)\.(?!\d)'        # '.' with no digit on either side
        r'|(?<![\d\$]),(?!\d)'    # ',' not after a digit/'$' and not before a digit
        r'|(?<=\d)[.,](?=\s|$)'   # number-final '.'/',' that ends a sentence or clause
        r'|;'
    )
    fragments = (piece.strip() for piece in re.split(pattern, text))
    # Very short fragments are rarely standalone claims, so they are dropped.
    return [piece for piece in fragments if len(piece.split()) > 4]
 # ---------------------------
 # Claim Extraction
 # ---------------------------
def extract_claims(page_text, max_claims=20, batch_size=50):
    """
    Extract and classify candidate claims from text.

    - Splits ``page_text`` with ``safe_split_text``.
    - Runs each fragment through ``claim_classifier`` against ``claim_labels``
      and keeps the highest-scoring label.
    - Fragments whose best label is not a factual claim, opinion or personal
      anecdote (i.e. "other") are discarded.

    Parameters:
        page_text: text to analyse.
        max_claims: maximum number of results returned.
        batch_size: currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        A list of ``{"text", "label", "score"}`` dicts, longest fragments
        first, truncated to ``max_claims`` entries.
    """
    sentences = safe_split_text(page_text)
    kept_labels = ("factual claim", "opinion", "personal anecdote")

    def classify_sentence(s):
        out = claim_classifier(s, claim_labels)
        # The classifier reports a score for EVERY candidate label, so a
        # membership test on out["labels"] is always true. Pick the label
        # with the highest score instead (no reliance on output ordering).
        best_label, best_score = max(
            zip(out["labels"], out["scores"]), key=lambda pair: pair[1]
        )
        if best_label not in kept_labels:
            return None
        return {"text": s, "label": best_label, "score": round(best_score, 3)}

    # Threaded classification; executor.map preserves input order.
    with ThreadPoolExecutor() as executor:
        results = [r for r in executor.map(classify_sentence, sentences) if r]

    # Longest fragments first, then cap the number of claims returned.
    results.sort(key=lambda item: len(item["text"]), reverse=True)
    return results[:max_claims]
 # ---------------------------
 # AI Text Detection
 # ---------------------------
 def detect_ai(texts):
     """Detect whether input text is AI-generated or human-written.

     Accepts a single string or a list of strings. Returns one
     ``{"text", "label", "score"}`` dict per input, in order. A raw
     detector label of "fake" or "ai-generated" (case-insensitive) is
     reported as "AI-generated"; any other label is reported as "Human".
     The score is the detector's score for its predicted label, rounded
     to three decimals.
     """
     if isinstance(texts, str):
         texts = [texts]
     results = []
     for t in texts:
         # Whole documents are passed in by the caller; truncate at the
         # tokenizer level so inputs longer than the model's maximum
         # sequence length are classified instead of raising.
         out = ai_detector(t, truncation=True)
         top = out[0]
         is_ai = top["label"].lower() in ["fake", "ai-generated"]
         results.append({
             "text": t,
             "label": "AI-generated" if is_ai else "Human",
             "score": round(top["score"], 3),
         })
     return results
 # ---------------------------
+# Google Evidence Gathering
 # ---------------------------
 def fetch_google_search(claim):
     global google_quota
     today = datetime.date.today()
     if google_quota["date"] != today:
         r = requests.get(url).json()
         google_quota["count"] += 1
         items = r.get("items", [])
+        return [f"{item['title']}: {item['snippet']}" for item in items[:3]]  # top 3 results
     except Exception:
         return []
 def predict(user_text=""):
     """
     Runs both:
+    1. Full-text analysis (AI detection on entire input + sentence-based fact-check)
+    2. Claim-extracted analysis (claim split + AI detection + fact-check)
     """
     if not user_text.strip():
         return {"error": "No text provided."}
+    # --- Full text analysis ---
     full_ai_result = detect_ai(user_text)
+    # NEW: Split strictly by '.' to preserve full user input sentences
+    dot_sentences = [s.strip() for s in user_text.split('.') if s.strip()]
+    full_fact_checking = {s: fetch_google_search(s) for s in dot_sentences}
+    # --- Claim-based analysis ---
     claims_data = extract_claims(user_text)
     claims_texts = [c["text"] for c in claims_data]
     claims_ai_results = detect_ai(claims_texts) if claims_texts else []
         "full_text": {
             "input": user_text,
             "ai_detection": full_ai_result,
+            "fact_checking": full_fact_checking
         },
         "claims": claims_data,
         "claims_ai_detection": claims_ai_results,