Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Community

VictorM-Coder committed on Dec 2, 2025

Commit

70fc9f3

verified ·

1 Parent(s): 668274d

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -67

app.py CHANGED Viewed

@@ -14,141 +14,178 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
-# -----------------------------
-# THRESHOLD
-# -----------------------------
 THRESHOLD = 0.80
 # -----------------------------
-# SENTENCE SPLITTING
 # -----------------------------
-ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
-    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
-    "a.m", "p.m"
 ]
-ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
 def _protect(text):
-    t = text.replace("...", "⟨ELLIPSIS⟩")
-    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
-    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
-    return t
 def _restore(text):
-    return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
-def split_sentences_preserving(text):
-    protected = _protect(text)
-    parts = re.split(r"([.?!])(\s+)", protected)
-    sentences = []
-    current = ""
-    for i in range(0, len(parts), 3):
-        part = parts[i]
-        punct = parts[i+1] if i+1 < len(parts) else ""
-        space = parts[i+2] if i+2 < len(parts) else ""
-        current = part + punct
-        sentences.append(_restore(current))
-        sentences.append(space)  # preserve exact spacing (spaces and newlines)
-    return sentences  # alternating [sentence, whitespace, sentence, whitespace...]
-def extract_pure_sentences(sent_block):
-    return [s for s in sent_block if not s.isspace()]
 # -----------------------------
-# GROUP SENTENCES
 # -----------------------------
 def group_sentences(sents, size=3):
-    return [" ".join(sents[i:i+size]) for i in range(0, len(sents), size)]
 # -----------------------------
-# MAIN ANALYSIS
 # -----------------------------
 def analyze(text, max_len=512):
-    # 1. Split while preserving structure
-    blocks = split_sentences_preserving(text)
-    pure_sentences = extract_pure_sentences(blocks)
     if not pure_sentences:
         return "—", "—", "<em>Paste text to analyze.</em>", None
-    # 2. Group for model
     grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
-    # 3. Run model
-    inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
-                       truncation=True, max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
         chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
-    # 4. Expand chunk scores to per-sentence
-    sentence_ai = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
         end = min(start + 3, len(pure_sentences))
         for _ in range(start, end):
-            sentence_ai.append(prob)
     # -----------------------------
-    # FINAL OUTPUT RECONSTRUCTION
     # -----------------------------
     highlighted = ""
-    sent_index = 0
     for block in blocks:
         if block.isspace():
-            highlighted += block  # preserve exact spacing
-        else:
-            # this block is a real sentence
-            ai_p = sentence_ai[sent_index]
-            sent_index += 1
-            pct = f"{ai_p*100:.1f}%"
-            if ai_p < 0.30:
-                color = "#11823b"
-            elif ai_p < 0.70:
-                color = "#b8860b"
-            else:
-                color = "#b80d0d"
-            highlighted += f"<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> {block.strip()}</span> "
-    # Overall score
-    overall = sum(sentence_ai) / len(sentence_ai)
-    overall_pct = f"{overall*100:.1f}%"
     overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
     # Table output
     df = pd.DataFrame(
-        [[i+1, s, sentence_ai[i]] for i, s in enumerate(pure_sentences)],
         columns=["#", "Sentence", "AI_Prob"]
     )
     return overall_label, overall_pct, highlighted, df
 # -----------------------------
 # UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")
-    text_input = gr.Textbox(label="Paste text", lines=14)
     btn = gr.Button("Analyze")
-    verdict = gr.Label(label="Overall Verdict")
-    score = gr.Label(label="Overall AI Score")
-    highlights = gr.HTML(label="Highlighted Text (Original Format)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 THRESHOLD = 0.80
 # -----------------------------
+# ABBREVIATION PROTECTION
 # -----------------------------
+ABBR = [
+    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
+    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
+    "u.s", "u.k", "a.m", "p.m"
 ]
+ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 def _protect(text):
+    text = text.replace("...", "⟨ELLIPSIS⟩")
+    text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
+    text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
+    return text
 def _restore(text):
+    return (
+        text.replace("⟨ABBRDOT⟩", ".")
+            .replace("⟨DECIMAL⟩", ".")
+            .replace("⟨ELLIPSIS⟩", "...")
+    )
+# -----------------------------
+# PERFECT PARAGRAPH-PRESERVING SPLITTER
+# -----------------------------
+def split_preserving_structure(text):
+    """
+    Splits text into:
+    - EXACT newline blocks (\n, \n\n, etc.)
+    - Sentences inside non-newline blocks
+    """
+    blocks = re.split(r"(\n+)", text)  # keep newline separators
+    final_blocks = []
+    for block in blocks:
+        if block.startswith("\n"):
+            final_blocks.append(block)  # preserve EXACT paragraph spacing
+        else:
+            protected = _protect(block)
+            parts = re.split(r"([.?!])(\s+)", protected)
+            for i in range(0, len(parts), 3):
+                sentence = parts[i]
+                punct = parts[i+1] if i+1 < len(parts) else ""
+                space = parts[i+2] if i+2 < len(parts) else ""
+                whole = sentence + punct
+                if whole.strip():
+                    final_blocks.append(_restore(whole))
+                if space:
+                    final_blocks.append(space)
+    return final_blocks
+def extract_sentences_only(blocks):
+    """Return only sentence blocks (no whitespace/newlines)."""
+    return [
+        b for b in blocks
+        if b.strip() != "" and not b.startswith("\n") and not b.isspace()
+    ]
 # -----------------------------
+# GROUPING
 # -----------------------------
 def group_sentences(sents, size=3):
+    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 # -----------------------------
+# ANALYSIS LOGIC
 # -----------------------------
 def analyze(text, max_len=512):
+    # Structured block split
+    blocks = split_preserving_structure(text)
+    pure_sentences = extract_sentences_only(blocks)
     if not pure_sentences:
         return "—", "—", "<em>Paste text to analyze.</em>", None
+    # Group into 3-sentence windows (Turnitin style)
     grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
+    # Run model
+    inputs = tokenizer(clean_grouped, return_tensors="pt",
+                       padding=True, truncation=True,
+                       max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
         chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
+    # Expand group scores back to individual sentences
+    ai_scores = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
         end = min(start + 3, len(pure_sentences))
         for _ in range(start, end):
+            ai_scores.append(prob)
     # -----------------------------
+    # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
     # -----------------------------
     highlighted = ""
+    current_sentence = 0
     for block in blocks:
+        # newline block → keep EXACT
+        if block.startswith("\n"):
+            highlighted += block
+            continue
+        # whitespace block → keep
         if block.isspace():
+            highlighted += block
+            continue
+        # real sentence → highlight
+        ai_p = ai_scores[current_sentence]
+        current_sentence += 1
+        pct = f"{ai_p * 100:.1f}%"
+        if ai_p < 0.30:
+            color = "#11823b"
+        elif ai_p < 0.70:
+            color = "#b8860b"
+        else:
+            color = "#b80d0d"
+        highlighted += (
+            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
+            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
+            f"{block.strip()}</span>"
+        )
+        # maintain spacing after sentence
+        highlighted += " "
+    # -----------------------------
+    # OVERALL SCORE
+    # -----------------------------
+    overall = sum(ai_scores) / len(ai_scores)
+    overall_pct = f"{overall * 100:.1f}%"
     overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
     # Table output
     df = pd.DataFrame(
+        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
         columns=["#", "Sentence", "AI_Prob"]
     )
     return overall_label, overall_pct, highlighted, df
 # -----------------------------
 # UI
 # -----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
+    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
     btn = gr.Button("Analyze")
+    verdict = gr.Label(label="Verdict (Overall)")
+    score = gr.Label(label="AI Score")
+    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])