Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Files Community

VictorM-Coder committed on Dec 2, 2025

Commit

668274d

verified ·

1 Parent(s): 96ab1a6

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -104

app.py CHANGED Viewed

@@ -14,178 +14,141 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 THRESHOLD = 0.80
 # -----------------------------
-# ABBREVIATION PROTECTION
 # -----------------------------
-ABBR = [
-    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
-    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
-    "u.s", "u.k", "a.m", "p.m"
 ]
-ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 def _protect(text):
-    text = text.replace("...", "⟨ELLIPSIS⟩")
-    text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
-    text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
-    return text
 def _restore(text):
-    return (
-        text.replace("⟨ABBRDOT⟩", ".")
-            .replace("⟨DECIMAL⟩", ".")
-            .replace("⟨ELLIPSIS⟩", "...")
-    )
-# -----------------------------
-# PERFECT PARAGRAPH-PRESERVING SPLITTER
-# -----------------------------
-def split_preserving_structure(text):
-    """
-    Splits text into:
-    - EXACT newline blocks (\n, \n\n, etc.)
-    - Sentences inside non-newline blocks
-    """
-    blocks = re.split(r"(\n+)", text)  # keep newline separators
-    final_blocks = []
-    for block in blocks:
-        if block.startswith("\n"):
-            final_blocks.append(block)  # preserve EXACT paragraph spacing
-        else:
-            protected = _protect(block)
-            parts = re.split(r"([.?!])(\s+)", protected)
-            for i in range(0, len(parts), 3):
-                sentence = parts[i]
-                punct = parts[i+1] if i+1 < len(parts) else ""
-                space = parts[i+2] if i+2 < len(parts) else ""
-                whole = sentence + punct
-                if whole.strip():
-                    final_blocks.append(_restore(whole))
-                if space:
-                    final_blocks.append(space)
-    return final_blocks
-def extract_sentences_only(blocks):
-    """Return only sentence blocks (no whitespace/newlines)."""
-    return [
-        b for b in blocks
-        if b.strip() != "" and not b.startswith("\n") and not b.isspace()
-    ]
 # -----------------------------
-# GROUPING
 # -----------------------------
 def group_sentences(sents, size=3):
-    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 # -----------------------------
-# ANALYSIS LOGIC
 # -----------------------------
 def analyze(text, max_len=512):
-    # Structured block split
-    blocks = split_preserving_structure(text)
-    pure_sentences = extract_sentences_only(blocks)
     if not pure_sentences:
         return "—", "—", "<em>Paste text to analyze.</em>", None
-    # Group into 3-sentence windows (Turnitin style)
     grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
-    # Run model
-    inputs = tokenizer(clean_grouped, return_tensors="pt",
-                       padding=True, truncation=True,
-                       max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
         chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
-    # Expand group scores back to individual sentences
-    ai_scores = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
         end = min(start + 3, len(pure_sentences))
         for _ in range(start, end):
-            ai_scores.append(prob)
     # -----------------------------
-    # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
     # -----------------------------
     highlighted = ""
-    current_sentence = 0
     for block in blocks:
-        # newline block → keep EXACT
-        if block.startswith("\n"):
-            highlighted += block
-            continue
-        # whitespace block → keep
         if block.isspace():
-            highlighted += block
-            continue
-        # real sentence → highlight
-        ai_p = ai_scores[current_sentence]
-        current_sentence += 1
-        pct = f"{ai_p * 100:.1f}%"
-        if ai_p < 0.30:
-            color = "#11823b"
-        elif ai_p < 0.70:
-            color = "#b8860b"
         else:
-            color = "#b80d0d"
-        highlighted += (
-            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
-            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
-            f"{block.strip()}</span>"
-        )
-        # maintain spacing after sentence
-        highlighted += " "
-    # -----------------------------
-    # OVERALL SCORE
-    # -----------------------------
-    overall = sum(ai_scores) / len(ai_scores)
-    overall_pct = f"{overall * 100:.1f}%"
     overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
     # Table output
     df = pd.DataFrame(
-        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
         columns=["#", "Sentence", "AI_Prob"]
     )
     return overall_label, overall_pct, highlighted, df
 # -----------------------------
 # UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
-    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
     btn = gr.Button("Analyze")
-    verdict = gr.Label(label="Verdict (Overall)")
-    score = gr.Label(label="AI Score")
-    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
+# -----------------------------
+# THRESHOLD
+# -----------------------------
 THRESHOLD = 0.80
 # -----------------------------
# SENTENCE SPLITTING
# -----------------------------
# Abbreviations whose trailing period must not be mistaken for a sentence end.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
    "a.m", "p.m",
]
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)


def _protect(text):
    """Replace periods that do not end a sentence with placeholder tokens.

    Ellipses ("..."), decimal points between digits and the period following
    a known abbreviation are masked so the sentence splitter ignores them.
    """
    t = text.replace("...", "⟨ELLIPSIS⟩")
    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
    return t


def _restore(text):
    """Undo ``_protect`` by turning every placeholder back into its original text."""
    return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")


def split_sentences_preserving(text):
    """Split ``text`` into sentence blocks and the exact whitespace between them.

    A sentence ends at ".", "?" or "!" followed by whitespace. The returned
    list interleaves sentence strings with the whitespace runs (spaces and
    newlines) that separated them, so ``"".join(result)`` reproduces ``text``.

    Zero-length pieces are never emitted: an empty input returns ``[]`` and a
    text with no trailing whitespace does not get a spurious ``""`` entry.
    """
    parts = re.split(r"([.?!])(\s+)", _protect(text))
    blocks = []
    # re.split with two capture groups yields [body, punct, gap, body, punct, gap, ..., tail].
    for i in range(0, len(parts), 3):
        sentence = "".join(parts[i:i + 2])  # body + its terminating punctuation
        gap = parts[i + 2] if i + 2 < len(parts) else ""
        if sentence:
            blocks.append(_restore(sentence))
        if gap:
            blocks.append(gap)  # preserve exact spacing (spaces and newlines)
    return blocks


def extract_pure_sentences(sent_block):
    """Return only the blocks that contain visible text (drops empty/whitespace blocks)."""
    return [s for s in sent_block if s.strip()]
 # -----------------------------
# GROUP SENTENCES
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated chunks of ``size``.

    The final chunk holds the remainder and may be shorter than ``size``.

    Raises:
        ValueError: if ``size`` is not positive. A negative step would make
            ``range`` empty and silently discard every sentence.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 # -----------------------------
# MAIN ANALYSIS
# -----------------------------
def analyze(text, max_len=512):
    """Score ``text`` with the classifier and build the outputs shown in the UI.

    Sentences are scored in groups of three; every sentence in a group receives
    that group's probability. The probability is read from logit column 1
    (presumably the "AI" class — confirm against the model's label mapping).

    Args:
        text: Raw user text. ``None`` is treated as empty.
        max_len: Maximum token length per group passed to the tokenizer.

    Returns:
        A 4-tuple ``(overall_label, overall_pct, highlighted_html, df)`` where
        ``df`` is a DataFrame with columns "#", "Sentence", "AI_Prob". When no
        sentence is found the placeholders ``("—", "—", <hint html>, None)``
        are returned instead.
    """
    import html  # local import: only needed to escape user text below

    group_size = 3  # single source of truth for grouping AND score expansion

    # 1. Split while preserving structure. Zero-length blocks carry no text or
    #    spacing; dropping them keeps the sentence/score indices aligned.
    blocks = [b for b in split_sentences_preserving(text or "") if b]
    pure_sentences = extract_pure_sentences(blocks)
    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # 2. Group for model
    grouped = group_sentences(pure_sentences, group_size)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # 3. Run model
    inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_len).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # 4. Expand chunk scores to per-sentence: sentence i belongs to chunk i // group_size
    sentence_ai = [chunk_probs[i // group_size] for i in range(len(pure_sentences))]

    # -----------------------------
    # FINAL OUTPUT RECONSTRUCTION
    # -----------------------------
    pieces = []
    sent_index = 0
    for block in blocks:
        if block.isspace():
            pieces.append(block)  # preserve exact spacing
            continue
        # this block is a real sentence
        ai_p = sentence_ai[sent_index]
        sent_index += 1
        pct = f"{ai_p*100:.1f}%"
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"
        # Escape user text so "<", ">" and "&" render literally instead of
        # being interpreted as markup by the HTML component.
        safe_sentence = html.escape(block.strip())
        pieces.append(
            f"<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; border-radius:4px;'>"
            f"<strong style='color:{color}'>[{pct}]</strong> {safe_sentence}</span> "
        )
    highlighted = "".join(pieces)

    # Overall score
    overall = sum(sentence_ai) / len(sentence_ai)
    overall_pct = f"{overall*100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    # Table output
    df = pd.DataFrame(
        [[i + 1, s, sentence_ai[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )
    return overall_label, overall_pct, highlighted, df
 # -----------------------------
 # UI
 # -----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")
+    text_input = gr.Textbox(label="Paste text", lines=14)
     btn = gr.Button("Analyze")
+    verdict = gr.Label(label="Overall Verdict")
+    score = gr.Label(label="Overall AI Score")
+    highlights = gr.HTML(label="Highlighted Text (Original Format)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])