Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Files Community

VictorM-Coder committed 27 days ago

Commit

6cabfbd

verified ·

1 Parent(s): 72d2f9a

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -162

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
-import math
 import pandas as pd
 import gradio as gr
@@ -16,7 +15,7 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 # -----------------------------
-# SENTENCE SPLITTER (no lookbehinds)
 # Protect → split → restore
 # -----------------------------
 ABBR = [
@@ -30,19 +29,10 @@ def _protect(text: str) -> str:
     t = text.strip()
     if not t:
         return ""
-    # Normalize newlines to spaces (Turnitin-like continuous flow)
-    t = re.sub(r"\s*\n+\s*", " ", t)
-    # Protect ellipses
-    t = t.replace("...", "⟨ELLIPSIS⟩")
-    # Protect decimals like 3.14
-    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
-    # Protect known abbreviations' final dot
-    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
     return t
 def _restore(text: str) -> str:
@@ -55,179 +45,72 @@ def sentence_split(text: str):
     t = _protect(text)
     if not t:
         return []
-    # Split on ., ?, ! followed by whitespace and then a plausible sentence starter
-    # (quote or capital or opening paren) OR end of string.
     parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
-    # Rebuild sentences: regex split keeps the delimiter in alternating groups
-    sentences = []
-    buf = ""
     for i, chunk in enumerate(parts):
         if i % 2 == 0:
             buf += chunk
         else:
-            # chunk is the delimiter [.?!]
             buf += chunk
-            sentences.append(buf.strip())
-            buf = ""
     if buf.strip():
         sentences.append(buf.strip())
-    # Clean/restore
-    sentences = [_restore(s).strip() for s in sentences if s.strip()]
-    return sentences
 # -----------------------------
-# UTILITIES
 # -----------------------------
-def batched(iterable, n=64):
-    for i in range(0, len(iterable), n):
-        yield iterable[i:i+n], i
-def contig_spans(labels):
-    longest = 0
-    count = 0
-    run = 0
-    for lab in labels:
-        if lab == "AI":
-            run += 1
-            longest = max(longest, run)
-        else:
-            if run > 0:
-                count += 1
-            run = 0
-    if run > 0:
-        count += 1
-    return count, longest
-def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
-    if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
-        return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
-    if flag_pct >= 60 and longest_span >= 4:
-        return "⚠️ Strong AI signals (multiple/long spans)."
-    if flag_pct >= 30 or longest_span >= 3:
-        return "△ Some AI indicators (partial/short spans)."
-    return "✓ No clear AI indication (by this detector)."
-# -----------------------------
-# CORE CLASSIFIER
-# -----------------------------
-def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
     sents = sentence_split(text)
     if not sents:
-        return [], [], 0.0, 0.0, (0, 0)
-    all_probs = []
-    for chunk, _ in batched(sents, n=batch_size):
-        inputs = tokenizer(
-            chunk,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=max_len
-        ).to(device)
-        with torch.no_grad():
-            logits = model(**inputs).logits
-            probs = F.softmax(logits, dim=-1)  # [:,0]=Human, [:,1]=AI
-        all_probs.extend(probs[:, 1].detach().cpu().tolist())
-    labels = ["AI" if p >= ai_threshold else "Human" for p in all_probs]
-    avg_ai_prob = float(sum(all_probs) / len(all_probs))
-    flagged_pct = 100.0 * sum(1 for l in labels if l == "AI") / len(labels)
-    spans = contig_spans(labels)
     rows = []
-    for i, (s, p, lab) in enumerate(zip(sents, all_probs, labels), start=1):
-        rows.append({
-            "Sentence #": i,
-            "Sentence": s,
-            "AI Probability": round(p, 4),
-            "Label": lab
-        })
-    return sents, rows, avg_ai_prob, flagged_pct, spans
-# -----------------------------
-# HTML HIGHLIGHT
-# -----------------------------
-def color_for_prob(p):
-    if p < 0.30: return "#11823b"  # green
-    if p < 0.70: return "#b8860b"  # amber
-    return "#b80d0d"               # red
-def build_highlight_html(rows):
-    blocks = []
-    for r in rows:
-        p = r["AI Probability"]
-        col = color_for_prob(p)
-        pct = f"{p*100:.1f}%"
-        text = re.sub(r"\s+", " ", r["Sentence"]).strip()
-        blocks.append(
-            f"<span style='background:rgba(0,0,0,0.02); "
-            f"padding:4px 6px; border-radius:6px; display:block; margin:6px 0;'>"
-            f"<strong style='color:{col}'>[{pct} {r['Label']}]</strong> {text}</span>"
         )
-    return "\n".join(blocks)
-# -----------------------------
-# PUBLIC API FOR GRADIO
-# -----------------------------
-def generate_report(text, threshold):
-    if not text or not text.strip():
-        return "⚠️ Please enter some text.", None, None, None
-    sents, rows, avg_ai_prob, flagged_pct, (span_count, longest_span) = classify_sentences(
-        text, ai_threshold=threshold
-    )
-    verdict = verdict_from_stats(flagged_pct, longest_span, avg_ai_prob)
-    overall = (
-        f"⚖️ Turnitin-style Summary\n"
-        f"- Overall AI probability (avg per sentence): {avg_ai_prob*100:.1f}%\n"
-        f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
-        f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
-        f"- Verdict: {verdict}\n"
-        f"\nⓘ This is an approximation using an open detector; actual Turnitin results may differ."
-    )
-    html = build_highlight_html(rows)
-    df = pd.DataFrame(rows, columns=["Sentence #", "Sentence", "AI Probability", "Label"])
-    return overall, html, df, f"{flagged_pct:.1f}%"
 # -----------------------------
-# GRADIO UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧭 Writenix AI Detector — Turnitin-style (Sentence-Level)")
-    with gr.Row():
-        text_input = gr.Textbox(
-            label="Paste your content",
-            lines=16,
-            placeholder="Drop your essay/article here…"
-        )
-    with gr.Row():
-        threshold = gr.Slider(
-            0.50, 0.95, value=0.70, step=0.01,
-            label="AI Flag Threshold (probability ≥ threshold ⇒ AI)"
-        )
-        detect_btn = gr.Button("🔎 Analyze")
-    with gr.Row():
-        ai_summary = gr.Textbox(label="Report Summary", lines=8)
-        flagged_pct = gr.Label(label="% Sentences Flagged (AI)")
-    highlighted = gr.HTML(label="Per-Sentence Highlights")
-    table = gr.Dataframe(headers=["Sentence #", "Sentence", "AI Probability", "Label"], wrap=True)
-    detect_btn.click(
-        fn=generate_report,
-        inputs=[text_input, threshold],
-        outputs=[ai_summary, highlighted, table, flagged_pct]
-    )
 if __name__ == "__main__":
     demo.launch()

 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import pandas as pd
 import gradio as gr
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 # -----------------------------
+# SENTENCE SPLITTER (simple, robust; the split regex itself uses no lookbehinds)
 # Protect → split → restore
 # -----------------------------
 ABBR = [
     t = text.strip()
     if not t:
         return ""
+    t = re.sub(r"\s*\n+\s*", " ", t)            # normalize newlines
+    t = t.replace("...", "⟨ELLIPSIS⟩")          # ellipses
+    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)  # decimals like 3.14
+    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)       # abbreviations' dot
     return t
 def _restore(text: str) -> str:
     t = _protect(text)
     if not t:
         return []
+    # split on [.?!] followed by whitespace and likely sentence start or end
     parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
+    sentences, buf = [], ""
     for i, chunk in enumerate(parts):
         if i % 2 == 0:
             buf += chunk
         else:
             buf += chunk
+            sentences.append(buf.strip()); buf = ""
     if buf.strip():
         sentences.append(buf.strip())
+    return [_restore(s).strip() for s in sentences if s.strip()]
 # -----------------------------
+# CLASSIFY SENTENCE-BY-SENTENCE
 # -----------------------------
def classify_sentence_by_sentence(text, threshold=0.70, max_len=512, batch_size=64):
    """Score each sentence of *text* and build the per-sentence report.

    The text is split with ``sentence_split``; every sentence is run through
    the module-level ``tokenizer``/``model`` and the softmax value at index 1
    is used as the "AI" probability (the original code comments that index 0
    is Human and index 1 is AI).

    Args:
        text: Raw user text. ``None`` or whitespace-only input is treated as
            empty.
        threshold: A sentence is labelled "AI" when its probability is
            greater than or equal to this value, otherwise "Human".
        max_len: Maximum token length passed to the tokenizer (longer
            sentences are truncated).
        batch_size: Number of sentences sent through the model per forward
            pass; values below 1 are clamped to 1.

    Returns:
        A ``(status, html, dataframe)`` tuple. For empty input the status is
        a warning message and both ``html`` and ``dataframe`` are ``None``.
        Otherwise ``html`` holds one ``<div>`` per sentence and ``dataframe``
        has the columns ``#``, ``Sentence``, ``AI_Prob`` and ``Label``.
    """
    # Function-scope import so the block does not rely on a file-level import;
    # imported by name to avoid clashing with the local HTML string below.
    from html import escape

    sents = sentence_split(text) if text and text.strip() else []
    if not sents:
        return "⚠️ Please paste some text.", None, None

    # Run the model in bounded batches so a long document does not have to
    # fit into a single padded tensor.
    step = max(1, int(batch_size))
    ai_probs = []
    for start in range(0, len(sents), step):
        batch = sents[start:start + step]
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1)  # [:,0]=Human, [:,1]=AI
        # Cast to float32 before leaving the device so reduced-precision
        # dtypes convert cleanly to Python floats.
        ai_probs.extend(probs[:, 1].float().cpu().tolist())

    rows = []
    highlights = []
    for i, (s, ai_p) in enumerate(zip(sents, ai_probs), start=1):
        label = "AI" if ai_p >= threshold else "Human"
        pct = f"{ai_p*100:.1f}%"
        # Colour bands are fixed and independent of the user threshold.
        if ai_p < 0.30:
            color = "#11823b"      # green
        elif ai_p < 0.70:
            color = "#b8860b"      # amber
        else:
            color = "#b80d0d"      # red
        # Collapse whitespace outside the f-string (a backslash inside an
        # f-string expression is a syntax error before Python 3.12), then
        # escape the user-supplied text before embedding it in HTML.
        shown = escape(re.sub(r"\s+", " ", s))
        highlights.append(
            f"<div style='margin:6px 0; padding:6px 8px; border-radius:6px; background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{shown}</div>"
        )
        rows.append([i, s, round(ai_p, 4), label])

    html_out = "\n".join(highlights)
    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return "Done ✅ (sentence-by-sentence only)", html_out, df
 # -----------------------------
+# GRADIO UI (minimal)
 # -----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("### 🧠 Sentence-by-Sentence AI Check")
+    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
+    threshold = gr.Slider(0.50, 0.95, value=0.70, step=0.01, label="AI threshold")
+    btn = gr.Button("Analyze")
+    status = gr.Label(label="Status")
+    highlights = gr.HTML(label="Per-Sentence Highlights")
+    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
+    btn.click(classify_sentence_by_sentence, inputs=[text_input, threshold],
+              outputs=[status, highlights, table])
 if __name__ == "__main__":
     demo.launch()