Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Community

VictorM-Coder committed on Dec 2, 2025

Commit

26af59c

verified ·

1 Parent(s): 21a21f1

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -132

app.py CHANGED Viewed

@@ -15,193 +15,140 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 # -----------------------------
-# AI DECISION THRESHOLD (80%)
 # -----------------------------
 THRESHOLD = 0.80
 # -----------------------------
-# SENTENCE SPLITTING UTILITIES
 # -----------------------------
-ABBR = [
-    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
-    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
-    "u.s", "u.k", "a.m", "p.m"
 ]
-ABBR_REGEX = re.compile(
-    r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
-    flags=re.IGNORECASE
-)
 def _protect(text):
-    t = text.strip()
-    if not t:
-        return ""
-    t = re.sub(r"\s*\n+\s*", " ", t)
-    t = t.replace("...", "⟨ELLIPSIS⟩")
     t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
     t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
     return t
 def _restore(text):
-    return (
-        text.replace("⟨ABBRDOT⟩", ".")
-            .replace("⟨DECIMAL⟩", ".")
-            .replace("⟨ELLIPSIS⟩", "...")
-    )
-def sentence_split(text):
-    t = _protect(text)
-    if not t:
-        return []
-    parts = re.split(
-        r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t
-    )
-    sentences, buf = [], ""
-    for i, chunk in enumerate(parts):
-        if i % 2 == 0:
-            buf += chunk
-        else:
-            buf += chunk
-            sentences.append(buf.strip())
-            buf = ""
-    if buf.strip():
-        sentences.append(buf.strip())
-    return [_restore(s).strip() for s in sentences if s.strip()]
-# -----------------------------
-# PARAGRAPH UTILITIES
-# -----------------------------
-def split_paragraphs(text):
-    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
-    return paragraphs
-def map_sentences_to_paragraphs(paragraphs):
-    all_sentences = []
-    mapping = []
-    for p_idx, para in enumerate(paragraphs):
-        sents = sentence_split(para)
-        for s_idx, s in enumerate(sents):
-            all_sentences.append(s)
-            mapping.append((p_idx, s_idx))
-    return all_sentences, mapping
-def combine_paragraph_scores(paragraphs, mapping, sentence_probs):
-    bucket = [[] for _ in paragraphs]
-    for (p_idx, _), prob in zip(mapping, sentence_probs):
-        bucket[p_idx].append(prob)
-    final_scores = [
-        (sum(scores) / len(scores)) if scores else 0
-        for scores in bucket
-    ]
-    return final_scores
 # -----------------------------
-# GROUP SENTENCES (TURNITIN STYLE)
 # -----------------------------
 def group_sentences(sents, size=3):
-    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 # -----------------------------
-# CORE ANALYSIS
 # -----------------------------
 def analyze(text, max_len=512):
-    paragraphs = split_paragraphs(text)
-    if not paragraphs:
-        return "—", "—", "<em>Paste some text to analyze.</em>", None
-    # map paragraphs → sentences
-    sents, mapping = map_sentences_to_paragraphs(paragraphs)
-    # group sentences in 3s
-    grouped = group_sentences(sents, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
-    # tokenize chunks
-    inputs = tokenizer(
-        clean_grouped,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=max_len
-    ).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
         chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
-    # expand chunk probability to each sentence
-    ai_probs = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
-        end = min(start + 3, len(sents))
         for _ in range(start, end):
-            ai_probs.append(prob)
-    # final paragraph-level scores
-    paragraph_ai = combine_paragraph_scores(paragraphs, mapping, ai_probs)
-    # overall score
-    overall = sum(ai_probs) / len(ai_probs)
-    overall_pct = f"{overall * 100:.1f}%"
-    overall_label = (
-        "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
-    )
-    # paragraph-based HTML output
-    final_html = ""
-    for idx, (para, ai) in enumerate(zip(paragraphs, paragraph_ai), start=1):
-        pct = f"{ai * 100:.1f}%"
-        label = "AI" if ai >= THRESHOLD else "Human"
-        # color
-        if ai < 0.30:
-            color = "#11823b"
-        elif ai < 0.70:
-            color = "#b8860b"
         else:
-            color = "#b80d0d"
-        final_html += f"""
-        <div style='margin:12px 0; padding:12px; border-radius:8px; background:#fafafa'>
-            <strong style='color:{color}'>[Paragraph {idx}: {pct} {label}]</strong>
-            <div style='margin-top:8px; white-space:pre-wrap'>{para}</div>
-        </div>
-        """
-    # sentence table (still available if needed)
-    rows = []
-    for i, s in enumerate(sents, start=1):
-        rows.append([i, s, round(ai_probs[i-1], 4)])
-    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob"])
-    return overall_label, overall_pct, final_html, df
 # -----------------------------
-# GRADIO UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (Turnitin-style Paragraph Mode)")
-    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
     btn = gr.Button("Analyze")
     verdict = gr.Label(label="Overall Verdict")
     score = gr.Label(label="Overall AI Score")
-    highlights = gr.HTML(label="Paragraph Highlights (Original Format)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 # -----------------------------
+# THRESHOLD
 # -----------------------------
 THRESHOLD = 0.80
 # -----------------------------
+# SENTENCE SPLITTING
 # -----------------------------
# Abbreviations (written without their final period) whose trailing "." is
# masked by _protect() so the sentence splitter does not break after them.
ABBR = [
    "e.g", "i.e",
    "mr", "mrs", "ms", "dr", "prof",
    "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt",
    "inc", "ltd", "co",
    "u.s", "u.k",
    "a.m", "p.m",
]

# Case-insensitive matcher for "<abbreviation>." at a word boundary; group 1
# captures the abbreviation itself so the period can be swapped for a sentinel.
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(re.escape(abbr) for abbr in ABBR) + r")\.",
    flags=re.IGNORECASE,
)
 def _protect(text):
+    t = text.replace("...", "⟨ELLIPSIS⟩")
     t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
     t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
     return t
 def _restore(text):
+    return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
def split_sentences_preserving(text):
    """Split *text* into sentences while keeping the whitespace between them.

    A sentence boundary is a ".", "?" or "!" followed by whitespace; periods
    masked by _protect() (ellipses, decimals, abbreviations) never count as
    boundaries and are restored in the returned sentences.

    Args:
        text: The raw input string. ``None`` or ``""`` yields an empty list.

    Returns:
        A list of non-empty strings in document order. Sentences (including
        their closing punctuation) are interleaved with the exact whitespace
        runs that followed them, so ``"".join(result)`` reproduces *text*
        for any input that contains none of the sentinel tokens.
        Callers tell the two kinds of entry apart with ``str.isspace()``.
        Empty strings are never emitted: ``"".isspace()`` is False, so an
        empty entry would otherwise be mistaken for a sentence downstream.
    """
    if not text:
        return []

    # With two capture groups, re.split returns
    # [body, punct, gap, body, punct, gap, ..., tail].
    parts = re.split(r"([.?!])(\s+)", _protect(text))

    blocks = []
    for i in range(0, len(parts), 3):
        # The final slice holds only the tail; pad it so unpacking still works.
        body, punct, gap = (parts[i:i + 3] + ["", ""])[:3]
        sentence = _restore(body + punct)
        if sentence:
            blocks.append(sentence)
        if gap:
            blocks.append(gap)  # preserve exact spacing (spaces and newlines)
    return blocks
def extract_pure_sentences(sent_block):
    """Return the entries of *sent_block* that are not pure whitespace.

    Order is preserved. Note that an empty string is kept, because
    ``"".isspace()`` is False.
    """
    kept = []
    for piece in sent_block:
        if piece.isspace():
            continue
        kept.append(piece)
    return kept
 # -----------------------------
+# GROUP SENTENCES
 # -----------------------------
 def group_sentences(sents, size=3):
+    return [" ".join(sents[i:i+size]) for i in range(0, len(sents), size)]
 # -----------------------------
+# MAIN ANALYSIS
 # -----------------------------
 def analyze(text, max_len=512):
+    # 1. Split while preserving structure
+    blocks = split_sentences_preserving(text)
+    pure_sentences = extract_pure_sentences(blocks)
+    if not pure_sentences:
+        return "—", "—", "<em>Paste text to analyze.</em>", None
+    # 2. Group for model
+    grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
+    # 3. Run model
+    inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
+                       truncation=True, max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
         chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
+    # 4. Expand chunk scores to per-sentence
+    sentence_ai = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
+        end = min(start + 3, len(pure_sentences))
         for _ in range(start, end):
+            sentence_ai.append(prob)
+    # -----------------------------
+    # FINAL OUTPUT RECONSTRUCTION
+    # -----------------------------
+    highlighted = ""
+    sent_index = 0
+    for block in blocks:
+        if block.isspace():
+            highlighted += block  # preserve exact spacing
         else:
+            # this block is a real sentence
+            ai_p = sentence_ai[sent_index]
+            sent_index += 1
+            pct = f"{ai_p*100:.1f}%"
+            if ai_p < 0.30:
+                color = "#11823b"
+            elif ai_p < 0.70:
+                color = "#b8860b"
+            else:
+                color = "#b80d0d"
+            highlighted += f"<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> {block.strip()}</span> "
+    # Overall score
+    overall = sum(sentence_ai) / len(sentence_ai)
+    overall_pct = f"{overall*100:.1f}%"
+    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
+    # Table output
+    df = pd.DataFrame(
+        [[i+1, s, sentence_ai[i]] for i, s in enumerate(pure_sentences)],
+        columns=["#", "Sentence", "AI_Prob"]
+    )
+    return overall_label, overall_pct, highlighted, df
 # -----------------------------
+# UI
 # -----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")
+    text_input = gr.Textbox(label="Paste text", lines=14)
     btn = gr.Button("Analyze")
     verdict = gr.Label(label="Overall Verdict")
     score = gr.Label(label="Overall AI Score")
+    highlights = gr.HTML(label="Highlighted Text (Original Format)")
     table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])