Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Files Community

VictorM-Coder committed on Dec 3, 2025

Commit

fdd45e5

verified ·

1 Parent(s): 8d27116

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -32

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import gradio as gr
 MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 THRESHOLD = 0.80
@@ -20,18 +20,21 @@ THRESHOLD = 0.80
 # ABBREVIATION PROTECTION
 # -----------------------------
 ABBR = [
-    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
-    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
-    "u.s", "u.k", "a.m", "p.m"
 ]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 def _protect(text):
     """Swap dots that should not be read as sentence ends for placeholders.

     Three substitutions are applied in order:
     a literal ``...`` becomes ``⟨ELLIPSIS⟩``, a dot with a digit on both
     sides becomes ``⟨DECIMAL⟩``, and the dot matched by ``ABBR_REGEX``
     after an abbreviation becomes ``⟨ABBRDOT⟩``.
     """
     masked = text.replace("...", "⟨ELLIPSIS⟩")
     # Only a dot directly between two digits is treated as a decimal point.
     masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
     # Keep the abbreviation itself (group 1) and mask only its trailing dot.
     return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
 def _restore(text):
     return (
         text.replace("⟨ABBRDOT⟩", ".")
@@ -39,6 +42,7 @@ def _restore(text):
             .replace("⟨ELLIPSIS⟩", "...")
     )
 # -----------------------------
 # PERFECT PARAGRAPH-PRESERVING SPLITTER
 # -----------------------------
@@ -55,30 +59,32 @@ def split_preserving_structure(text):
             for i in range(0, len(parts), 3):
                 sentence = parts[i]
-                punct = parts[i+1] if i+1 < len(parts) else ""
-                space = parts[i+2] if i+2 < len(parts) else ""
                 whole = sentence + punct
                 if whole.strip():
                     final_blocks.append(_restore(whole))
                 if space:
                     final_blocks.append(space)
     return final_blocks
 def extract_sentences_only(blocks):
     """Return only the blocks that carry sentence text.

     A block is dropped when it is empty or whitespace-only, or when it
     begins with a newline character. The remaining blocks are returned
     unchanged and in their original order.
     """
     sentences = []
     for block in blocks:
         if not block.strip():
             # Empty or whitespace-only block.
             continue
         if block.startswith("\n"):
             continue
         sentences.append(block)
     return sentences
 # -----------------------------
 # GROUPING
 # -----------------------------
 def group_sentences(sents, size=3):
     """Join consecutive sentences into space-separated chunks.

     Walks ``sents`` in steps of ``size`` and joins each slice with a single
     space. The last chunk holds whatever is left over, so it may contain
     fewer than ``size`` sentences.
     """
     chunks = []
     for start in range(0, len(sents), size):
         window = sents[start:start + size]
         chunks.append(" ".join(window))
     return chunks
 # -----------------------------
 # ANALYSIS LOGIC
 # -----------------------------
@@ -90,19 +96,21 @@ def analyze(text, max_len=512):
     if not pure_sentences:
         return "—", "—", "<em>Paste text to analyze.</em>", None
     grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
-    # Run model
     inputs = tokenizer(clean_grouped, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
-        chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
-    # Expand grouped probs to each sentence
     ai_scores = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
@@ -111,54 +119,49 @@ def analyze(text, max_len=512):
             ai_scores.append(prob)
     # -----------------------------
-    # COLOR HIGHLIGHTING (FULL SENTENCE BLOCK COLORING)
     # -----------------------------
     highlighted = ""
-    current_sentence = 0
     for block in blocks:
-        # newline blocks
         if block.startswith("\n"):
             highlighted += block
             continue
-        # whitespace blocks
         if block.isspace():
             highlighted += block
             continue
-        # real sentence
-        ai_p = ai_scores[current_sentence]
-        current_sentence += 1
         pct = f"{ai_p * 100:.1f}%"
-        # COLOR LEVELS (background + text)
         if ai_p < 0.30:
-            bg = "rgba(17,130,59,0.18)"      # green
-            color = "#0f5e2e"
         elif ai_p < 0.70:
-            bg = "rgba(184,134,11,0.23)"     # yellow
-            color = "#7a5f00"
         else:
-            bg = "rgba(184,13,13,0.20)"      # red
-            color = "#7a0000"
         highlighted += (
-            f"<span style='background:{bg}; padding:5px 8px; "
-            f"border-radius:6px; display:inline-block; margin-bottom:4px;'>"
-            f"<strong style='color:{color}'>[{pct}]</strong> "
             f"{block.strip()}</span> "
         )
     # -----------------------------
-    # OVERALL
     # -----------------------------
     overall = sum(ai_scores) / len(ai_scores)
     overall_pct = f"{overall * 100:.1f}%"
     overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
-    # Table
     df = pd.DataFrame(
         [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
         columns=["#", "Sentence", "AI_Prob"]
@@ -166,11 +169,12 @@ def analyze(text, max_len=512):
     return overall_label, overall_pct, highlighted, df
 # -----------------------------
-# GRADIO UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Color Highlight Mode")
     text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
     btn = gr.Button("Analyze")

 MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 THRESHOLD = 0.80
 # ABBREVIATION PROTECTION
 # -----------------------------
 ABBR = [
+    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc",
+    "fig", "al", "jr", "sr", "st", "no", "vol", "pp", "mt",
+    "inc", "ltd", "co", "u.s", "u.k", "a.m", "p.m"
 ]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 def _protect(text):
     """Swap dots that should not be read as sentence ends for placeholders.

     Three substitutions are applied in order:
     a literal ``...`` becomes ``⟨ELLIPSIS⟩``, a dot with a digit on both
     sides becomes ``⟨DECIMAL⟩``, and the dot matched by ``ABBR_REGEX``
     after an abbreviation becomes ``⟨ABBRDOT⟩``.
     """
     masked = text.replace("...", "⟨ELLIPSIS⟩")
     # Only a dot directly between two digits is treated as a decimal point.
     masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
     # Keep the abbreviation itself (group 1) and mask only its trailing dot.
     return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
 def _restore(text):
     return (
         text.replace("⟨ABBRDOT⟩", ".")
             .replace("⟨ELLIPSIS⟩", "...")
     )
 # -----------------------------
 # PERFECT PARAGRAPH-PRESERVING SPLITTER
 # -----------------------------
             for i in range(0, len(parts), 3):
                 sentence = parts[i]
+                punct = parts[i + 1] if i + 1 < len(parts) else ""
+                space = parts[i + 2] if i + 2 < len(parts) else ""
                 whole = sentence + punct
                 if whole.strip():
                     final_blocks.append(_restore(whole))
                 if space:
                     final_blocks.append(space)
     return final_blocks
 def extract_sentences_only(blocks):
     """Return only the blocks that carry sentence text.

     A block is dropped when it is empty or whitespace-only, or when it
     begins with a newline character. The remaining blocks are returned
     unchanged and in their original order.
     """
     sentences = []
     for block in blocks:
         if not block.strip():
             # Empty or whitespace-only block.
             continue
         if block.startswith("\n"):
             continue
         sentences.append(block)
     return sentences
 # -----------------------------
 # GROUPING
 # -----------------------------
 def group_sentences(sents, size=3):
     """Join consecutive sentences into space-separated chunks.

     Walks ``sents`` in steps of ``size`` and joins each slice with a single
     space. The last chunk holds whatever is left over, so it may contain
     fewer than ``size`` sentences.
     """
     chunks = []
     for start in range(0, len(sents), size):
         window = sents[start:start + size]
         chunks.append(" ".join(window))
     return chunks
 # -----------------------------
 # ANALYSIS LOGIC
 # -----------------------------
     if not pure_sentences:
         return "—", "—", "<em>Paste text to analyze.</em>", None
+    # Group into 3-sentence windows
     grouped = group_sentences(pure_sentences, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
+    # Model forward pass
     inputs = tokenizer(clean_grouped, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=max_len).to(device)
     with torch.no_grad():
         logits = model(**inputs).logits
+    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
+    # expand back
     ai_scores = []
     for idx, prob in enumerate(chunk_probs):
         start = idx * 3
             ai_scores.append(prob)
     # -----------------------------
+    # RECONSTRUCTION WITH HIGHLIGHT
     # -----------------------------
     highlighted = ""
+    sentence_index = 0
     for block in blocks:
         if block.startswith("\n"):
             highlighted += block
             continue
         if block.isspace():
             highlighted += block
             continue
+        # safety
+        if sentence_index >= len(ai_scores):
+            ai_p = ai_scores[-1]
+        else:
+            ai_p = ai_scores[sentence_index]
+        sentence_index += 1
         pct = f"{ai_p * 100:.1f}%"
         if ai_p < 0.30:
+            color = "#11823b"
         elif ai_p < 0.70:
+            color = "#b8860b"
         else:
+            color = "#b80d0d"
         highlighted += (
+            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
+            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
             f"{block.strip()}</span> "
         )
     # -----------------------------
+    # OVERALL SCORE
     # -----------------------------
     overall = sum(ai_scores) / len(ai_scores)
     overall_pct = f"{overall * 100:.1f}%"
     overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
     df = pd.DataFrame(
         [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
         columns=["#", "Sentence", "AI_Prob"]
     return overall_label, overall_pct, highlighted, df
 # -----------------------------
+# UI
 # -----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
     text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
     btn = gr.Button("Analyze")