Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Files Community

VictorM-Coder committed 27 days ago

Commit

72d2f9a

verified ·

1 Parent(s): 0d83dcd

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -58

app.py CHANGED Viewed

@@ -13,45 +13,70 @@ MODEL_NAME = "openai-community/roberta-base-openai-detector"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, torch_dtype=dtype).to(device).eval()
 # -----------------------------
-# SENTENCE SPLITTER (robust, no externals)
 # -----------------------------
-_ABBR = r"(?:e\.g|i\.e|mr|mrs|ms|dr|prof|vs|etc|fig|al|jr|sr|st|no|vol|pp|mt|inc|ltd|co|u\.s|u\.k|a\.m|p\.m)\."
-_QUOTE = r"[\"“”‘’']?"
-# Split on ., ?, ! when followed by space/newline + a capital/quote or end of text,
-# while avoiding common abbreviations and decimals.
-_SENT_PAT = re.compile(
-    rf"""
-    (?<!\b{_ABBR})           # not common abbreviation
-    (?<!\d)\.|\?|!           # ., ?, !
-    (?=\s+{_QUOTE}[A-Z(]|$)  # lookahead for next sentence start or end
-    """,
-    re.VERBOSE
-)
 def sentence_split(text: str):
-    # Normalize hard breaks to spaces (Turnitin-like continuous flow)
-    t = re.sub(r"\s*\n+\s*", " ", text.strip())
     if not t:
         return []
-    # Temporarily protect ellipses to avoid over-splitting
-    t = t.replace("...", "…")
-    pieces = []
-    start = 0
-    for m in _SENT_PAT.finditer(t):
-        end = m.end()
-        chunk = t[start:end].strip()
-        if chunk:
-            pieces.append(chunk)
-        start = end
-    # tail
-    tail = t[start:].strip()
-    if tail:
-        pieces.append(tail)
-    # Restore ellipses
-    return [s.replace("…", "...") for s in pieces]
 # -----------------------------
 # UTILITIES
@@ -61,7 +86,6 @@ def batched(iterable, n=64):
         yield iterable[i:i+n], i
 def contig_spans(labels):
-    """Return (num_spans, longest_span_len) for consecutive 'AI' labels."""
     longest = 0
     count = 0
     run = 0
@@ -78,10 +102,6 @@ def contig_spans(labels):
     return count, longest
 def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
-    """
-    Turnitin-ish qualitative summary.
-    - Emphasize consecutive AI-like sentences (spans) and overall prevalence.
-    """
     if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
         return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
     if flag_pct >= 60 and longest_span >= 4:
@@ -99,9 +119,7 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
         return [], [], 0.0, 0.0, (0, 0)
     all_probs = []
-    all_labels = []
-    for chunk, base in batched(sents, n=batch_size):
         inputs = tokenizer(
             chunk,
             return_tensors="pt",
@@ -111,20 +129,17 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
         ).to(device)
         with torch.no_grad():
             logits = model(**inputs).logits
-            probs = F.softmax(logits, dim=-1)  # [:, 0]=Human, [:, 1]=AI
-        ai_probs = probs[:, 1].detach().cpu().tolist()
-        all_probs.extend(ai_probs)
-    for p in all_probs:
-        all_labels.append("AI" if p >= ai_threshold else "Human")
     avg_ai_prob = float(sum(all_probs) / len(all_probs))
-    flagged_pct = 100.0 * sum(1 for l in all_labels if l == "AI") / len(all_labels)
-    spans = contig_spans(all_labels)
     rows = []
-    for i, (s, p, lab) in enumerate(zip(sents, all_probs, all_labels), start=1):
         rows.append({
             "Sentence #": i,
             "Sentence": s,
@@ -135,15 +150,12 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
     return sents, rows, avg_ai_prob, flagged_pct, spans
 # -----------------------------
-# HTML HIGHLIGHT (Turnitin-ish)
 # -----------------------------
 def color_for_prob(p):
-    # 0-0.3 green, 0.3-0.7 yellow, 0.7-1.0 red
-    if p < 0.30:
-        return "#11823b"
-    if p < 0.70:
-        return "#b8860b"
-    return "#b80d0d"
 def build_highlight_html(rows):
     blocks = []
@@ -177,8 +189,7 @@ def generate_report(text, threshold):
         f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
         f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
         f"- Verdict: {verdict}\n"
-        f"\nⓘ This is an approximation using an open detector; "
-        f"actual Turnitin results may differ."
     )
     html = build_highlight_html(rows)

 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # tokenizer loaded from the same MODEL_NAME checkpoint as the model below
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer the GPU when CUDA is available, else CPU
 dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32  # bfloat16 only on CUDA devices that report bf16 support; float32 otherwise
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()  # load the classifier with that dtype, move it to `device`, and switch it to eval mode
 # -----------------------------
+# SENTENCE SPLITTER (no lookbehinds)
+# Protect → split → restore
 # -----------------------------
+ABBR = [  # abbreviations, stored without their final dot; that dot must not be read as a sentence end
+    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
+    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
+    "a.m", "p.m"
+]
+ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)  # group 1 = an abbreviation starting at a word boundary, followed by its final dot; case-insensitive. NOTE(review): short entries such as "no", "co" or "al" also match ordinary words that happen to end a sentence.
+def _protect(text: str) -> str:  # Return `text` stripped, with dots that do not end a sentence swapped for placeholder tokens; "" for blank input.
+    t = text.strip()
+    if not t:
+        return ""
+    # Collapse each run of newlines (plus any surrounding whitespace) into one space (Turnitin-like continuous flow)
+    t = re.sub(r"\s*\n+\s*", " ", t)
+    # Protect ellipses written as three ASCII dots; the single-character "…" is left untouched
+    t = t.replace("...", "⟨ELLIPSIS⟩")
+    # Protect decimal points, i.e. a dot with a digit on both sides, as in 3.14
+    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
+    # Protect the final dot of known abbreviations: keep the abbreviation text (group 1), replace only the dot
+    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
+    return t
+def _restore(text: str) -> str:  # Replace every placeholder token in `text` with the literal characters it stands for.
+    return (text
+            .replace("⟨ABBRDOT⟩", ".")  # final dot of an abbreviation
+            .replace("⟨DECIMAL⟩", ".")  # decimal point between two digits
+            .replace("⟨ELLIPSIS⟩", "..."))  # three-dot ellipsis
 def sentence_split(text: str):  # Split `text` into a list of sentence strings; returns [] for blank input.
+    t = _protect(text)  # non-terminal dots are now placeholders, so only real candidates remain as "."
     if not t:
         return []
+    # Split at ., ? or ! when it is followed by whitespace and then either an optional quote
+    # plus an ASCII capital letter or "(", or the end of the string. The whitespace is dropped.
+    parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
+    # Rebuild sentences: the capture group makes re.split return text at even indexes and the punctuation mark at odd indexes
+    sentences = []
+    buf = ""
+    for i, chunk in enumerate(parts):
+        if i % 2 == 0:
+            buf += chunk  # sentence text
+        else:
+            # chunk is the delimiter [.?!]: append it to the text and close the sentence
+            buf += chunk
+            sentences.append(buf.strip())
+            buf = ""
+    if buf.strip():  # text after the last split point (it keeps whatever punctuation it already had)
+        sentences.append(buf.strip())
+    # Turn placeholders back into dots and drop blank entries
+    sentences = [_restore(s).strip() for s in sentences if s.strip()]
+    return sentences
 # -----------------------------
 # UTILITIES
         yield iterable[i:i+n], i
 def contig_spans(labels):
     longest = 0
     count = 0
     run = 0
     return count, longest
 def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
     if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
         return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
     if flag_pct >= 60 and longest_span >= 4:
         return [], [], 0.0, 0.0, (0, 0)
     all_probs = []
+    for chunk, _ in batched(sents, n=batch_size):
         inputs = tokenizer(
             chunk,
             return_tensors="pt",
         ).to(device)
         with torch.no_grad():
             logits = model(**inputs).logits
+            probs = F.softmax(logits, dim=-1)  # [:,0]=Human, [:,1]=AI
+        all_probs.extend(probs[:, 1].detach().cpu().tolist())
+    labels = ["AI" if p >= ai_threshold else "Human" for p in all_probs]
     avg_ai_prob = float(sum(all_probs) / len(all_probs))
+    flagged_pct = 100.0 * sum(1 for l in labels if l == "AI") / len(labels)
+    spans = contig_spans(labels)
     rows = []
+    for i, (s, p, lab) in enumerate(zip(sents, all_probs, labels), start=1):
         rows.append({
             "Sentence #": i,
             "Sentence": s,
     return sents, rows, avg_ai_prob, flagged_pct, spans
 # -----------------------------
+# HTML HIGHLIGHT
 # -----------------------------
 def color_for_prob(p):  # Map a score `p` to a hex color string; presumably `p` is a sentence's AI probability in [0, 1] (confirm against caller).
+    if p < 0.30: return "#11823b"  # green: p below 0.30
+    if p < 0.70: return "#b8860b"  # amber: p from 0.30 up to (not including) 0.70
+    return "#b80d0d"               # red: p of 0.70 or more
 def build_highlight_html(rows):
     blocks = []
         f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
         f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
         f"- Verdict: {verdict}\n"
+        f"\nⓘ This is an approximation using an open detector; actual Turnitin results may differ."
     )
     html = build_highlight_html(rows)