VictorM-Coder committed on
Commit
ea83121
·
verified ·
1 Parent(s): 6d8431a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -25
app.py CHANGED
@@ -15,12 +15,12 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  # -----------------------------
18
- # INTERNAL THRESHOLD for sentence labels/colors
19
  # -----------------------------
20
- THRESHOLD = 0.70 # used only for per-sentence "AI/Human" tags & color
21
 
22
  # -----------------------------
23
- # SENTENCE SPLITTER (protect → split → restore; no lookbehinds)
24
  # -----------------------------
25
  ABBR = [
26
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
@@ -33,10 +33,10 @@ def _protect(text: str) -> str:
33
  t = text.strip()
34
  if not t:
35
  return ""
36
- t = re.sub(r"\s*\n+\s*", " ", t) # normalize newlines
37
- t = t.replace("...", "⟨ELLIPSIS⟩") # ellipses
38
- t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t) # decimals like 3.14
39
- t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t) # abbreviations' dot
40
  return t
41
 
42
  def _restore(text: str) -> str:
@@ -49,7 +49,6 @@ def sentence_split(text: str):
49
  t = _protect(text)
50
  if not t:
51
  return []
52
- # split on [.?!] followed by whitespace and likely sentence start or end
53
  parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
54
  sentences, buf = [], ""
55
  for i, chunk in enumerate(parts):
@@ -57,63 +56,95 @@ def sentence_split(text: str):
57
  buf += chunk
58
  else:
59
  buf += chunk
60
- sentences.append(buf.strip()); buf = ""
 
61
  if buf.strip():
62
  sentences.append(buf.strip())
63
  return [_restore(s).strip() for s in sentences if s.strip()]
64
 
65
  # -----------------------------
66
- # CORE: overall AI score + highlights
 
 
 
 
 
 
 
 
 
 
67
  # -----------------------------
68
  def analyze(text, max_len=512):
69
  sents = sentence_split(text)
70
  if not sents:
71
  return "—", "—", "<em>Paste some text to analyze.</em>", None
72
 
73
- # light clean (per model card vibe)
74
- clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]
 
75
 
 
76
  inputs = tokenizer(
77
- clean_sents, return_tensors="pt", padding=True, truncation=True, max_length=max_len
 
78
  ).to(device)
79
 
 
80
  with torch.no_grad():
81
  logits = model(**inputs).logits
82
- probs = F.softmax(logits, dim=-1) # [:,0]=Human, [:,1]=AI
83
 
84
- ai_probs = probs[:, 1].detach().cpu().tolist()
 
 
 
 
 
 
 
 
85
  overall_ai = sum(ai_probs) / len(ai_probs)
86
  overall_pct = f"{overall_ai * 100:.1f}%"
87
- overall_label = "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
 
 
88
 
89
- # Per-sentence highlights (use THRESHOLD only for the tag/color)
90
  rows, highlights = [], []
91
  for i, orig in enumerate(sents, start=1):
92
  ai_p = float(ai_probs[i-1])
93
  label = "AI" if ai_p >= THRESHOLD else "Human"
94
  pct = f"{ai_p*100:.1f}%"
 
 
95
  if ai_p < 0.30:
96
- color = "#11823b" # green
97
  elif ai_p < 0.70:
98
- color = "#b8860b" # amber
99
  else:
100
- color = "#b80d0d" # red
 
101
  normalized = re.sub(r"\s+", " ", orig)
102
  highlights.append(
103
- "<div style='margin:6px 0; padding:6px 8px; border-radius:6px; background:rgba(0,0,0,0.03)'>"
104
- f"<strong style='color:{color}'>[{pct} {label}]</strong> {normalized}</div>"
 
 
105
  )
 
106
  rows.append([i, orig, round(ai_p, 4), label])
107
 
108
- html = "\n".join(highlights)
109
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
 
 
110
  return overall_label, overall_pct, html, df
111
 
112
  # -----------------------------
113
- # GRADIO UI (verdict + score, plus highlights)
114
  # -----------------------------
115
  with gr.Blocks() as demo:
116
- gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model")
117
 
118
  text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
119
  btn = gr.Button("Analyze")
 
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  # -----------------------------
18
+ # THRESHOLD FOR LABEL COLOR
19
  # -----------------------------
20
+ THRESHOLD = 0.70
21
 
22
  # -----------------------------
23
+ # SENTENCE SPLITTING UTILITIES
24
  # -----------------------------
25
  ABBR = [
26
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
 
33
  t = text.strip()
34
  if not t:
35
  return ""
36
+ t = re.sub(r"\s*\n+\s*", " ", t)
37
+ t = t.replace("...", "⟨ELLIPSIS⟩")
38
+ t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
39
+ t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
40
  return t
41
 
42
  def _restore(text: str) -> str:
 
49
  t = _protect(text)
50
  if not t:
51
  return []
 
52
  parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
53
  sentences, buf = [], ""
54
  for i, chunk in enumerate(parts):
 
56
  buf += chunk
57
  else:
58
  buf += chunk
59
+ sentences.append(buf.strip())
60
+ buf = ""
61
  if buf.strip():
62
  sentences.append(buf.strip())
63
  return [_restore(s).strip() for s in sentences if s.strip()]
64
 
65
  # -----------------------------
66
+ # GROUP SENTENCES (TURNITIN STYLE)
67
+ # -----------------------------
68
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated chunks of up to ``size``.

    Args:
        sents: Sequence of sentence strings, in document order.
        size: Maximum number of sentences per chunk; must be at least 1.

    Returns:
        A list of strings, one per chunk, in the original order. The final
        chunk holds the remainder and may contain fewer than ``size``
        sentences. An empty ``sents`` yields an empty list.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every
    # sentence; zero would raise an unrelated-looking error from range().
    if size < 1:
        raise ValueError(f"size must be >= 1, got {size!r}")
    return [
        " ".join(sents[start:start + size])
        for start in range(0, len(sents), size)
    ]
74
+
75
+ # -----------------------------
76
+ # CORE ANALYSIS
77
  # -----------------------------
78
  def analyze(text, max_len=512):
79
  sents = sentence_split(text)
80
  if not sents:
81
  return "—", "—", "<em>Paste some text to analyze.</em>", None
82
 
83
+ # GROUP sentences into 3-sentence chunks
84
+ grouped = group_sentences(sents, size=3)
85
+ clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
86
 
87
+ # tokenize grouped
88
  inputs = tokenizer(
89
+ clean_grouped, return_tensors="pt", padding=True,
90
+ truncation=True, max_length=max_len
91
  ).to(device)
92
 
93
+ # model inference
94
  with torch.no_grad():
95
  logits = model(**inputs).logits
96
+ chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
97
 
98
+ # EXPAND chunk-level probabilities back to per-sentence
99
+ ai_probs = []
100
+ for idx, prob in enumerate(chunk_probs):
101
+ start = idx * 3
102
+ end = min(start + 3, len(sents))
103
+ for _ in range(start, end):
104
+ ai_probs.append(prob)
105
+
106
+ # overall AI score
107
  overall_ai = sum(ai_probs) / len(ai_probs)
108
  overall_pct = f"{overall_ai * 100:.1f}%"
109
+ overall_label = (
110
+ "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
111
+ )
112
 
113
+ # HIGHLIGHTS + TABLE
114
  rows, highlights = [], []
115
  for i, orig in enumerate(sents, start=1):
116
  ai_p = float(ai_probs[i-1])
117
  label = "AI" if ai_p >= THRESHOLD else "Human"
118
  pct = f"{ai_p*100:.1f}%"
119
+
120
+ # color logic
121
  if ai_p < 0.30:
122
+ color = "#11823b"
123
  elif ai_p < 0.70:
124
+ color = "#b8860b"
125
  else:
126
+ color = "#b80d0d"
127
+
128
  normalized = re.sub(r"\s+", " ", orig)
129
  highlights.append(
130
+ "<div style='margin:6px 0; padding:6px 8px; border-radius:6px; "
131
+ "background:rgba(0,0,0,0.03)'>"
132
+ f"<strong style='color:{color}'>[{pct} {label}]</strong> "
133
+ f"{normalized}</div>"
134
  )
135
+
136
  rows.append([i, orig, round(ai_p, 4), label])
137
 
 
138
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
139
+ html = "\n".join(highlights)
140
+
141
  return overall_label, overall_pct, html, df
142
 
143
  # -----------------------------
144
+ # GRADIO UI
145
  # -----------------------------
146
  with gr.Blocks() as demo:
147
+ gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (Turnitin-Style)")
148
 
149
  text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
150
  btn = gr.Button("Analyze")