VictorM-Coder committed on
Commit
fdd45e5
·
verified ·
1 Parent(s): 8d27116

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -32
app.py CHANGED
@@ -11,7 +11,7 @@ import gradio as gr
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
- dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  THRESHOLD = 0.80
@@ -20,18 +20,21 @@ THRESHOLD = 0.80
20
  # ABBREVIATION PROTECTION
21
  # -----------------------------
22
  ABBR = [
23
- "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
24
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
25
- "u.s", "u.k", "a.m", "p.m"
26
  ]
 
27
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
28
 
 
29
  def _protect(text):
30
  text = text.replace("...", "⟨ELLIPSIS⟩")
31
  text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
32
  text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
33
  return text
34
 
 
35
  def _restore(text):
36
  return (
37
  text.replace("⟨ABBRDOT⟩", ".")
@@ -39,6 +42,7 @@ def _restore(text):
39
  .replace("⟨ELLIPSIS⟩", "...")
40
  )
41
 
 
42
  # -----------------------------
43
  # PERFECT PARAGRAPH-PRESERVING SPLITTER
44
  # -----------------------------
@@ -55,30 +59,32 @@ def split_preserving_structure(text):
55
 
56
  for i in range(0, len(parts), 3):
57
  sentence = parts[i]
58
- punct = parts[i+1] if i+1 < len(parts) else ""
59
- space = parts[i+2] if i+2 < len(parts) else ""
60
 
61
  whole = sentence + punct
62
  if whole.strip():
63
  final_blocks.append(_restore(whole))
64
-
65
  if space:
66
  final_blocks.append(space)
67
 
68
  return final_blocks
69
 
 
70
  def extract_sentences_only(blocks):
71
  return [
72
  b for b in blocks
73
  if b.strip() != "" and not b.startswith("\n") and not b.isspace()
74
  ]
75
 
 
76
  # -----------------------------
77
  # GROUPING
78
  # -----------------------------
79
  def group_sentences(sents, size=3):
80
  return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
81
 
 
82
  # -----------------------------
83
  # ANALYSIS LOGIC
84
  # -----------------------------
@@ -90,19 +96,21 @@ def analyze(text, max_len=512):
90
  if not pure_sentences:
91
  return "—", "—", "<em>Paste text to analyze.</em>", None
92
 
 
93
  grouped = group_sentences(pure_sentences, 3)
94
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
95
 
96
- # Run model
97
  inputs = tokenizer(clean_grouped, return_tensors="pt",
98
  padding=True, truncation=True,
99
  max_length=max_len).to(device)
100
 
101
  with torch.no_grad():
102
  logits = model(**inputs).logits
103
- chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
104
 
105
- # Expand grouped probs to each sentence
 
 
106
  ai_scores = []
107
  for idx, prob in enumerate(chunk_probs):
108
  start = idx * 3
@@ -111,54 +119,49 @@ def analyze(text, max_len=512):
111
  ai_scores.append(prob)
112
 
113
  # -----------------------------
114
- # COLOR HIGHLIGHTING (FULL SENTENCE BLOCK COLORING)
115
  # -----------------------------
116
  highlighted = ""
117
- current_sentence = 0
118
 
119
  for block in blocks:
120
-
121
- # newline blocks
122
  if block.startswith("\n"):
123
  highlighted += block
124
  continue
125
 
126
- # whitespace blocks
127
  if block.isspace():
128
  highlighted += block
129
  continue
130
 
131
- # real sentence
132
- ai_p = ai_scores[current_sentence]
133
- current_sentence += 1
 
 
 
 
134
  pct = f"{ai_p * 100:.1f}%"
135
 
136
- # COLOR LEVELS (background + text)
137
  if ai_p < 0.30:
138
- bg = "rgba(17,130,59,0.18)" # green
139
- color = "#0f5e2e"
140
  elif ai_p < 0.70:
141
- bg = "rgba(184,134,11,0.23)" # yellow
142
- color = "#7a5f00"
143
  else:
144
- bg = "rgba(184,13,13,0.20)" # red
145
- color = "#7a0000"
146
 
147
  highlighted += (
148
- f"<span style='background:{bg}; padding:5px 8px; "
149
- f"border-radius:6px; display:inline-block; margin-bottom:4px;'>"
150
- f"<strong style='color:{color}'>[{pct}]</strong> "
151
  f"{block.strip()}</span> "
152
  )
153
 
154
  # -----------------------------
155
- # OVERALL
156
  # -----------------------------
157
  overall = sum(ai_scores) / len(ai_scores)
158
  overall_pct = f"{overall * 100:.1f}%"
159
  overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
160
 
161
- # Table
162
  df = pd.DataFrame(
163
  [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
164
  columns=["#", "Sentence", "AI_Prob"]
@@ -166,11 +169,12 @@ def analyze(text, max_len=512):
166
 
167
  return overall_label, overall_pct, highlighted, df
168
 
 
169
  # -----------------------------
170
- # GRADIO UI
171
  # -----------------------------
172
  with gr.Blocks() as demo:
173
- gr.Markdown("### 🕵️ AI Sentence-Level Detector — Color Highlight Mode")
174
 
175
  text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
176
  btn = gr.Button("Analyze")
 
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+ dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  THRESHOLD = 0.80
 
20
  # ABBREVIATION PROTECTION
21
  # -----------------------------
22
  ABBR = [
23
+ "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc",
24
+ "fig", "al", "jr", "sr", "st", "no", "vol", "pp", "mt",
25
+ "inc", "ltd", "co", "u.s", "u.k", "a.m", "p.m"
26
  ]
27
+
28
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
29
 
30
+
31
  def _protect(text):
32
  text = text.replace("...", "⟨ELLIPSIS⟩")
33
  text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
34
  text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
35
  return text
36
 
37
+
38
  def _restore(text):
39
  return (
40
  text.replace("⟨ABBRDOT⟩", ".")
 
42
  .replace("⟨ELLIPSIS⟩", "...")
43
  )
44
 
45
+
46
  # -----------------------------
47
  # PERFECT PARAGRAPH-PRESERVING SPLITTER
48
  # -----------------------------
 
59
 
60
  for i in range(0, len(parts), 3):
61
  sentence = parts[i]
62
+ punct = parts[i + 1] if i + 1 < len(parts) else ""
63
+ space = parts[i + 2] if i + 2 < len(parts) else ""
64
 
65
  whole = sentence + punct
66
  if whole.strip():
67
  final_blocks.append(_restore(whole))
 
68
  if space:
69
  final_blocks.append(space)
70
 
71
  return final_blocks
72
 
73
+
74
  def extract_sentences_only(blocks):
75
  return [
76
  b for b in blocks
77
  if b.strip() != "" and not b.startswith("\n") and not b.isspace()
78
  ]
79
 
80
+
81
  # -----------------------------
82
  # GROUPING
83
  # -----------------------------
84
  def group_sentences(sents, size=3):
85
  return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
86
 
87
+
88
  # -----------------------------
89
  # ANALYSIS LOGIC
90
  # -----------------------------
 
96
  if not pure_sentences:
97
  return "—", "—", "<em>Paste text to analyze.</em>", None
98
 
99
+ # Group into 3-sentence windows
100
  grouped = group_sentences(pure_sentences, 3)
101
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
102
 
103
+ # Model forward pass
104
  inputs = tokenizer(clean_grouped, return_tensors="pt",
105
  padding=True, truncation=True,
106
  max_length=max_len).to(device)
107
 
108
  with torch.no_grad():
109
  logits = model(**inputs).logits
 
110
 
111
+ chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
112
+
113
+ # expand back
114
  ai_scores = []
115
  for idx, prob in enumerate(chunk_probs):
116
  start = idx * 3
 
119
  ai_scores.append(prob)
120
 
121
  # -----------------------------
122
+ # RECONSTRUCTION WITH HIGHLIGHT
123
  # -----------------------------
124
  highlighted = ""
125
+ sentence_index = 0
126
 
127
  for block in blocks:
 
 
128
  if block.startswith("\n"):
129
  highlighted += block
130
  continue
131
 
 
132
  if block.isspace():
133
  highlighted += block
134
  continue
135
 
136
+ # safety
137
+ if sentence_index >= len(ai_scores):
138
+ ai_p = ai_scores[-1]
139
+ else:
140
+ ai_p = ai_scores[sentence_index]
141
+ sentence_index += 1
142
+
143
  pct = f"{ai_p * 100:.1f}%"
144
 
 
145
  if ai_p < 0.30:
146
+ color = "#11823b"
 
147
  elif ai_p < 0.70:
148
+ color = "#b8860b"
 
149
  else:
150
+ color = "#b80d0d"
 
151
 
152
  highlighted += (
153
+ f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
154
+ f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
 
155
  f"{block.strip()}</span> "
156
  )
157
 
158
  # -----------------------------
159
+ # OVERALL SCORE
160
  # -----------------------------
161
  overall = sum(ai_scores) / len(ai_scores)
162
  overall_pct = f"{overall * 100:.1f}%"
163
  overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
164
 
 
165
  df = pd.DataFrame(
166
  [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
167
  columns=["#", "Sentence", "AI_Prob"]
 
169
 
170
  return overall_label, overall_pct, highlighted, df
171
 
172
+
173
  # -----------------------------
174
+ # UI
175
  # -----------------------------
176
  with gr.Blocks() as demo:
177
+ gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
178
 
179
  text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
180
  btn = gr.Button("Analyze")