VictorM-Coder committed on
Commit
72d2f9a
·
verified ·
1 Parent(s): 0d83dcd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -58
app.py CHANGED
@@ -13,45 +13,70 @@ MODEL_NAME = "openai-community/roberta-base-openai-detector"
13
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
16
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, torch_dtype=dtype).to(device).eval()
17
 
18
  # -----------------------------
19
- # SENTENCE SPLITTER (robust, no externals)
 
20
  # -----------------------------
21
- _ABBR = r"(?:e\.g|i\.e|mr|mrs|ms|dr|prof|vs|etc|fig|al|jr|sr|st|no|vol|pp|mt|inc|ltd|co|u\.s|u\.k|a\.m|p\.m)\."
22
- _QUOTE = r"[\"“”‘’']?"
23
- # Split on ., ?, ! when followed by space/newline + a capital/quote or end of text,
24
- # while avoiding common abbreviations and decimals.
25
- _SENT_PAT = re.compile(
26
- rf"""
27
- (?<!\b{_ABBR}) # not common abbreviation
28
- (?<!\d)\.|\?|! # ., ?, !
29
- (?=\s+{_QUOTE}[A-Z(]|$) # lookahead for next sentence start or end
30
- """,
31
- re.VERBOSE
32
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def sentence_split(text: str):
35
- # Normalize hard breaks to spaces (Turnitin-like continuous flow)
36
- t = re.sub(r"\s*\n+\s*", " ", text.strip())
37
  if not t:
38
  return []
39
- # Temporarily protect ellipses to avoid over-splitting
40
- t = t.replace("...", "…")
41
- pieces = []
42
- start = 0
43
- for m in _SENT_PAT.finditer(t):
44
- end = m.end()
45
- chunk = t[start:end].strip()
46
- if chunk:
47
- pieces.append(chunk)
48
- start = end
49
- # tail
50
- tail = t[start:].strip()
51
- if tail:
52
- pieces.append(tail)
53
- # Restore ellipses
54
- return [s.replace("…", "...") for s in pieces]
 
 
 
 
 
 
55
 
56
  # -----------------------------
57
  # UTILITIES
@@ -61,7 +86,6 @@ def batched(iterable, n=64):
61
  yield iterable[i:i+n], i
62
 
63
  def contig_spans(labels):
64
- """Return (num_spans, longest_span_len) for consecutive 'AI' labels."""
65
  longest = 0
66
  count = 0
67
  run = 0
@@ -78,10 +102,6 @@ def contig_spans(labels):
78
  return count, longest
79
 
80
  def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
81
- """
82
- Turnitin-ish qualitative summary.
83
- - Emphasize consecutive AI-like sentences (spans) and overall prevalence.
84
- """
85
  if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
86
  return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
87
  if flag_pct >= 60 and longest_span >= 4:
@@ -99,9 +119,7 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
99
  return [], [], 0.0, 0.0, (0, 0)
100
 
101
  all_probs = []
102
- all_labels = []
103
-
104
- for chunk, base in batched(sents, n=batch_size):
105
  inputs = tokenizer(
106
  chunk,
107
  return_tensors="pt",
@@ -111,20 +129,17 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
111
  ).to(device)
112
  with torch.no_grad():
113
  logits = model(**inputs).logits
114
- probs = F.softmax(logits, dim=-1) # [:, 0]=Human, [:, 1]=AI
115
-
116
- ai_probs = probs[:, 1].detach().cpu().tolist()
117
- all_probs.extend(ai_probs)
118
 
119
- for p in all_probs:
120
- all_labels.append("AI" if p >= ai_threshold else "Human")
121
 
122
  avg_ai_prob = float(sum(all_probs) / len(all_probs))
123
- flagged_pct = 100.0 * sum(1 for l in all_labels if l == "AI") / len(all_labels)
124
- spans = contig_spans(all_labels)
125
 
126
  rows = []
127
- for i, (s, p, lab) in enumerate(zip(sents, all_probs, all_labels), start=1):
128
  rows.append({
129
  "Sentence #": i,
130
  "Sentence": s,
@@ -135,15 +150,12 @@ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
135
  return sents, rows, avg_ai_prob, flagged_pct, spans
136
 
137
  # -----------------------------
138
- # HTML HIGHLIGHT (Turnitin-ish)
139
  # -----------------------------
140
  def color_for_prob(p):
141
- # 0-0.3 green, 0.3-0.7 yellow, 0.7-1.0 red
142
- if p < 0.30:
143
- return "#11823b"
144
- if p < 0.70:
145
- return "#b8860b"
146
- return "#b80d0d"
147
 
148
  def build_highlight_html(rows):
149
  blocks = []
@@ -177,8 +189,7 @@ def generate_report(text, threshold):
177
  f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
178
  f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
179
  f"- Verdict: {verdict}\n"
180
- f"\nⓘ This is an approximation using an open detector; "
181
- f"actual Turnitin results may differ."
182
  )
183
 
184
  html = build_highlight_html(rows)
 
13
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
16
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
17
 
18
  # -----------------------------
19
+ # SENTENCE SPLITTER (no variable-width lookbehinds)
20
+ # Protect → split → restore
21
  # -----------------------------
22
+ ABBR = [
23
+ "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
24
+ "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
25
+ "a.m", "p.m"
26
+ ]
27
+ ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
28
+
29
+ def _protect(text: str) -> str:
30
+ t = text.strip()
31
+ if not t:
32
+ return ""
33
+
34
+ # Normalize newlines to spaces (Turnitin-like continuous flow)
35
+ t = re.sub(r"\s*\n+\s*", " ", t)
36
+
37
+ # Protect ellipses
38
+ t = t.replace("...", "⟨ELLIPSIS⟩")
39
+
40
+ # Protect decimals like 3.14
41
+ t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
42
+
43
+ # Protect known abbreviations' final dot
44
+ t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
45
+
46
+ return t
47
+
48
+ def _restore(text: str) -> str:
49
+ return (text
50
+ .replace("⟨ABBRDOT⟩", ".")
51
+ .replace("⟨DECIMAL⟩", ".")
52
+ .replace("⟨ELLIPSIS⟩", "..."))
53
 
54
  def sentence_split(text: str):
55
+ t = _protect(text)
 
56
  if not t:
57
  return []
58
+
59
+ # Split on ., ?, ! followed by whitespace and then a plausible sentence starter
60
+ # (an optional quote, then a capital letter or opening paren) OR end of string.
61
+ parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
62
+
63
+ # Rebuild sentences: regex split keeps the delimiter in alternating groups
64
+ sentences = []
65
+ buf = ""
66
+ for i, chunk in enumerate(parts):
67
+ if i % 2 == 0:
68
+ buf += chunk
69
+ else:
70
+ # chunk is the delimiter [.?!]
71
+ buf += chunk
72
+ sentences.append(buf.strip())
73
+ buf = ""
74
+ if buf.strip():
75
+ sentences.append(buf.strip())
76
+
77
+ # Clean/restore
78
+ sentences = [_restore(s).strip() for s in sentences if s.strip()]
79
+ return sentences
80
 
81
  # -----------------------------
82
  # UTILITIES
 
86
  yield iterable[i:i+n], i
87
 
88
  def contig_spans(labels):
 
89
  longest = 0
90
  count = 0
91
  run = 0
 
102
  return count, longest
103
 
104
  def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
 
 
 
 
105
  if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
106
  return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
107
  if flag_pct >= 60 and longest_span >= 4:
 
119
  return [], [], 0.0, 0.0, (0, 0)
120
 
121
  all_probs = []
122
+ for chunk, _ in batched(sents, n=batch_size):
 
 
123
  inputs = tokenizer(
124
  chunk,
125
  return_tensors="pt",
 
129
  ).to(device)
130
  with torch.no_grad():
131
  logits = model(**inputs).logits
132
+ probs = F.softmax(logits, dim=-1) # [:,0]=Human, [:,1]=AI
133
+ all_probs.extend(probs[:, 1].detach().cpu().tolist())
 
 
134
 
135
+ labels = ["AI" if p >= ai_threshold else "Human" for p in all_probs]
 
136
 
137
  avg_ai_prob = float(sum(all_probs) / len(all_probs))
138
+ flagged_pct = 100.0 * sum(1 for l in labels if l == "AI") / len(labels)
139
+ spans = contig_spans(labels)
140
 
141
  rows = []
142
+ for i, (s, p, lab) in enumerate(zip(sents, all_probs, labels), start=1):
143
  rows.append({
144
  "Sentence #": i,
145
  "Sentence": s,
 
150
  return sents, rows, avg_ai_prob, flagged_pct, spans
151
 
152
  # -----------------------------
153
+ # HTML HIGHLIGHT
154
  # -----------------------------
155
  def color_for_prob(p):
156
+ if p < 0.30: return "#11823b" # green
157
+ if p < 0.70: return "#b8860b" # amber
158
+ return "#b80d0d" # red
 
 
 
159
 
160
  def build_highlight_html(rows):
161
  blocks = []
 
189
  f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
190
  f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
191
  f"- Verdict: {verdict}\n"
192
+ f"\nⓘ This is an approximation using an open detector; actual Turnitin results may differ."
 
193
  )
194
 
195
  html = build_highlight_html(rows)