VictorM-Coder committed on
Commit
814a384
·
verified ·
1 Parent(s): 7f4b27e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -37
app.py CHANGED
@@ -10,31 +10,24 @@ import gradio as gr
10
  # -----------------------------
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
-
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
- dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
16
-
17
- model = AutoModelForSequenceClassification.from_pretrained(
18
- MODEL_NAME, dtype=dtype
19
- ).to(device).eval()
20
 
21
  # -----------------------------
22
  # AI DECISION THRESHOLD (80%)
23
  # -----------------------------
24
- THRESHOLD = 0.80
25
 
26
  # -----------------------------
27
  # SENTENCE SPLITTING UTILITIES
28
  # -----------------------------
29
  ABBR = [
30
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
31
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
32
  "u.s", "u.k", "a.m", "p.m"
33
  ]
34
- ABBR_REGEX = re.compile(
35
- r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
36
- flags=re.IGNORECASE
37
- )
38
 
39
  def _protect(text: str) -> str:
40
  t = text.strip()
@@ -47,19 +40,16 @@ def _protect(text: str) -> str:
47
  return t
48
 
49
  def _restore(text: str) -> str:
50
- return (
51
- text.replace("⟨ABBRDOT⟩", ".")
52
  .replace("⟨DECIMAL⟩", ".")
53
- .replace("⟨ELLIPSIS⟩", "...")
54
- )
55
 
56
  def sentence_split(text: str):
57
  t = _protect(text)
58
  if not t:
59
  return []
60
-
61
- # hard sentence boundary detection
62
- parts = re.split(r"([.?!])\s+(?=[\"“”‘’']?\s*[A-Z(]|$)", t)
63
 
64
  sentences, buf = [], ""
65
  for i, chunk in enumerate(parts):
@@ -75,49 +65,63 @@ def sentence_split(text: str):
75
 
76
  return [_restore(s).strip() for s in sentences if s.strip()]
77
 
 
 
 
 
 
 
 
 
78
 
79
  # -----------------------------
80
- # CORE ANALYSIS PER SENTENCE
81
  # -----------------------------
82
  def analyze(text, max_len=512):
83
  sents = sentence_split(text)
84
  if not sents:
85
  return "—", "—", "<em>Paste some text to analyze.</em>", None
86
 
87
- clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]
 
 
88
 
89
- # tokenize list of sentences
90
  inputs = tokenizer(
91
- clean_sents,
92
- return_tensors="pt",
93
- padding=True,
94
- truncation=True,
95
- max_length=max_len
96
  ).to(device)
97
 
98
- # model inference (per sentence)
99
  with torch.no_grad():
100
  logits = model(**inputs).logits
101
- probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
 
 
 
 
 
 
 
 
102
 
103
  # overall AI score
104
- overall_ai = sum(probs) / len(probs)
105
  overall_pct = f"{overall_ai * 100:.1f}%"
106
 
107
  overall_label = (
108
  "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
109
  )
110
 
111
- # highlights + table
112
  rows, highlights = [], []
113
 
114
- for i, sentence in enumerate(sents, start=1):
115
- ai_p = float(probs[i - 1])
116
  pct = f"{ai_p * 100:.1f}%"
117
 
118
  label = "AI" if ai_p >= THRESHOLD else "Human"
119
 
120
- # colors
121
  if ai_p < 0.30:
122
  color = "#11823b"
123
  elif ai_p < 0.70:
@@ -125,7 +129,7 @@ def analyze(text, max_len=512):
125
  else:
126
  color = "#b80d0d"
127
 
128
- normalized = re.sub(r"\s+", " ", sentence)
129
 
130
  highlights.append(
131
  "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
@@ -134,14 +138,13 @@ def analyze(text, max_len=512):
134
  f"{normalized}</div>"
135
  )
136
 
137
- rows.append([i, sentence, round(ai_p, 4), label])
138
 
139
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
140
  html = "\n".join(highlights)
141
 
142
  return overall_label, overall_pct, html, df
143
 
144
-
145
  # -----------------------------
146
  # GRADIO UI
147
  # -----------------------------
 
10
  # -----------------------------
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 
 
 
16
 
17
  # -----------------------------
18
  # AI DECISION THRESHOLD (80%)
19
  # -----------------------------
20
+ THRESHOLD = 0.80 # AI from 80% and above
21
 
22
  # -----------------------------
23
  # SENTENCE SPLITTING UTILITIES
24
  # -----------------------------
25
  ABBR = [
26
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
27
+ "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
28
  "u.s", "u.k", "a.m", "p.m"
29
  ]
30
+ ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
 
 
 
31
 
32
  def _protect(text: str) -> str:
33
  t = text.strip()
 
40
  return t
41
 
42
  def _restore(text: str) -> str:
43
+ return (text
44
+ .replace("⟨ABBRDOT⟩", ".")
45
  .replace("⟨DECIMAL⟩", ".")
46
+ .replace("⟨ELLIPSIS⟩", "..."))
 
47
 
48
  def sentence_split(text: str):
49
  t = _protect(text)
50
  if not t:
51
  return []
52
+ parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
 
 
53
 
54
  sentences, buf = [], ""
55
  for i, chunk in enumerate(parts):
 
65
 
66
  return [_restore(s).strip() for s in sentences if s.strip()]
67
 
68
+ # -----------------------------
69
+ # GROUP SENTENCES (TURNITIN STYLE)
70
+ # -----------------------------
71
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of ``size``.

    The final window may hold fewer than ``size`` sentences; an empty
    input yields an empty list.
    """
    return [
        " ".join(sents[start:start + size])
        for start in range(0, len(sents), size)
    ]
76
 
77
  # -----------------------------
78
+ # CORE ANALYSIS (3 SENTENCE WINDOWS)
79
  # -----------------------------
80
  def analyze(text, max_len=512):
81
  sents = sentence_split(text)
82
  if not sents:
83
  return "—", "—", "<em>Paste some text to analyze.</em>", None
84
 
85
+ # GROUP sentences (3 at a time)
86
+ grouped = group_sentences(sents, size=3)
87
+ clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
88
 
89
+ # tokenize grouped chunks
90
  inputs = tokenizer(
91
+ clean_grouped, return_tensors="pt",
92
+ padding=True, truncation=True, max_length=max_len
 
 
 
93
  ).to(device)
94
 
95
+ # model inference
96
  with torch.no_grad():
97
  logits = model(**inputs).logits
98
+ chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
99
+
100
+ # EXPAND chunk-level probabilities to per-sentence (each chunk contributes to its 3 sentences)
101
+ ai_probs = []
102
+ for idx, prob in enumerate(chunk_probs):
103
+ start = idx * 3
104
+ end = min(start + 3, len(sents))
105
+ for _ in range(start, end):
106
+ ai_probs.append(prob)
107
 
108
  # overall AI score
109
+ overall_ai = sum(ai_probs) / len(ai_probs)
110
  overall_pct = f"{overall_ai * 100:.1f}%"
111
 
112
  overall_label = (
113
  "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
114
  )
115
 
116
+ # HIGHLIGHTS + TABLE
117
  rows, highlights = [], []
118
 
119
+ for i, orig in enumerate(sents, start=1):
120
+ ai_p = float(ai_probs[i-1])
121
  pct = f"{ai_p * 100:.1f}%"
122
 
123
  label = "AI" if ai_p >= THRESHOLD else "Human"
124
 
 
125
  if ai_p < 0.30:
126
  color = "#11823b"
127
  elif ai_p < 0.70:
 
129
  else:
130
  color = "#b80d0d"
131
 
132
+ normalized = re.sub(r"\s+", " ", orig)
133
 
134
  highlights.append(
135
  "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
 
138
  f"{normalized}</div>"
139
  )
140
 
141
+ rows.append([i, orig, round(ai_p, 4), label])
142
 
143
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
144
  html = "\n".join(highlights)
145
 
146
  return overall_label, overall_pct, html, df
147
 
 
148
  # -----------------------------
149
  # GRADIO UI
150
  # -----------------------------