VictorM-Coder committed on
Commit
7f4b27e
·
verified ·
1 Parent(s): 23b2adf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -46
app.py CHANGED
@@ -10,24 +10,31 @@ import gradio as gr
10
  # -----------------------------
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
- dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
 
 
 
16
 
17
  # -----------------------------
18
  # AI DECISION THRESHOLD (80%)
19
  # -----------------------------
20
- THRESHOLD = 0.80 # AI from 80% and above
21
 
22
  # -----------------------------
23
  # SENTENCE SPLITTING UTILITIES
24
  # -----------------------------
25
  ABBR = [
26
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
27
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
28
  "u.s", "u.k", "a.m", "p.m"
29
  ]
30
- ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
 
 
 
31
 
32
  def _protect(text: str) -> str:
33
  t = text.strip()
@@ -40,16 +47,19 @@ def _protect(text: str) -> str:
40
  return t
41
 
42
  def _restore(text: str) -> str:
43
- return (text
44
- .replace("⟨ABBRDOT⟩", ".")
45
  .replace("⟨DECIMAL⟩", ".")
46
- .replace("⟨ELLIPSIS⟩", "..."))
 
47
 
48
  def sentence_split(text: str):
49
  t = _protect(text)
50
  if not t:
51
  return []
52
- parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
 
 
53
 
54
  sentences, buf = [], ""
55
  for i, chunk in enumerate(parts):
@@ -65,74 +75,57 @@ def sentence_split(text: str):
65
 
66
  return [_restore(s).strip() for s in sentences if s.strip()]
67
 
68
- # -----------------------------
69
- # GROUP SENTENCES (TURNITIN STYLE)
70
- # -----------------------------
71
- def group_sentences(sents, size=3):
72
- grouped = []
73
- for i in range(0, len(sents), size):
74
- grouped.append(" ".join(sents[i:i+size]))
75
- return grouped
76
 
77
  # -----------------------------
78
- # CORE ANALYSIS
79
  # -----------------------------
80
  def analyze(text, max_len=512):
81
  sents = sentence_split(text)
82
  if not sents:
83
  return "—", "—", "<em>Paste some text to analyze.</em>", None
84
 
85
- # GROUP sentences (3 at a time)
86
- grouped = group_sentences(sents, size=3)
87
- clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
88
 
89
- # tokenize grouped
90
  inputs = tokenizer(
91
- clean_grouped, return_tensors="pt",
92
- padding=True, truncation=True, max_length=max_len
 
 
 
93
  ).to(device)
94
 
95
- # model inference
96
  with torch.no_grad():
97
  logits = model(**inputs).logits
98
- chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
99
-
100
- # EXPAND chunk-level probabilities to per-sentence
101
- ai_probs = []
102
- for idx, prob in enumerate(chunk_probs):
103
- start = idx * 3
104
- end = min(start + 3, len(sents))
105
- for _ in range(start, end):
106
- ai_probs.append(prob)
107
 
108
  # overall AI score
109
- overall_ai = sum(ai_probs) / len(ai_probs)
110
  overall_pct = f"{overall_ai * 100:.1f}%"
111
 
112
- # UPDATED THRESHOLD (80%)
113
  overall_label = (
114
  "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
115
  )
116
 
117
- # HIGHLIGHTS + TABLE
118
  rows, highlights = [], []
119
 
120
- for i, orig in enumerate(sents, start=1):
121
- ai_p = float(ai_probs[i-1])
122
  pct = f"{ai_p * 100:.1f}%"
123
 
124
- # UPDATED → label decided by 80%
125
  label = "AI" if ai_p >= THRESHOLD else "Human"
126
 
127
- # color logic (unchanged)
128
  if ai_p < 0.30:
129
- color = "#11823b" # green
130
  elif ai_p < 0.70:
131
- color = "#b8860b" # amber
132
  else:
133
- color = "#b80d0d" # red
134
 
135
- normalized = re.sub(r"\s+", " ", orig)
136
 
137
  highlights.append(
138
  "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
@@ -141,13 +134,14 @@ def analyze(text, max_len=512):
141
  f"{normalized}</div>"
142
  )
143
 
144
- rows.append([i, orig, round(ai_p, 4), label])
145
 
146
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
147
  html = "\n".join(highlights)
148
 
149
  return overall_label, overall_pct, html, df
150
 
 
151
  # -----------------------------
152
  # GRADIO UI
153
  # -----------------------------
 
10
  # -----------------------------
11
  MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
+
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+ dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
16
+
17
+ model = AutoModelForSequenceClassification.from_pretrained(
18
+ MODEL_NAME, dtype=dtype
19
+ ).to(device).eval()
20
 
21
  # -----------------------------
22
  # AI DECISION THRESHOLD (80%)
23
  # -----------------------------
24
+ THRESHOLD = 0.80
25
 
26
  # -----------------------------
27
  # SENTENCE SPLITTING UTILITIES
28
  # -----------------------------
29
  ABBR = [
30
  "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
31
+ "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
32
  "u.s", "u.k", "a.m", "p.m"
33
  ]
34
+ ABBR_REGEX = re.compile(
35
+ r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
36
+ flags=re.IGNORECASE
37
+ )
38
 
39
  def _protect(text: str) -> str:
40
  t = text.strip()
 
47
  return t
48
 
49
  def _restore(text: str) -> str:
50
+ return (
51
+ text.replace("⟨ABBRDOT⟩", ".")
52
  .replace("⟨DECIMAL⟩", ".")
53
+ .replace("⟨ELLIPSIS⟩", "...")
54
+ )
55
 
56
  def sentence_split(text: str):
57
  t = _protect(text)
58
  if not t:
59
  return []
60
+
61
+ # hard sentence boundary detection
62
+ parts = re.split(r"([.?!])\s+(?=[\"“”‘’']?\s*[A-Z(]|$)", t)
63
 
64
  sentences, buf = [], ""
65
  for i, chunk in enumerate(parts):
 
75
 
76
  return [_restore(s).strip() for s in sentences if s.strip()]
77
 
 
 
 
 
 
 
 
 
78
 
79
  # -----------------------------
80
+ # CORE ANALYSIS — PER SENTENCE
81
  # -----------------------------
82
  def analyze(text, max_len=512):
83
  sents = sentence_split(text)
84
  if not sents:
85
  return "—", "—", "<em>Paste some text to analyze.</em>", None
86
 
87
+ clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]
 
 
88
 
89
+ # tokenize list of sentences
90
  inputs = tokenizer(
91
+ clean_sents,
92
+ return_tensors="pt",
93
+ padding=True,
94
+ truncation=True,
95
+ max_length=max_len
96
  ).to(device)
97
 
98
+ # model inference (per sentence)
99
  with torch.no_grad():
100
  logits = model(**inputs).logits
101
+ probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
 
 
 
 
 
 
 
 
102
 
103
  # overall AI score
104
+ overall_ai = sum(probs) / len(probs)
105
  overall_pct = f"{overall_ai * 100:.1f}%"
106
 
 
107
  overall_label = (
108
  "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
109
  )
110
 
111
+ # highlights + table
112
  rows, highlights = [], []
113
 
114
+ for i, sentence in enumerate(sents, start=1):
115
+ ai_p = float(probs[i - 1])
116
  pct = f"{ai_p * 100:.1f}%"
117
 
 
118
  label = "AI" if ai_p >= THRESHOLD else "Human"
119
 
120
+ # colors
121
  if ai_p < 0.30:
122
+ color = "#11823b"
123
  elif ai_p < 0.70:
124
+ color = "#b8860b"
125
  else:
126
+ color = "#b80d0d"
127
 
128
+ normalized = re.sub(r"\s+", " ", sentence)
129
 
130
  highlights.append(
131
  "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
 
134
  f"{normalized}</div>"
135
  )
136
 
137
+ rows.append([i, sentence, round(ai_p, 4), label])
138
 
139
  df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
140
  html = "\n".join(highlights)
141
 
142
  return overall_label, overall_pct, html, df
143
 
144
+
145
  # -----------------------------
146
  # GRADIO UI
147
  # -----------------------------