VictorM-Coder committed on
Commit
c1d0bb0
·
verified ·
1 Parent(s): 38debf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -26
app.py CHANGED
@@ -17,25 +17,19 @@ def get_model():
17
  global tokenizer, model
18
  if model is None:
19
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
20
-
21
- # DeBERTa-v3 requires use_fast=False for stable tokenization.
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
23
-
24
- # We load as Sequence Classification with 1 label (Single Logit).
25
- # ignore_mismatched_sizes=True allows us to load the custom Desklib head.
26
  model = AutoModelForSequenceClassification.from_pretrained(
27
  MODEL_NAME,
28
  num_labels=1,
29
  ignore_mismatched_sizes=True
30
  ).to(device).eval()
31
-
32
  return tokenizer, model
33
 
34
- # Only 81% and above is flagged as AI
35
  THRESHOLD = 0.81
36
 
37
  # -----------------------------
38
- # UTILITIES (Regex & Structure)
39
  # -----------------------------
40
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
41
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
@@ -94,7 +88,6 @@ def analyze(text):
94
  if not pure_sents:
95
  return "—", "—", "<em>No sentences detected.</em>", None
96
 
97
- # Contextual Sliding Window
98
  windows = []
99
  for i in range(len(pure_sents)):
100
  start = max(0, i - 1)
@@ -103,15 +96,15 @@ def analyze(text):
103
 
104
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
105
  output = mod(**inputs)
106
-
107
- # Since num_labels=1, we use Sigmoid on the single logit per window
108
  probs = torch.sigmoid(output.logits).cpu().numpy().flatten().tolist()
109
 
110
  lengths = [len(s.split()) for s in pure_sents]
111
  total_words = sum(lengths)
112
  weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
113
 
114
- # HTML Heatmap
 
 
115
  highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
116
  prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
117
 
@@ -122,6 +115,7 @@ def analyze(text):
122
 
123
  if i in prob_map:
124
  score = prob_map[i]
 
125
  if score >= THRESHOLD:
126
  color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED
127
  else:
@@ -129,37 +123,34 @@ def analyze(text):
129
 
130
  highlighted_html += (
131
  f"<span style='background:{bg}; padding:2px 4px; border-radius:4px; border-bottom: 2px solid {color};' "
132
- f"title='AI Probability: {score:.1%}'>"
133
- f"<b style='color:{color}; font-size: 0.8em;'>[{score:.0%}]</b> {block}</span>"
134
  )
135
  else:
136
  highlighted_html += block
137
  highlighted_html += "</div>"
138
 
139
- if weighted_avg >= THRESHOLD:
140
- label = f"{weighted_avg:.0%} AI Content Detected"
141
- display_score = f"{weighted_avg:.1%}"
142
- else:
143
- label = "0 or * AI Content Detected"
144
- display_score = "*"
145
 
146
- df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.1%}" for p in probs]})
147
  return label, display_score, highlighted_html, df
148
 
149
  # -----------------------------
150
  # GRADIO INTERFACE
151
  # -----------------------------
152
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
153
- gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
154
- gr.Markdown(f"Using **{MODEL_NAME}**. Threshold: **{THRESHOLD*100:.0f}%**.")
155
 
156
  with gr.Row():
157
  with gr.Column(scale=3):
158
- text_input = gr.Textbox(label="Paste Text", lines=12, placeholder="Minimum 300 words...")
159
  run_btn = gr.Button("Analyze", variant="primary")
160
  with gr.Column(scale=1):
161
- verdict_out = gr.Label(label="Verdict")
162
- score_out = gr.Label(label="Weighted AI Score")
163
 
164
  with gr.Tabs():
165
  with gr.TabItem("Visual Heatmap"):
 
17
  global tokenizer, model
18
  if model is None:
19
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 
 
 
21
  model = AutoModelForSequenceClassification.from_pretrained(
22
  MODEL_NAME,
23
  num_labels=1,
24
  ignore_mismatched_sizes=True
25
  ).to(device).eval()
 
26
  return tokenizer, model
27
 
28
+ # Threshold used ONLY for coloring (Red vs Green)
29
  THRESHOLD = 0.81
30
 
31
  # -----------------------------
32
+ # UTILITIES
33
  # -----------------------------
34
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
35
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 
88
  if not pure_sents:
89
  return "—", "—", "<em>No sentences detected.</em>", None
90
 
 
91
  windows = []
92
  for i in range(len(pure_sents)):
93
  start = max(0, i - 1)
 
96
 
97
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
98
  output = mod(**inputs)
 
 
99
  probs = torch.sigmoid(output.logits).cpu().numpy().flatten().tolist()
100
 
101
  lengths = [len(s.split()) for s in pure_sents]
102
  total_words = sum(lengths)
103
  weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
104
 
105
+ # -----------------------------
106
+ # HTML RECONSTRUCTION (Unfiltered Probabilities)
107
+ # -----------------------------
108
  highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
109
  prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
110
 
 
115
 
116
  if i in prob_map:
117
  score = prob_map[i]
118
+ # Color is still determined by the 81% threshold for visual aid
119
  if score >= THRESHOLD:
120
  color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED
121
  else:
 
123
 
124
  highlighted_html += (
125
  f"<span style='background:{bg}; padding:2px 4px; border-radius:4px; border-bottom: 2px solid {color};' "
126
+ f"title='Raw Model Score: {score:.4f}'>"
127
+ f"<b style='color:{color}; font-size: 0.8em;'>[{score:.1%}]</b> {block}</span>"
128
  )
129
  else:
130
  highlighted_html += block
131
  highlighted_html += "</div>"
132
 
133
+ # --- RAW RESULTS (No Masking) ---
134
+ label = f"{weighted_avg:.1%} AI Probability"
135
+ display_score = f"{weighted_avg:.2%}"
 
 
 
136
 
137
+ df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.2%}" for p in probs]})
138
  return label, display_score, highlighted_html, df
139
 
140
  # -----------------------------
141
  # GRADIO INTERFACE
142
  # -----------------------------
143
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
144
+ gr.Markdown("## 🕵️ AI Detector Pro: Raw Mode")
145
+ gr.Markdown(f"Direct model output from **{MODEL_NAME}**. Visual highlight still triggers at **{THRESHOLD*100:.0f}%**.")
146
 
147
  with gr.Row():
148
  with gr.Column(scale=3):
149
+ text_input = gr.Textbox(label="Paste Text", lines=12, placeholder="Min 300 words...")
150
  run_btn = gr.Button("Analyze", variant="primary")
151
  with gr.Column(scale=1):
152
+ verdict_out = gr.Label(label="Model Verdict (Raw)")
153
+ score_out = gr.Label(label="Exact Weighted Probability")
154
 
155
  with gr.Tabs():
156
  with gr.TabItem("Visual Heatmap"):