VictorM-Coder committed on
Commit
38debf0
·
verified ·
1 Parent(s): dfecc14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -34
app.py CHANGED
@@ -1,35 +1,10 @@
1
  import torch
2
- import torch.nn as nn
3
  import torch.nn.functional as F
4
- from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel
5
  import re
6
  import pandas as pd
7
  import gradio as gr
8
 
9
- # -----------------------------
10
- # CUSTOM MODEL DEFINITION
11
- # -----------------------------
12
- # The Desklib model uses a custom architecture: Mean Pooling + Linear Classifier.
13
- class DesklibAIDetectionModel(PreTrainedModel):
14
- config_class = AutoConfig
15
- def __init__(self, config):
16
- super().__init__(config)
17
- self.model = AutoModel.from_config(config)
18
- self.classifier = nn.Linear(config.hidden_size, 1)
19
- self.init_weights()
20
-
21
- def forward(self, input_ids, attention_mask=None):
22
- outputs = self.model(input_ids, attention_mask=attention_mask)
23
- last_hidden_state = outputs[0]
24
- # Mean Pooling logic
25
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
26
- sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
27
- sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
28
- mean_pooled = sum_embeddings / sum_mask
29
-
30
- logits = self.classifier(mean_pooled)
31
- return logits
32
-
33
  # -----------------------------
34
  # MODEL INITIALIZATION
35
  # -----------------------------
@@ -42,19 +17,25 @@ def get_model():
42
  global tokenizer, model
43
  if model is None:
44
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
 
 
45
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
46
 
47
- # Load the weights into our custom class
48
- model = DesklibAIDetectionModel.from_pretrained(
 
49
  MODEL_NAME,
50
- torch_dtype=torch.float32 # Use float16/bfloat16 if your GPU supports it
 
51
  ).to(device).eval()
 
52
  return tokenizer, model
53
 
 
54
  THRESHOLD = 0.81
55
 
56
  # -----------------------------
57
- # UTILITIES (Sentence Splitting & Structure)
58
  # -----------------------------
59
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
60
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
@@ -113,6 +94,7 @@ def analyze(text):
113
  if not pure_sents:
114
  return "—", "—", "<em>No sentences detected.</em>", None
115
 
 
116
  windows = []
117
  for i in range(len(pure_sents)):
118
  start = max(0, i - 1)
@@ -120,10 +102,10 @@ def analyze(text):
120
  windows.append(" ".join(pure_sents[start:end]))
121
 
122
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
123
- logits = mod(inputs['input_ids'], inputs['attention_mask'])
124
 
125
- # Sigmoid for single-logit probability
126
- probs = torch.sigmoid(logits).cpu().numpy().flatten().tolist()
127
 
128
  lengths = [len(s.split()) for s in pure_sents]
129
  total_words = sum(lengths)
@@ -169,7 +151,7 @@ def analyze(text):
169
  # -----------------------------
170
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
171
  gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
172
- gr.Markdown(f"Using **{MODEL_NAME}** (DeBERTa-v3-Large). Threshold: **{THRESHOLD*100:.0f}%**.")
173
 
174
  with gr.Row():
175
  with gr.Column(scale=3):
 
1
  import torch
 
2
  import torch.nn.functional as F
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import re
5
  import pandas as pd
6
  import gradio as gr
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # -----------------------------
9
  # MODEL INITIALIZATION
10
  # -----------------------------
 
17
  global tokenizer, model
18
  if model is None:
19
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
20
+
21
+ # DeBERTa-v3 requires use_fast=False for stable tokenization.
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
23
 
24
+ # We load as Sequence Classification with 1 label (Single Logit).
25
+ # ignore_mismatched_sizes=True allows us to load the custom Desklib head.
26
+ model = AutoModelForSequenceClassification.from_pretrained(
27
  MODEL_NAME,
28
+ num_labels=1,
29
+ ignore_mismatched_sizes=True
30
  ).to(device).eval()
31
+
32
  return tokenizer, model
33
 
34
+ # Only 81% and above is flagged as AI
35
  THRESHOLD = 0.81
36
 
37
  # -----------------------------
38
+ # UTILITIES (Regex & Structure)
39
  # -----------------------------
40
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
41
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 
94
  if not pure_sents:
95
  return "—", "—", "<em>No sentences detected.</em>", None
96
 
97
+ # Contextual Sliding Window
98
  windows = []
99
  for i in range(len(pure_sents)):
100
  start = max(0, i - 1)
 
102
  windows.append(" ".join(pure_sents[start:end]))
103
 
104
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
105
+ output = mod(**inputs)
106
 
107
+ # Since num_labels=1, we use Sigmoid on the single logit per window
108
+ probs = torch.sigmoid(output.logits).cpu().numpy().flatten().tolist()
109
 
110
  lengths = [len(s.split()) for s in pure_sents]
111
  total_words = sum(lengths)
 
151
  # -----------------------------
152
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
153
  gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
154
+ gr.Markdown(f"Using **{MODEL_NAME}**. Threshold: **{THRESHOLD*100:.0f}%**.")
155
 
156
  with gr.Row():
157
  with gr.Column(scale=3):