abhi099k commited on
Commit
09abcdc
·
verified ·
1 Parent(s): b55e10c

Update src/detector.py

Browse files
Files changed (1) hide show
  1. src/detector.py +22 -90
src/detector.py CHANGED
@@ -91,16 +91,12 @@ def preprocess_text_for_detection(text: str) -> str:
91
  return text.strip()
92
 
93
  # === Core Scoring ===
94
- def score_texts(texts, max_len=512):
95
- """Return AI probability scores (float between 0-1) for 2-class models."""
96
  tokenizer, config, model = get_components()
97
 
98
- # Handle single string input
99
- if isinstance(texts, str):
100
- texts = [texts]
101
-
102
  encoded = tokenizer(
103
- texts,
104
  padding=True,
105
  truncation=True,
106
  max_length=max_len,
@@ -111,64 +107,12 @@ def score_texts(texts, max_len=512):
111
  encoded.pop("token_type_ids", None)
112
 
113
  with torch.no_grad():
114
- logits = model(**encoded).logits # shape: [batch, 2]
115
  probs = torch.softmax(logits, dim=-1).cpu().numpy()
116
 
117
  # Extract AI probability (label=1)
118
- ai_probs = [float(p[1]) for p in probs]
119
- return ai_probs
120
-
121
# === Threshold Calibration ===
def calibrate_threshold(human_texts, calibration_proportion=0.05, max_len=512):
    """Derive a decision threshold from known-human text samples.

    Scores the supplied human-written texts and returns the score at the
    (1 - calibration_proportion) percentile, so that roughly
    ``calibration_proportion`` of genuine human samples would be flagged
    as AI. Falls back to 0.5 when no samples are given.
    """
    if not human_texts:
        return 0.5  # Default threshold when there is nothing to calibrate on
    human_scores = score_texts(human_texts, max_len=max_len)
    cutoff_percentile = 100 * (1 - calibration_proportion)
    return float(np.percentile(human_scores, cutoff_percentile))
130
-
131
# === Predictions ===
def predict_chunks_with_tau(chunks, tau, max_len=768):
    """Classify each chunk as "AI" or "Human" using threshold ``tau``.

    Returns one dict per chunk carrying the chunk text, the assigned
    label, the raw AI probability, and a confidence expressed relative
    to the assigned label.
    """
    def _verdict(chunk_text, ai_prob):
        is_ai = ai_prob >= tau
        return {
            "text": chunk_text,
            "type": "AI" if is_ai else "Human",
            "score": ai_prob,
            "confidence": ai_prob if is_ai else (1 - ai_prob)
        }

    scores = score_texts(chunks, max_len=max_len)
    return [_verdict(chunk, score) for chunk, score in zip(chunks, scores)]
146
-
147
# === Smart Chunking ===
def smart_chunk_text(text, max_tokens=80, min_last_chunk=70):
    """Split text into meaningful chunks for analysis.

    Sentences are grouped two at a time — or flushed early once the
    joined text grows past ``max_tokens`` — and a very short trailing
    chunk is merged back into its predecessor. Returns [] for empty or
    non-string input.

    NOTE(review): despite the name, ``max_tokens`` is compared against
    the joined *character* length, not a token count — confirm intent.
    """
    if not text or not isinstance(text, str):
        return []

    text = preprocess_text_for_detection(text)
    # Heuristic sentence boundary: split after ., ! or ? when followed by
    # whitespace and a capital letter, "(" or a double quote.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z("])', text)
    chunks, buffer = [], []

    for sent in sentences:
        buffer.append(sent)
        # Flush every two sentences, or sooner if the joined buffer
        # already exceeds max_tokens characters (see NOTE above).
        if len(buffer) == 2 or len(" ".join(buffer)) > max_tokens:
            chunks.append(" ".join(buffer).strip())
            buffer = []

    if buffer:
        chunks.append(" ".join(buffer).strip())

    # Merge a very short last chunk (fewer than min_last_chunk/10 words,
    # i.e. 7 by default) into the previous one so tiny fragments are not
    # scored in isolation.
    if len(chunks) > 1 and len(chunks[-1].split()) < min_last_chunk/10:
        chunks[-2] = chunks[-2] + " " + chunks[-1]
        chunks.pop(-1)

    return chunks
172
 
173
  # === Artifact Detection ===
174
  def has_html_or_ai_artifacts(text: str) -> bool:
@@ -188,7 +132,6 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
188
  Args:
189
  text (str): Input text to analyze
190
  threshold (float): Confidence threshold (0-1)
191
- chunk_size (int): Maximum tokens per chunk
192
 
193
  Returns:
194
  dict: Analysis results
@@ -198,47 +141,36 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
198
  "error": "No text provided",
199
  "overall_type": "Unknown",
200
  "overall_confidence": 0.0,
201
- "chunks": []
202
  }
203
 
204
  try:
205
  # Check for AI artifacts
206
  has_artifacts = has_html_or_ai_artifacts(text)
207
 
208
- # Chunk the text
209
- chunks = smart_chunk_text(text, max_tokens=chunk_size)
210
 
211
- if not chunks:
212
  return {
213
- "overall_type": "Human",
214
- "overall_confidence": 1.0,
215
- "has_artifacts": has_artifacts,
216
- "chunks": [],
217
- "message": "Text too short or invalid for analysis"
218
  }
219
 
220
- # Score chunks
221
- chunk_results = predict_chunks_with_tau(chunks, threshold)
222
-
223
- # Calculate overall score
224
- ai_scores = [result["score"] for result in chunk_results]
225
- avg_ai_score = np.mean(ai_scores) if ai_scores else 0.0
226
- overall_type = "AI" if avg_ai_score >= threshold else "Human"
227
- overall_confidence = avg_ai_score if overall_type == "AI" else (1 - avg_ai_score)
228
 
229
- # Count AI vs Human chunks
230
- ai_chunks = sum(1 for result in chunk_results if result["type"] == "AI")
231
- human_chunks = len(chunk_results) - ai_chunks
232
 
233
  return {
234
  "overall_type": overall_type,
235
  "overall_confidence": float(overall_confidence),
236
- "overall_score": float(avg_ai_score),
237
- "has_artifacts": has_artifacts,
238
- "ai_chunks": ai_chunks,
239
- "human_chunks": human_chunks,
240
- "total_chunks": len(chunk_results),
241
- "chunks": chunk_results
242
  }
243
 
244
  except Exception as e:
@@ -246,7 +178,7 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
246
  "error": f"Analysis failed: {str(e)}",
247
  "overall_type": "Error",
248
  "overall_confidence": 0.0,
249
- "chunks": []
250
  }
251
 
252
  # Pre-load model when module is imported (optional)
 
91
  return text.strip()
92
 
93
# === Core Scoring ===
def score_text(text: str, max_len: int = 512) -> float:
    """Return the AI probability score (float in [0, 1]) for ``text``.

    The text is tokenized (truncated to ``max_len`` tokens), run through
    the 2-class detector model, and the softmax probability of the AI
    class (label index 1) is returned.

    Args:
        text: Input text to score.
        max_len: Maximum token length passed to the tokenizer.

    Returns:
        float: Probability that the text is AI-generated.
    """
    tokenizer, config, model = get_components()

    # return_tensors="pt" is required so model(**encoded) receives
    # torch tensors rather than Python lists.
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    # Some architectures (e.g. RoBERTa-style) do not accept
    # token_type_ids; drop them if present.
    encoded.pop("token_type_ids", None)

    with torch.no_grad():
        logits = model(**encoded).logits  # shape: [1, 2]
        probs = torch.softmax(logits, dim=-1).cpu().numpy()

    # Extract AI probability (label=1) for the single input.
    return float(probs[0][1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  # === Artifact Detection ===
118
  def has_html_or_ai_artifacts(text: str) -> bool:
 
def analyze_text(text, threshold=0.5, chunk_size=80):
    """Analyze text and classify it as AI- or human-written.

    Args:
        text (str): Input text to analyze
        threshold (float): Confidence threshold (0-1); AI scores at or
            above it are labelled "AI"
        chunk_size (int): Deprecated and ignored — kept so existing
            callers that still pass it keep working

    Returns:
        dict: Analysis results. Always contains "overall_type",
        "overall_confidence" and "overall_score"; adds "has_artifacts"
        on success or "error" on failure.
    """
    if not text or not isinstance(text, str):
        return {
            "error": "No text provided",
            "overall_type": "Unknown",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

    try:
        # Check for AI artifacts
        has_artifacts = has_html_or_ai_artifacts(text)

        # Preprocess text
        processed_text = preprocess_text_for_detection(text)

        if not processed_text:
            return {
                "error": "Text too short or invalid after preprocessing",
                "overall_type": "Unknown",
                "overall_confidence": 0.0,
                "overall_score": 0.0
            }

        # Score the text as a whole (chunking was removed in this revision)
        ai_score = score_text(processed_text)

        # Determine overall type and confidence relative to the label
        overall_type = "AI" if ai_score >= threshold else "Human"
        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)

        return {
            "overall_type": overall_type,
            "overall_confidence": float(overall_confidence),
            "overall_score": float(ai_score),
            "has_artifacts": has_artifacts
        }

    except Exception as e:
        # Surface failures as a structured result instead of raising,
        # so UI callers always receive a dict.
        return {
            "error": f"Analysis failed: {str(e)}",
            "overall_type": "Error",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }
183
 
184
  # Pre-load model when module is imported (optional)