Update src/detector.py

Replaces the chunk-based pipeline (score_texts, calibrate_threshold, predict_chunks_with_tau, smart_chunk_text) with a single whole-text score_text pass and flattens analyze_text's result schema.
src/detector.py CHANGED (+22 -90)
@@ -91,16 +91,12 @@ def preprocess_text_for_detection(text: str) -> str:
     return text.strip()

 # === Core Scoring ===
-def score_texts(texts, max_len=512):
-    """Return AI probability
+def score_text(text, max_len=512):
+    """Return AI probability score (float between 0-1) for the text."""
     tokenizer, config, model = get_components()

-    # Handle single string input
-    if isinstance(texts, str):
-        texts = [texts]
-
     encoded = tokenizer(
-        texts,
+        text,
         padding=True,
         truncation=True,
         max_length=max_len,
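The new score_text takes a single string where the old score_texts accepted either a string or a list, so list-based callers need a small adapter. A minimal sketch under that assumption (the helper name score_many is hypothetical, not part of this commit):

    def score_many(texts, max_len=512):
        # Score each text independently with the new single-text API.
        # Note: this loses the old function's batched tokenizer call.
        return [score_text(t, max_len=max_len) for t in texts]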
@@ -111,64 +107,12 @@ def score_texts(texts, max_len=512):
     encoded.pop("token_type_ids", None)

     with torch.no_grad():
-        logits = model(**encoded).logits
+        logits = model(**encoded).logits
     probs = torch.softmax(logits, dim=-1).cpu().numpy()

     # Extract AI probability (label=1)
-
-    return
-
-# === Threshold Calibration ===
-def calibrate_threshold(human_texts, calibration_proportion=0.05, max_len=512):
-    """Calibrate threshold using human text samples"""
-    if not human_texts:
-        return 0.5  # Default threshold
-
-    scores = score_texts(human_texts, max_len=max_len)
-    tau = np.percentile(scores, 100 * (1 - calibration_proportion))
-    return float(tau)
-
-# === Predictions ===
-def predict_chunks_with_tau(chunks, tau, max_len=768):
-    """Predict with custom threshold"""
-    probs = score_texts(chunks, max_len=max_len)
-    results = []
-    for text, prob in zip(chunks, probs):
-        label = "AI" if prob >= tau else "Human"
-        confidence = prob if label == "AI" else (1 - prob)
-        results.append({
-            "text": text,
-            "type": label,
-            "score": prob,
-            "confidence": confidence
-        })
-    return results
-
-# === Smart Chunking ===
-def smart_chunk_text(text, max_tokens=80, min_last_chunk=70):
-    """Split text into meaningful chunks for analysis"""
-    if not text or not isinstance(text, str):
-        return []
-
-    text = preprocess_text_for_detection(text)
-    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z("])', text)
-    chunks, buffer = [], []
-
-    for sent in sentences:
-        buffer.append(sent)
-        if len(buffer) == 2 or len(" ".join(buffer)) > max_tokens:
-            chunks.append(" ".join(buffer).strip())
-            buffer = []
-
-    if buffer:
-        chunks.append(" ".join(buffer).strip())
-
-    # Merge very short last chunk with previous one
-    if len(chunks) > 1 and len(chunks[-1].split()) < min_last_chunk/10:  # Adjust threshold
-        chunks[-2] = chunks[-2] + " " + chunks[-1]
-        chunks.pop(-1)
-
-    return chunks
+    ai_prob = float(probs[0][1])
+    return ai_prob

 # === Artifact Detection ===
 def has_html_or_ai_artifacts(text: str) -> bool:
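For reference, the removed calibrate_threshold set the decision threshold tau at the 100 * (1 - calibration_proportion) percentile of scores on known-human text, so the default of 0.05 targets roughly a 5% false-positive rate on the calibration set. A self-contained illustration of that rule (the score values below are invented, not from the repo):

    import numpy as np

    human_scores = np.array([0.05, 0.10, 0.20, 0.30, 0.45,
                             0.55, 0.60, 0.70, 0.80, 0.90])  # made-up scores
    tau = np.percentile(human_scores, 100 * (1 - 0.05))  # 95th percentile
    print(tau)  # 0.855 -> about 5% of human samples score at or above tau

predict_chunks_with_tau then labeled a chunk "AI" when its score was at or above tau; both helpers are dropped now that scoring is a single whole-text pass.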
@@ -188,7 +132,6 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
     Args:
         text (str): Input text to analyze
         threshold (float): Confidence threshold (0-1)
-        chunk_size (int): Maximum tokens per chunk

     Returns:
         dict: Analysis results
@@ -198,47 +141,36 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
         "error": "No text provided",
         "overall_type": "Unknown",
         "overall_confidence": 0.0,
-        "
+        "overall_score": 0.0
     }

     try:
         # Check for AI artifacts
         has_artifacts = has_html_or_ai_artifacts(text)

-        #
-
+        # Preprocess text
+        processed_text = preprocess_text_for_detection(text)

-        if not
+        if not processed_text:
             return {
-                "
-                "
-                "
-                "
-                "message": "Text too short or invalid for analysis"
+                "error": "Text too short or invalid after preprocessing",
+                "overall_type": "Unknown",
+                "overall_confidence": 0.0,
+                "overall_score": 0.0
             }

-        # Score
-
-
-        # Calculate overall score
-        ai_scores = [result["score"] for result in chunk_results]
-        avg_ai_score = np.mean(ai_scores) if ai_scores else 0.0
-        overall_type = "AI" if avg_ai_score >= threshold else "Human"
-        overall_confidence = avg_ai_score if overall_type == "AI" else (1 - avg_ai_score)
+        # Score the text
+        ai_score = score_text(processed_text)

-        #
-
-
+        # Determine overall type and confidence
+        overall_type = "AI" if ai_score >= threshold else "Human"
+        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)

         return {
             "overall_type": overall_type,
             "overall_confidence": float(overall_confidence),
-            "overall_score": float(
-            "has_artifacts": has_artifacts
-            "ai_chunks": ai_chunks,
-            "human_chunks": human_chunks,
-            "total_chunks": len(chunk_results),
-            "chunks": chunk_results
+            "overall_score": float(ai_score),
+            "has_artifacts": has_artifacts
         }

     except Exception as e:
@@ -246,7 +178,7 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
             "error": f"Analysis failed: {str(e)}",
             "overall_type": "Error",
             "overall_confidence": 0.0,
-            "
+            "overall_score": 0.0
         }

 # Pre-load model when module is imported (optional)
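With chunking removed, analyze_text now returns a flat result; note the chunk_size parameter still appears in the signature shown in the hunk headers even though nothing uses it anymore. A sketch of the updated call, based only on the keys visible in this diff (the input string is a placeholder):

    result = analyze_text("Some paragraph to check...", threshold=0.5)
    result["overall_type"]        # "AI" or "Human"
    result["overall_confidence"]  # confidence in that label
    result["overall_score"]       # raw AI probability from score_text
    result["has_artifacts"]       # output of has_html_or_ai_artifacts(text)

Downstream consumers that read the dropped ai_chunks, human_chunks, total_chunks, or chunks keys will need updating.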