Update src/detector.py

Replaces the chunk-based pipeline (score_texts, calibrate_threshold, predict_chunks_with_tau, smart_chunk_text) with a single whole-text score_text pass and flattens analyze_text's result schema.
src/detector.py CHANGED (+22 -90)
@@ -91,16 +91,12 @@ def preprocess_text_for_detection(text: str) -> str:
     return text.strip()

 # === Core Scoring ===
-def score_texts(texts, max_len=512):
-    """Return AI probability
+def score_text(text, max_len=512):
+    """Return AI probability score (float between 0-1) for the text."""
     tokenizer, config, model = get_components()

-    # Handle single string input
-    if isinstance(texts, str):
-        texts = [texts]
-
     encoded = tokenizer(
-        texts,
+        text,
         padding=True,
         truncation=True,
         max_length=max_len,
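The new score_text takes a single string where the old score_texts accepted either a string or a list, so list-based callers need a small adapter. A minimal sketch under that assumption (the helper name score_many is hypothetical, not part of this commit):

    def score_many(texts, max_len=512):
        # Score each text independently with the new single-text API.
        # Note: this loses the old function's batched tokenizer call.
        return [score_text(t, max_len=max_len) for t in texts]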
@@ -111,64 +107,12 @@ def score_texts(texts, max_len=512):
     encoded.pop("token_type_ids", None)

     with torch.no_grad():
-        logits = model(**encoded).logits
+        logits = model(**encoded).logits
     probs = torch.softmax(logits, dim=-1).cpu().numpy()

     # Extract AI probability (label=1)
-
-    return
-
-# === Threshold Calibration ===
-def calibrate_threshold(human_texts, calibration_proportion=0.05, max_len=512):
-    """Calibrate threshold using human text samples"""
-    if not human_texts:
-        return 0.5  # Default threshold
-
-    scores = score_texts(human_texts, max_len=max_len)
-    tau = np.percentile(scores, 100 * (1 - calibration_proportion))
-    return float(tau)
-
-# === Predictions ===
-def predict_chunks_with_tau(chunks, tau, max_len=768):
-    """Predict with custom threshold"""
-    probs = score_texts(chunks, max_len=max_len)
-    results = []
-    for text, prob in zip(chunks, probs):
-        label = "AI" if prob >= tau else "Human"
-        confidence = prob if label == "AI" else (1 - prob)
-        results.append({
-            "text": text,
-            "type": label,
-            "score": prob,
-            "confidence": confidence
-        })
-    return results
-
-# === Smart Chunking ===
-def smart_chunk_text(text, max_tokens=80, min_last_chunk=70):
-    """Split text into meaningful chunks for analysis"""
-    if not text or not isinstance(text, str):
-        return []
-
-    text = preprocess_text_for_detection(text)
-    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z("])', text)
-    chunks, buffer = [], []
-
-    for sent in sentences:
-        buffer.append(sent)
-        if len(buffer) == 2 or len(" ".join(buffer)) > max_tokens:
-            chunks.append(" ".join(buffer).strip())
-            buffer = []
-
-    if buffer:
-        chunks.append(" ".join(buffer).strip())
-
-    # Merge very short last chunk with previous one
-    if len(chunks) > 1 and len(chunks[-1].split()) < min_last_chunk/10:  # Adjust threshold
-        chunks[-2] = chunks[-2] + " " + chunks[-1]
-        chunks.pop(-1)
-
-    return chunks
+    ai_prob = float(probs[0][1])
+    return ai_prob

 # === Artifact Detection ===
 def has_html_or_ai_artifacts(text: str) -> bool:
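For reference, the removed calibrate_threshold set the decision threshold tau at the 100 * (1 - calibration_proportion) percentile of scores on known-human text, so the default of 0.05 targets roughly a 5% false-positive rate on the calibration set. A self-contained illustration of that rule (the score values below are invented, not from the repo):

    import numpy as np

    human_scores = np.array([0.05, 0.10, 0.20, 0.30, 0.45,
                             0.55, 0.60, 0.70, 0.80, 0.90])  # made-up scores
    tau = np.percentile(human_scores, 100 * (1 - 0.05))  # 95th percentile
    print(tau)  # 0.855 -> about 5% of human samples score at or above tau

predict_chunks_with_tau then labeled a chunk "AI" when its score was at or above tau; both helpers are dropped now that scoring is a single whole-text pass.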
@@ -188,7 +132,6 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
     Args:
         text (str): Input text to analyze
         threshold (float): Confidence threshold (0-1)
-        chunk_size (int): Maximum tokens per chunk

     Returns:
         dict: Analysis results
@@ -198,47 +141,36 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
         "error": "No text provided",
         "overall_type": "Unknown",
         "overall_confidence": 0.0,
-        "
+        "overall_score": 0.0
     }

     try:
         # Check for AI artifacts
         has_artifacts = has_html_or_ai_artifacts(text)

-        #
-
+        # Preprocess text
+        processed_text = preprocess_text_for_detection(text)

-        if not
+        if not processed_text:
             return {
-                "
-                "
-                "
-                "
-                "message": "Text too short or invalid for analysis"
+                "error": "Text too short or invalid after preprocessing",
+                "overall_type": "Unknown",
+                "overall_confidence": 0.0,
+                "overall_score": 0.0
             }

-        # Score
-
-
-        # Calculate overall score
-        ai_scores = [result["score"] for result in chunk_results]
-        avg_ai_score = np.mean(ai_scores) if ai_scores else 0.0
-        overall_type = "AI" if avg_ai_score >= threshold else "Human"
-        overall_confidence = avg_ai_score if overall_type == "AI" else (1 - avg_ai_score)
+        # Score the text
+        ai_score = score_text(processed_text)

-        #
-
-
+        # Determine overall type and confidence
+        overall_type = "AI" if ai_score >= threshold else "Human"
+        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)

         return {
             "overall_type": overall_type,
             "overall_confidence": float(overall_confidence),
-            "overall_score": float(
-            "has_artifacts": has_artifacts
-            "ai_chunks": ai_chunks,
-            "human_chunks": human_chunks,
-            "total_chunks": len(chunk_results),
-            "chunks": chunk_results
+            "overall_score": float(ai_score),
+            "has_artifacts": has_artifacts
         }

     except Exception as e:
@@ -246,7 +178,7 @@ def analyze_text(text, threshold=0.5, chunk_size=80):
             "error": f"Analysis failed: {str(e)}",
             "overall_type": "Error",
             "overall_confidence": 0.0,
-            "
+            "overall_score": 0.0
         }

 # Pre-load model when module is imported (optional)
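With chunking removed, analyze_text now returns a flat result; note the chunk_size parameter still appears in the signature shown in the hunk headers even though nothing uses it anymore. A sketch of the updated call, based only on the keys visible in this diff (the input string is a placeholder):

    result = analyze_text("Some paragraph to check...", threshold=0.5)
    result["overall_type"]        # "AI" or "Human"
    result["overall_confidence"]  # confidence in that label
    result["overall_score"]       # raw AI probability from score_text
    result["has_artifacts"]       # output of has_html_or_ai_artifacts(text)

Downstream consumers that read the dropped ai_chunks, human_chunks, total_chunks, or chunks keys will need updating.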