Spaces:

abhi099k
/

Latest-app

Sleeping

App Files Files Community

abhi099k committed on Oct 6, 2025

Commit

8e274e9

verified ·

1 Parent(s): c9a6b37

Create detector.py

Browse files

Files changed (1) hide show

src/detector.py +100 -0

src/detector.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import torch
+from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
+import numpy as np
+import re
# Identifier handed to every `from_pretrained` call below (the fine-tuned detector).
MODEL_DIR = "abhi099k/ai-text-detector-v-n4.0"

# Use CUDA when torch reports it as available; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === Load tokenizer and config ===
config = AutoConfig.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Build the classifier from the loaded config, move it to the chosen device,
# then switch it to evaluation mode since this module only runs inference.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
model = model.to(device)
model.eval()
+# === Preprocessing: Normalize + Flatten ===
def preprocess_text_for_detection(text: str) -> str:
    """Flatten structured notes (bullets, lists, line breaks) into plain sentences.

    Newlines, bullet glyphs and free-standing dashes are treated as sentence
    breaks. Compared with a blanket "replace every dash/newline with a period"
    approach this keeps the text intact where a break is not intended:

    * dashes attached to a word or number ("well-known", "1990–2000", "-5")
      are left alone;
    * a break that follows existing punctuation reuses that punctuation
      instead of adding a second period ("Hello.\\nWorld" -> "Hello. World");
    * breaks at the very start of the text are dropped, so the result never
      begins with a stray ". ";
    * ".", "," and ":" sitting between two digits ("3.14", "1,000", "10:30")
      are not re-spaced.

    Args:
        text: Raw input text.

    Returns:
        The normalized single-line text, stripped of surrounding whitespace.
    """
    # A dash only counts as a separator when it is not glued to a word
    # character on either side (bullet "- item", spaced "a - b").
    dash = r"(?<!\w)[\-–]+(?!\w)"
    filler = r"(?:\s|•|" + dash + r")"

    # Collapse horizontal whitespace first; newlines must survive this step
    # because they are still needed as separators below.
    text = re.sub(r"[^\S\n]+", " ", text)

    # Separators at the very beginning carry no content: drop them.
    text = re.sub(r"\A" + filler + r"+", "", text)

    # Turn each run of separators into one sentence break. If the run directly
    # follows punctuation, keep that punctuation rather than adding a period.
    breaks = re.compile(r"([,.!?;:]?)\s*(?:[\n•]|" + dash + r")" + filler + r"*")
    text = breaks.sub(lambda m: (m.group(1) or ".") + " ", text)

    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)

    # Ensure consistent punctuation spacing. The first alternative matches
    # punctuation between two digits and is returned untouched.
    text = re.sub(
        r"(?<=\d)[.,:](?=\d)|\s*([,.!?;:])\s*",
        lambda m: m.group(0) if m.group(1) is None else m.group(1) + " ",
        text,
    )
    return text.strip()
+# === Core Scoring ===
def score_texts(texts, max_len=512, batch_size=32):
    """Return AI probability scores (float between 0-1) for 2-class models.

    Args:
        texts: A single string or a sequence of strings to score.
        max_len: Maximum token length passed to the tokenizer; longer inputs
            are truncated.
        batch_size: Number of texts sent through the model per forward pass.
            Bounding the batch keeps memory use independent of how many
            texts are passed in.

    Returns:
        A list with one float per input text: the softmax probability of
        class index 1, which this module treats as the "AI" label.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # A bare string is one text, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to score: return early instead of handing an empty batch to
    # the tokenizer / model.
    if not texts:
        return []

    ai_probs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        ).to(device)
        # Some models may not need token_type_ids
        encoded.pop("token_type_ids", None)
        with torch.no_grad():
            logits = model(**encoded).logits  # shape: [batch, 2]
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
        # Extract AI probability (label=1)
        ai_probs.extend(float(p[1]) for p in probs)
    return ai_probs
+# === Threshold Calibration ===
def calibrate_threshold(human_texts, calibration_proportion=0.05, max_len=512):
    """Pick a decision threshold from the scores of known human-written texts.

    The threshold is the ``100 * (1 - calibration_proportion)`` percentile of
    the AI scores of ``human_texts``, so roughly ``calibration_proportion`` of
    the calibration texts score at or above it.

    Args:
        human_texts: A string or a sized collection of strings known to be
            human-written.
        calibration_proportion: Fraction of the calibration texts allowed to
            fall at or above the threshold; must lie in [0, 1].
        max_len: Maximum token length forwarded to ``score_texts``.

    Returns:
        The threshold ``tau`` as returned by ``np.percentile``.

    Raises:
        ValueError: If ``human_texts`` is empty or ``calibration_proportion``
            is outside [0, 1].
    """
    # Validate before scoring so bad arguments fail fast instead of after a
    # full model pass.
    if not 0 <= calibration_proportion <= 1:
        raise ValueError("calibration_proportion must be between 0 and 1")
    if isinstance(human_texts, str):
        human_texts = [human_texts]
    if len(human_texts) == 0:
        # A percentile over no scores is undefined; refuse rather than hand
        # back an unusable threshold.
        raise ValueError("human_texts must contain at least one text")

    scores = score_texts(human_texts, max_len=max_len)
    tau = np.percentile(scores, 100 * (1 - calibration_proportion))
    return tau
+# === Predictions ===
def predict_chunks_with_tau(chunks, tau, max_len=768):
    """Score every chunk and label it against the threshold ``tau``.

    Args:
        chunks: Texts to classify.
        tau: Decision threshold; a score greater than or equal to it is
            labelled "AI", anything lower "Human".
        max_len: Maximum token length forwarded to ``score_texts``.
            NOTE(review): this default (768) differs from the 512 used by
            ``score_texts`` and ``calibrate_threshold`` — confirm it is
            intentional and supported by the model.

    Returns:
        One dict per chunk with the keys "text", "type" and "score".
    """
    scores = score_texts(chunks, max_len=max_len)
    return [
        {
            "text": chunk,
            "type": "AI" if score >= tau else "Human",
            "score": score,
        }
        for chunk, score in zip(chunks, scores)
    ]
+# === Smart Chunking ===
def smart_chunk_text(text, max_tokens=80, min_last_chunk=70):
    """Split text into small sentence-based chunks for scoring.

    The text is normalized with ``preprocess_text_for_detection`` and split
    at sentence-ending punctuation followed by whitespace and an uppercase
    letter, an opening parenthesis or a quote. Sentences are then grouped: a
    chunk is closed once it holds two sentences or its joined text is longer
    than ``max_tokens``. If the final chunk is shorter than
    ``min_last_chunk`` it is merged into the previous chunk.

    Args:
        text: Raw input text.
        max_tokens: Length limit for a chunk. Despite the name it is compared
            against the character length of the joined sentences, not a
            token count.
        min_last_chunk: Minimum character length of the final chunk before
            it is merged into its predecessor.

    Returns:
        A list of chunk strings; empty when the input has no content.
    """
    # Empty / whitespace-only input has nothing to chunk. Without this guard
    # the split below yields [""] and an empty chunk would be returned.
    if not text or not text.strip():
        return []

    text = preprocess_text_for_detection(text)
    sentences = [
        sent
        for sent in re.split(r'(?<=[.!?])\s+(?=[A-Z(“"])', text)
        if sent.strip()  # preprocessing may leave nothing but separators
    ]

    chunks, buffer = [], []
    for sent in sentences:
        buffer.append(sent)
        if len(buffer) == 2 or len(" ".join(buffer)) > max_tokens:
            chunks.append(" ".join(buffer).strip())
            buffer = []
    if buffer:
        chunks.append(" ".join(buffer).strip())

    # Avoid a tiny trailing chunk: fold it into the one before it.
    if len(chunks) > 1 and len(chunks[-1]) < min_last_chunk:
        chunks[-2] = chunks[-2] + " " + chunks[-1]
        chunks.pop(-1)
    return chunks
+# === Artifact Detection ===
# An opening/closing tag or declaration must start with a letter right after
# "<" (optionally preceded by "/" or "!"), so comparisons written with spaces
# such as "a < b and c > d" are not mistaken for markup. HTML comments are
# matched by their "<!--" opener.
_HTML_TAG_RE = re.compile(r"<(?:/?|!)[A-Za-z][^<>]*>|<!--")
# data-start / data-end attributes followed by a number, quoted or not.
_DATA_OFFSET_ATTR_RE = re.compile(r'data-(start|end)=["\']?\d+')


def has_html_or_ai_artifacts(text: str) -> bool:
    """Detect HTML tags or attributes typical of copy-pasted AI output.

    Args:
        text: Text to inspect.

    Returns:
        True if the text contains an HTML tag, declaration or comment
        opener, or a numeric ``data-start`` / ``data-end`` attribute;
        False otherwise.
    """
    return bool(_HTML_TAG_RE.search(text) or _DATA_OFFSET_ATTR_RE.search(text))