Spaces:
Sleeping
Sleeping
File size: 6,313 Bytes
8e274e9 25121a7 8e274e9 25121a7 8e274e9 25121a7 8e274e9 25121a7 8e274e9 25121a7 8e274e9 09abcdc 25121a7 8e274e9 09abcdc 8e274e9 09abcdc 8e274e9 09abcdc 8e274e9 25121a7 8e274e9 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 09abcdc 25121a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import numpy as np
import re
import os
import time
from pathlib import Path
# Configure cache for Hugging Face Spaces
# NOTE(review): these env vars are assigned AFTER `transformers` was imported
# above; some transformers/huggingface_hub versions read HF_HOME and
# TRANSFORMERS_CACHE at import time, so this may have no effect — confirm, or
# move the assignments above the `from transformers import ...` line.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
os.environ['HF_HOME'] = '/tmp/huggingface'
# Create cache directories
# /tmp is the only reliably writable location on a Spaces container.
Path('/tmp/transformers_cache').mkdir(parents=True, exist_ok=True)
Path('/tmp/huggingface').mkdir(parents=True, exist_ok=True)
# Hugging Face Hub repo id of the fine-tuned sequence-classification detector.
MODEL_DIR = "abhi099k/ai-text-detector-v-n4.0"
# Prefer GPU when available; fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize as None, load on first use
# Module-level singletons populated lazily by get_components().
_tokenizer = None
_config = None
_model = None
def get_components():
    """Lazily load and cache the tokenizer, config, and model.

    Components are downloaded from the Hub on the first call (with retries
    around transient cache/OS errors) and stored in module-level globals;
    subsequent calls return the cached objects.

    Returns:
        tuple: (tokenizer, config, model) — the model is on `device` in
        eval mode.

    Raises:
        OSError: if loading still fails after all retry attempts.
    """
    global _tokenizer, _config, _model
    # Guard on _model (assigned last) rather than _tokenizer: a previous
    # partially-failed attempt can leave _tokenizer set while _model is still
    # None, and returning early in that state would hand callers a None model.
    if _model is None:
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print(f"Loading model components... (Attempt {attempt + 1}/{max_retries})")
                _tokenizer = AutoTokenizer.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _config = AutoConfig.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_DIR,
                    config=_config,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                ).to(device)
                _model.eval()
                print("Model loaded successfully!")
                break
            except OSError as e:
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Cache conflict detected, retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    # Best-effort cleanup of stale .lock files left behind by a
                    # concurrent or killed download before the next attempt.
                    cache_path = Path('/tmp/transformers_cache')
                    if cache_path.exists():
                        for lock_file in cache_path.glob("*.lock"):
                            try:
                                lock_file.unlink()
                                print(f"Removed lock file: {lock_file}")
                            except OSError:
                                # Lock may be held or already gone; ignore.
                                pass
                else:
                    print(f"Failed to load model after {max_retries} attempts: {e}")
                    raise
    return _tokenizer, _config, _model
# === Preprocessing: Normalize + Flatten ===
def preprocess_text_for_detection(text: str) -> str:
    """
    Flatten structured notes (bullets, dashes, newlines) into plain
    sentence-like text so the detector sees ordinary prose.
    """
    if not isinstance(text, str) or not text:
        return ""
    # Apply each normalization in order: bullets -> whitespace -> punctuation.
    substitutions = (
        (r"[\n•\-–]+", ". "),           # bullets / dashes / newlines -> sentence breaks
        (r"\s+", " "),                  # collapse whitespace runs to single spaces
        (r"\s*([,.!?;:])\s*", r"\1 "),  # exactly one space after punctuation
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# === Core Scoring ===
def score_text(text, max_len=512):
    """Return AI probability score (float between 0-1) for the text.

    Args:
        text: Input string to score.
        max_len: Maximum number of tokens kept after truncation.
    """
    tokenizer, _unused_config, model = get_components()
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    ).to(device)
    # Drop token_type_ids for architectures that do not accept them.
    batch.pop("token_type_ids", None)
    with torch.no_grad():
        output = model(**batch)
    # Softmax over the two labels; index 1 is treated as the "AI" class
    # (assumption carried over from the original — confirm via config.id2label).
    distribution = torch.softmax(output.logits, dim=-1)
    return float(distribution[0, 1].item())
# === Artifact Detection ===
def has_html_or_ai_artifacts(text: str) -> bool:
    """Detect HTML tags or data-start/data-end attributes — telltales of
    AI output copy-pasted from a rich-text chat interface."""
    if not text:
        return False
    telltale_patterns = (
        r'<[^>]+>',                      # any HTML tag
        r'data-(start|end)=["\']?\d+',   # editor-injected data attributes
    )
    return any(re.search(pattern, text) for pattern in telltale_patterns)
# === Main Prediction Function ===
def analyze_text(text, threshold=0.5, chunk_size=80):
    """
    Main function to analyze text and detect AI-generated content.

    Args:
        text (str): Input text to analyze
        threshold (float): Confidence threshold (0-1); scores at or above
            it are labelled "AI"
        chunk_size (int): Accepted for interface compatibility; currently
            unused — the whole text is scored in one pass
    Returns:
        dict: Analysis results; unsuccessful calls include an "error" key
    """
    def _failure(message, label):
        # Uniform shape for every unsuccessful result.
        return {
            "error": message,
            "overall_type": label,
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

    if not text or not text.strip():
        return _failure("No text provided", "Unknown")
    try:
        # Flag raw HTML / data-* attributes typical of pasted AI output.
        has_artifacts = has_html_or_ai_artifacts(text)
        # Flatten bullets/newlines so the model sees prose-like input.
        processed_text = preprocess_text_for_detection(text)
        if not processed_text:
            return _failure("Text too short or invalid after preprocessing", "Unknown")
        ai_score = score_text(processed_text)
        is_ai = ai_score >= threshold
        # Confidence is the probability of whichever label was chosen.
        return {
            "overall_type": "AI" if is_ai else "Human",
            "overall_confidence": float(ai_score if is_ai else 1 - ai_score),
            "overall_score": float(ai_score),
            "has_artifacts": has_artifacts
        }
    except Exception as e:
        # Broad catch is deliberate: this is the API boundary, and callers
        # expect a structured error dict rather than an exception.
        return _failure(f"Analysis failed: {str(e)}", "Error")
# Pre-load model at import time so the first request does not pay the
# download/initialization cost. Failure is deliberately non-fatal:
# get_components() will simply retry lazily on first use.
# (Also removes a stray trailing "|" artifact that broke the final line.)
try:
    print("Pre-loading model components...")
    get_components()
    print("Model pre-loaded successfully!")
except Exception as e:
    print(f"Pre-loading failed, will load on first use: {e}")