File size: 6,313 Bytes
8e274e9
 
 
 
25121a7
 
 
8e274e9
25121a7
 
 
 
 
 
 
 
 
8e274e9
 
25121a7
 
 
 
8e274e9
25121a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e274e9
 
 
 
 
 
25121a7
 
 
8e274e9
 
 
 
 
 
 
 
 
 
 
 
09abcdc
 
25121a7
 
8e274e9
09abcdc
8e274e9
 
 
 
 
 
 
 
 
 
09abcdc
8e274e9
 
 
09abcdc
 
8e274e9
 
 
 
25121a7
 
 
8e274e9
 
 
25121a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09abcdc
25121a7
 
 
 
 
 
09abcdc
 
25121a7
09abcdc
25121a7
09abcdc
 
 
 
25121a7
 
09abcdc
 
25121a7
09abcdc
 
 
25121a7
 
 
 
09abcdc
 
25121a7
 
 
 
 
 
 
09abcdc
25121a7
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import numpy as np
import re
import os
import time
from pathlib import Path

# Configure cache for Hugging Face Spaces
# /tmp is the only reliably writable location in a Space container.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favour of HF_HOME — confirm against the pinned version.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
os.environ['HF_HOME'] = '/tmp/huggingface'

# Create cache directories
# exist_ok=True makes this safe across restarts / concurrent workers.
Path('/tmp/transformers_cache').mkdir(parents=True, exist_ok=True)
Path('/tmp/huggingface').mkdir(parents=True, exist_ok=True)

# Hugging Face Hub repo id of the detector checkpoint.
MODEL_DIR = "abhi099k/ai-text-detector-v-n4.0"
# Prefer GPU when available; all tensors/model are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize as None, load on first use
# (populated lazily by get_components(); treated as a module-level cache)
_tokenizer = None
_config = None
_model = None

def get_components():
    """Lazily load and cache the tokenizer, config, and model.

    Retries on OSError (typically Hub cache-lock conflicts between
    concurrent workers), removing stale ``*.lock`` files between attempts.

    Returns:
        tuple: ``(tokenizer, config, model)`` — model is on ``device``
        and in eval mode.

    Raises:
        OSError: if loading still fails after all retries.
    """
    global _tokenizer, _config, _model

    # Guard on _model — the LAST component assigned — rather than _tokenizer.
    # A previous attempt that failed mid-way can leave _tokenizer set while
    # _model is still None; guarding on _tokenizer would then return a
    # (tokenizer, config, None) triple and crash the caller.
    if _model is None:
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print(f"Loading model components... (Attempt {attempt + 1}/{max_retries})")
                _tokenizer = AutoTokenizer.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _config = AutoConfig.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_DIR,
                    config=_config,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                ).to(device)
                _model.eval()
                print("Model loaded successfully!")
                break
            except OSError as e:
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Cache conflict detected, retrying in {wait_time} seconds...")

                    # Clear stale lock files BEFORE sleeping, so the wait
                    # actually gives the (now unlocked) cache time to settle.
                    cache_path = Path('/tmp/transformers_cache')
                    if cache_path.exists():
                        for lock_file in cache_path.glob("*.lock"):
                            try:
                                lock_file.unlink()
                                print(f"Removed lock file: {lock_file}")
                            except OSError:
                                # Lock is held or already removed by another
                                # process — best effort only.
                                pass

                    time.sleep(wait_time)
                else:
                    print(f"Failed to load model after {max_retries} attempts: {e}")
                    raise

    return _tokenizer, _config, _model

# === Preprocessing: Normalize + Flatten ===
def preprocess_text_for_detection(text: str) -> str:
    """Flatten structured notes (bullets, dashes, newlines) into plain
    sentence-like text suitable for the detector.

    Returns "" for empty or non-string input.
    """
    if not isinstance(text, str) or not text:
        return ""

    # Runs of newlines, bullets, and dashes become sentence breaks.
    flattened = re.sub(r"[\n•\-–]+", ". ", text)

    # Collapse every whitespace run to a single space.
    flattened = re.sub(r"\s+", " ", flattened)

    # Normalize punctuation spacing: none before, exactly one after.
    flattened = re.sub(r"\s*([,.!?;:])\s*", r"\1 ", flattened)

    return flattened.strip()

# === Core Scoring ===
def score_text(text, max_len=512):
    """Return the probability (float in 0-1) that *text* is AI-generated."""
    tokenizer, _, model = get_components()

    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    ).to(device)

    # Not every architecture accepts token_type_ids; drop them if present.
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        output = model(**inputs)
        probabilities = torch.softmax(output.logits, dim=-1).cpu().numpy()

    # Index 1 is taken to be the "AI" label — presumably matches
    # config.id2label; verify against the checkpoint.
    return float(probabilities[0][1])

# === Artifact Detection ===
def has_html_or_ai_artifacts(text: str) -> bool:
    """Return True when *text* carries HTML tags or data-start/end
    attributes typical of AI output pasted from a rich-text interface."""
    if not text:
        return False

    looks_like_html = re.search(r'<[^>]+>', text) is not None
    has_data_attrs = re.search(r'data-(start|end)=["\']?\d+', text) is not None
    return looks_like_html or has_data_attrs

# === Main Prediction Function ===
def analyze_text(text, threshold=0.5, chunk_size=80):
    """
    Main function to analyze text and detect AI-generated content.

    Args:
        text (str): Input text to analyze.
        threshold (float): AI-probability cutoff in [0, 1]; scores at or
            above it are labelled "AI".
        chunk_size (int): Accepted for backward compatibility; currently
            unused — the whole text is scored in a single pass.

    Returns:
        dict: "overall_type", "overall_confidence", "overall_score",
            plus "has_artifacts" on success or "error" on failure.
    """
    # This guard runs BEFORE the try block, so a truthy non-string input
    # (e.g. an int) must be rejected here — otherwise text.strip() would
    # raise an uncaught AttributeError and crash the caller.
    if not text or not isinstance(text, str) or not text.strip():
        return {
            "error": "No text provided",
            "overall_type": "Unknown",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

    try:
        # Detect copy-paste artifacts on the raw text, before flattening
        # removes the markup they rely on.
        has_artifacts = has_html_or_ai_artifacts(text)

        # Flatten bullets/newlines into sentence-like text.
        processed_text = preprocess_text_for_detection(text)

        if not processed_text:
            return {
                "error": "Text too short or invalid after preprocessing",
                "overall_type": "Unknown",
                "overall_confidence": 0.0,
                "overall_score": 0.0
            }

        # Probability (0-1) that the processed text is AI-generated.
        ai_score = score_text(processed_text)

        # Confidence is always reported in favour of the chosen label.
        overall_type = "AI" if ai_score >= threshold else "Human"
        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)

        return {
            "overall_type": overall_type,
            "overall_confidence": float(overall_confidence),
            "overall_score": float(ai_score),
            "has_artifacts": has_artifacts
        }

    except Exception as e:
        # Boundary handler: surface failures (model load, inference) as a
        # structured result instead of propagating into the UI layer.
        return {
            "error": f"Analysis failed: {str(e)}",
            "overall_type": "Error",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

# Pre-load model when module is imported (optional)
# Warms the model cache at import time so the first request is fast.
# The broad except is deliberate: a failed pre-load must not abort the
# import — get_components() will simply retry on first use.
try:
    print("Pre-loading model components...")
    get_components()
    print("Model pre-loaded successfully!")
except Exception as e:
    print(f"Pre-loading failed, will load on first use: {e}")