Spaces:
Sleeping
Sleeping
import os
import re
import time
from pathlib import Path

# Configure the Hugging Face cache location BEFORE importing torch/transformers:
# several transformers cache-path constants are resolved at import time, so
# setting these variables after the import may silently have no effect.
# (NOTE: TRANSFORMERS_CACHE is deprecated in newer transformers releases in
# favour of HF_HOME, which is also set below.)
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
os.environ['HF_HOME'] = '/tmp/huggingface'

# Create cache directories (Spaces containers start with an empty /tmp).
Path('/tmp/transformers_cache').mkdir(parents=True, exist_ok=True)
Path('/tmp/huggingface').mkdir(parents=True, exist_ok=True)

import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

# Hub repo id of the fine-tuned sequence-classification detector.
MODEL_DIR = "abhi099k/ai-text-detector-v-n4.0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lazily populated by get_components(); None until the first load succeeds.
_tokenizer = None
_config = None
_model = None
def get_components():
    """Lazily load and cache the tokenizer, config and model.

    Retries up to three times on OSError (typically HF cache/lock conflicts
    on shared Spaces storage), clearing stale ``*.lock`` files between
    attempts with linear backoff. Re-raises the final OSError if every
    attempt fails.

    Returns:
        tuple: ``(tokenizer, config, model)`` with the model in eval mode
        on the module-level ``device``.
    """
    global _tokenizer, _config, _model
    if _tokenizer is None:
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print(f"Loading model components... (Attempt {attempt + 1}/{max_retries})")
                _tokenizer = AutoTokenizer.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _config = AutoConfig.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_DIR,
                    config=_config,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                ).to(device)
                _model.eval()
                print("Model loaded successfully!")
                break
            except OSError as e:
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2  # linear backoff: 2s, 4s
                    print(f"Cache conflict detected, retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    # Best-effort cleanup of stale download locks left behind
                    # by a crashed or concurrent process.
                    cache_path = Path('/tmp/transformers_cache')
                    if cache_path.exists():
                        for lock_file in cache_path.glob("*.lock"):
                            try:
                                lock_file.unlink()
                                print(f"Removed lock file: {lock_file}")
                            except OSError:
                                # Lock is held or already gone; only OS errors
                                # are expected here — don't swallow everything.
                                pass
                else:
                    print(f"Failed to load model after {max_retries} attempts: {e}")
                    raise
    return _tokenizer, _config, _model
| # === Preprocessing: Normalize + Flatten === | |
def preprocess_text_for_detection(text: str) -> str:
    """Flatten structured notes (bullets, dashes, line breaks) into plain
    sentence-like text suitable for the AI detector.

    Non-string or empty input yields "".
    """
    if not isinstance(text, str) or not text:
        return ""
    # Newlines, bullets and dashes become sentence breaks.
    flattened = re.sub(r"[\n•\-–]+", ". ", text)
    # Collapse any run of whitespace into a single space.
    flattened = re.sub(r"\s+", " ", flattened)
    # Normalise punctuation spacing: none before, exactly one after.
    flattened = re.sub(r"\s*([,.!?;:])\s*", r"\1 ", flattened)
    return flattened.strip()
| # === Core Scoring === | |
def score_text(text, max_len=512):
    """Return the probability (float in [0, 1]) that *text* is AI-generated."""
    tokenizer, config, model = get_components()
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    ).to(device)
    # Drop token_type_ids for architectures that do not accept them.
    inputs.pop("token_type_ids", None)
    with torch.no_grad():
        output = model(**inputs)
    probabilities = torch.softmax(output.logits, dim=-1).cpu().numpy()
    # Index 1 of the class axis is the "AI" label.
    return float(probabilities[0][1])
| # === Artifact Detection === | |
| def has_html_or_ai_artifacts(text: str) -> bool: | |
| """Detect HTML tags or attributes typical of copy-pasted AI output.""" | |
| if not text: | |
| return False | |
| html_pattern = re.compile(r'<[^>]+>') | |
| data_attr_pattern = re.compile(r'data-(start|end)=["\']?\d+') | |
| return bool(html_pattern.search(text) or data_attr_pattern.search(text)) | |
| # === Main Prediction Function === | |
def analyze_text(text, threshold=0.5, chunk_size=80):
    """
    Analyze text and classify it as AI-generated or human-written.

    Args:
        text (str): Input text to analyze.
        threshold (float): AI-probability cutoff in [0, 1]; scores at or
            above it are labelled "AI".
        chunk_size (int): Currently unused; kept only for backward
            compatibility with existing callers.

    Returns:
        dict: Always contains "overall_type", "overall_confidence",
        "overall_score" and "has_artifacts"; an "error" key is added on
        failure. (The error paths now include "has_artifacts" too, so the
        schema is uniform and callers need no key guards.)
    """
    if not text or not text.strip():
        return {
            "error": "No text provided",
            "overall_type": "Unknown",
            "overall_confidence": 0.0,
            "overall_score": 0.0,
            "has_artifacts": False
        }
    try:
        # Detect copy-paste artifacts (HTML tags / data-* attributes) before
        # preprocessing strips them away.
        has_artifacts = has_html_or_ai_artifacts(text)
        processed_text = preprocess_text_for_detection(text)
        if not processed_text:
            return {
                "error": "Text too short or invalid after preprocessing",
                "overall_type": "Unknown",
                "overall_confidence": 0.0,
                "overall_score": 0.0,
                "has_artifacts": has_artifacts
            }
        ai_score = score_text(processed_text)
        overall_type = "AI" if ai_score >= threshold else "Human"
        # Confidence is the probability of the label actually chosen.
        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)
        return {
            "overall_type": overall_type,
            "overall_confidence": float(overall_confidence),
            "overall_score": float(ai_score),
            "has_artifacts": has_artifacts
        }
    except Exception as e:
        # Boundary handler: surface failure as data rather than raising, so
        # UI callers always receive a result dict.
        return {
            "error": f"Analysis failed: {str(e)}",
            "overall_type": "Error",
            "overall_confidence": 0.0,
            "overall_score": 0.0,
            "has_artifacts": False
        }
| # Pre-load model when module is imported (optional) | |
# Eagerly warm the model at import time so the first request is fast.
# Failure here is non-fatal: get_components() will simply load lazily on
# the first call instead.
try:
    print("Pre-loading model components...")
    get_components()
    print("Model pre-loaded successfully!")
except Exception as e:
    print(f"Pre-loading failed, will load on first use: {e}")