File size: 6,313 Bytes
8e274e9
 
 
 
25121a7
 
 
8e274e9
25121a7
 
 
 
 
 
 
 
 
8e274e9
 
25121a7
 
 
 
8e274e9
25121a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e274e9
 
 
 
 
 
25121a7
 
 
8e274e9
 
 
 
 
 
 
 
 
 
 
 
09abcdc
 
25121a7
 
8e274e9
09abcdc
8e274e9
 
 
 
 
 
 
 
 
 
09abcdc
8e274e9
 
 
09abcdc
 
8e274e9
 
 
 
25121a7
 
 
8e274e9
 
 
25121a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09abcdc
25121a7
 
 
 
 
 
09abcdc
 
25121a7
09abcdc
25121a7
09abcdc
 
 
 
25121a7
 
09abcdc
 
25121a7
09abcdc
 
 
25121a7
 
 
 
09abcdc
 
25121a7
 
 
 
 
 
 
09abcdc
25121a7
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import numpy as np
import re
import os
import time
from pathlib import Path

# Configure cache for Hugging Face Spaces
# /tmp is the only reliably writable location in a Space container.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favour of HF_HOME — confirm against the pinned version.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
os.environ['HF_HOME'] = '/tmp/huggingface'

# Create cache directories
# exist_ok=True makes this safe across restarts / concurrent workers.
Path('/tmp/transformers_cache').mkdir(parents=True, exist_ok=True)
Path('/tmp/huggingface').mkdir(parents=True, exist_ok=True)

# Hugging Face Hub repo id of the detector checkpoint.
MODEL_DIR = "abhi099k/ai-text-detector-v-n4.0"
# Prefer GPU when available; all tensors/model are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize as None, load on first use
# (populated lazily by get_components(); treated as a module-level cache)
_tokenizer = None
_config = None
_model = None

def get_components():
    """Lazily load and cache the tokenizer, config, and model.

    Retries on OSError (typically Hub cache-lock conflicts between
    concurrent workers), removing stale ``*.lock`` files between attempts.

    Returns:
        tuple: ``(tokenizer, config, model)`` — model is on ``device``
        and in eval mode.

    Raises:
        OSError: if loading still fails after all retries.
    """
    global _tokenizer, _config, _model

    # Guard on _model — the LAST component assigned — rather than _tokenizer.
    # A previous attempt that failed mid-way can leave _tokenizer set while
    # _model is still None; guarding on _tokenizer would then return a
    # (tokenizer, config, None) triple and crash the caller.
    if _model is None:
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print(f"Loading model components... (Attempt {attempt + 1}/{max_retries})")
                _tokenizer = AutoTokenizer.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _config = AutoConfig.from_pretrained(
                    MODEL_DIR,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                )
                _model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_DIR,
                    config=_config,
                    cache_dir='/tmp/transformers_cache',
                    local_files_only=False
                ).to(device)
                _model.eval()
                print("Model loaded successfully!")
                break
            except OSError as e:
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"Cache conflict detected, retrying in {wait_time} seconds...")

                    # Clear stale lock files BEFORE sleeping, so the wait
                    # actually gives the (now unlocked) cache time to settle.
                    cache_path = Path('/tmp/transformers_cache')
                    if cache_path.exists():
                        for lock_file in cache_path.glob("*.lock"):
                            try:
                                lock_file.unlink()
                                print(f"Removed lock file: {lock_file}")
                            except OSError:
                                # Lock is held or already removed by another
                                # process — best effort only.
                                pass

                    time.sleep(wait_time)
                else:
                    print(f"Failed to load model after {max_retries} attempts: {e}")
                    raise

    return _tokenizer, _config, _model

# === Preprocessing: Normalize + Flatten ===
def preprocess_text_for_detection(text: str) -> str:
    """Flatten structured notes (bullets, dashes, newlines) into plain
    sentence-like text suitable for the detector.

    Returns "" for empty or non-string input.
    """
    if not isinstance(text, str) or not text:
        return ""

    # Runs of newlines, bullets, and dashes become sentence breaks.
    flattened = re.sub(r"[\n•\-–]+", ". ", text)

    # Collapse every whitespace run to a single space.
    flattened = re.sub(r"\s+", " ", flattened)

    # Normalize punctuation spacing: none before, exactly one after.
    flattened = re.sub(r"\s*([,.!?;:])\s*", r"\1 ", flattened)

    return flattened.strip()

# === Core Scoring ===
def score_text(text, max_len=512):
    """Return the probability (float in 0-1) that *text* is AI-generated."""
    tokenizer, _, model = get_components()

    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    ).to(device)

    # Not every architecture accepts token_type_ids; drop them if present.
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        output = model(**inputs)
        probabilities = torch.softmax(output.logits, dim=-1).cpu().numpy()

    # Index 1 is taken to be the "AI" label — presumably matches
    # config.id2label; verify against the checkpoint.
    return float(probabilities[0][1])

# === Artifact Detection ===
def has_html_or_ai_artifacts(text: str) -> bool:
    """Return True when *text* carries HTML tags or data-start/end
    attributes typical of AI output pasted from a rich-text interface."""
    if not text:
        return False

    looks_like_html = re.search(r'<[^>]+>', text) is not None
    has_data_attrs = re.search(r'data-(start|end)=["\']?\d+', text) is not None
    return looks_like_html or has_data_attrs

# === Main Prediction Function ===
def analyze_text(text, threshold=0.5, chunk_size=80):
    """
    Main function to analyze text and detect AI-generated content.

    Args:
        text (str): Input text to analyze.
        threshold (float): AI-probability cutoff in [0, 1]; scores at or
            above it are labelled "AI".
        chunk_size (int): Accepted for backward compatibility; currently
            unused — the whole text is scored in a single pass.

    Returns:
        dict: "overall_type", "overall_confidence", "overall_score",
            plus "has_artifacts" on success or "error" on failure.
    """
    # This guard runs BEFORE the try block, so a truthy non-string input
    # (e.g. an int) must be rejected here — otherwise text.strip() would
    # raise an uncaught AttributeError and crash the caller.
    if not text or not isinstance(text, str) or not text.strip():
        return {
            "error": "No text provided",
            "overall_type": "Unknown",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

    try:
        # Detect copy-paste artifacts on the raw text, before flattening
        # removes the markup they rely on.
        has_artifacts = has_html_or_ai_artifacts(text)

        # Flatten bullets/newlines into sentence-like text.
        processed_text = preprocess_text_for_detection(text)

        if not processed_text:
            return {
                "error": "Text too short or invalid after preprocessing",
                "overall_type": "Unknown",
                "overall_confidence": 0.0,
                "overall_score": 0.0
            }

        # Probability (0-1) that the processed text is AI-generated.
        ai_score = score_text(processed_text)

        # Confidence is always reported in favour of the chosen label.
        overall_type = "AI" if ai_score >= threshold else "Human"
        overall_confidence = ai_score if overall_type == "AI" else (1 - ai_score)

        return {
            "overall_type": overall_type,
            "overall_confidence": float(overall_confidence),
            "overall_score": float(ai_score),
            "has_artifacts": has_artifacts
        }

    except Exception as e:
        # Boundary handler: surface failures (model load, inference) as a
        # structured result instead of propagating into the UI layer.
        return {
            "error": f"Analysis failed: {str(e)}",
            "overall_type": "Error",
            "overall_confidence": 0.0,
            "overall_score": 0.0
        }

# Pre-load model when module is imported (optional)
# Warms the model cache at import time so the first request is fast.
# The broad except is deliberate: a failed pre-load must not abort the
# import — get_components() will simply retry on first use.
try:
    print("Pre-loading model components...")
    get_components()
    print("Model pre-loaded successfully!")
except Exception as e:
    print(f"Pre-loading failed, will load on first use: {e}")