File size: 14,663 Bytes
9fd3fe2
 
1583931
a1d03f0
bf58373
 
2c1a8bf
9fd3fe2
f7d6571
9fd3fe2
d9197a8
f7d6571
 
 
 
 
799764f
9fd3fe2
d9197a8
9fd3fe2
 
a2e7efa
f7d6571
9fd3fe2
 
799764f
9fd3fe2
 
 
 
 
 
 
 
1583931
9fd3fe2
 
f7d6571
 
9fd3fe2
f7d6571
1583931
 
 
 
 
 
 
f7d6571
1583931
f7d6571
1583931
 
f7d6571
9fd3fe2
 
 
 
 
 
2c1a8bf
9fd3fe2
1583931
799764f
9fd3fe2
 
 
 
 
 
f7d6571
9fd3fe2
799764f
9fd3fe2
799764f
 
 
 
 
 
ca5755e
f7d6571
ca5755e
 
f7d6571
 
 
 
799764f
d9197a8
f7d6571
 
799764f
 
 
 
f7d6571
1583931
9fd3fe2
 
ca5755e
f7d6571
799764f
 
 
 
bf58373
9fd3fe2
 
799764f
ca5755e
9fd3fe2
799764f
d9197a8
ca5755e
f7d6571
 
 
 
ca5755e
1583931
ca5755e
f7d6571
 
 
 
ca5755e
f7d6571
 
 
 
 
9fd3fe2
 
 
1583931
f7d6571
1583931
799764f
7a4073f
799764f
 
 
 
 
7a4073f
bf58373
f7d6571
 
dc6d9a4
799764f
f7d6571
 
 
 
 
799764f
d9197a8
 
f7d6571
bf58373
 
f7d6571
ca5755e
f7d6571
ca5755e
799764f
 
fd6022c
bf58373
ca5755e
f7d6571
 
 
 
ca5755e
f7d6571
 
 
 
ca5755e
 
bf58373
f7d6571
 
 
 
 
bf58373
ca5755e
f7d6571
bf58373
2c1a8bf
f7d6571
 
ca5755e
f7d6571
ca5755e
f7d6571
 
 
 
 
 
 
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
 
 
ca5755e
f7d6571
 
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca5755e
1583931
9fd3fe2
799764f
1583931
f7d6571
 
 
 
 
 
 
9fd3fe2
 
 
bf58373
 
 
 
f7d6571
 
1583931
f7d6571
 
 
 
1583931
f7d6571
 
 
 
 
 
 
 
1583931
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf58373
 
 
f7d6571
1583931
fd6022c
f7d6571
bf58373
f7d6571
 
 
ca5755e
1583931
f7d6571
 
ca5755e
f7d6571
 
ca5755e
f7d6571
 
 
 
 
d9197a8
f7d6571
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
ca5755e
fd6022c
f7d6571
 
 
fd6022c
f7d6571
 
fd6022c
 
f7d6571
 
 
 
fd6022c
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd6022c
f7d6571
 
ca5755e
fd6022c
f7d6571
 
 
fd6022c
f7d6571
 
fd6022c
f7d6571
 
bf58373
 
 
fd6022c
799764f
 
 
 
 
 
bf58373
 
f7d6571
 
 
bf58373
ca5755e
f7d6571
 
 
 
1583931
f7d6571
 
 
 
 
 
 
bf58373
f7d6571
bf58373
 
f7d6571
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, field_validator, ValidationInfo
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import logging
from typing import Optional, List
import time
import sys

# ---------------- Logging ----------------
# Root-logger config: INFO level, timestamped lines, routed to stdout so
# container platforms (Docker/Spaces) capture the output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
# Named logger used throughout this module.
logger = logging.getLogger("detector")

# ---------------- FastAPI ----------------
app = FastAPI(
    title="Detextly AI Detector API",
    description="AI Detector with chunked scoring and low-confidence filter",
    version="2.1.0"
)

# CORS
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec for credentialed requests (browsers reject a
# wildcard origin when credentials are sent) — confirm whether credentials
# are actually required, or pin the allowed origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------------- Pydantic Models ----------------
class ScanRequest(BaseModel):
    """Request payload for /api/scan.

    Accepts both the canonical ``scan_type`` field and the legacy
    ``scanType`` alias so older clients keep working.
    """
    text: str
    scan_type: Optional[str] = None   # canonical field: "basic" or "highlight"
    scanType: Optional[str] = None    # legacy alias kept for old clients
    userId: Optional[str] = None      # optional caller identifier (logging only)

    @field_validator('scanType')
    @classmethod
    def map_scantype_to_scan_type(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Mapper to ensure backward compatibility with old 'scanType' parameter name."""
        if v is not None:
            # Best-effort copy of the legacy value into the canonical field.
            # NOTE: mutating info.data is not a documented Pydantic v2
            # contract for setting sibling fields, so get_scan_type() below
            # also consults scanType directly as a safety net.
            info.data['scan_type'] = v
        return v

    def get_scan_type(self) -> str:
        """Return the effective scan type, defaulting to 'basic'.

        ``scan_type`` takes precedence as the canonical field; the legacy
        ``scanType`` alias is used as a direct fallback in case the
        validator-based copy above did not take effect.
        """
        return self.scan_type or self.scanType or "basic"

class ScanResponse(BaseModel):
    """Response envelope returned by /api/scan."""
    success: bool                    # True when the scan completed without error
    result: dict                     # scan payload (probabilities, confidence, sections, ...)
    processingTime: int              # wall-clock processing time in milliseconds
    credits: Optional[dict] = None   # static credit info (mirrors /api/credits)
    test_mode: bool = False          # always False in this deployment

# ---------------- AI Detector Core ----------------
# Hugging Face checkpoint used for sequence classification.
MODEL_NAME = "openai-community/roberta-large-openai-detector"

class AIDetector:
    """Lazily-loading wrapper around the HF RoBERTa sequence classifier.

    The model and tokenizer are only loaded on the first call to
    load_model() (or implicitly via predict()), so constructing the
    object is cheap.
    """

    def __init__(self) -> None:
        self.model = None       # AutoModelForSequenceClassification, set by load_model()
        self.tokenizer = None   # AutoTokenizer, set by load_model()
        # Prefer GPU when available; inputs are moved to this device in predict().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.label_map = None   # id2label mapping from the model config (debug aid)
        logger.info(f"Using device: {self.device}")

    def load_model(self) -> None:
        """Load tokenizer and model once; no-op if already loaded.

        Raises:
            RuntimeError: if the model or tokenizer cannot be loaded.
        """
        if self.model is not None:
            return
        logger.info(f"Loading model: {MODEL_NAME}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
            
            # Store label mapping for debugging
            if hasattr(self.model.config, 'id2label'):
                self.label_map = self.model.config.id2label
                logger.info(f"Model label mapping: {self.label_map}")
            else:
                logger.warning("No label mapping found in model config")
                
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise RuntimeError(f"Failed to load model: {e}")
        
        self.model.to(self.device)
        self.model.eval()
        logger.info("Model loaded successfully.")

    def predict(self, text: str, max_length: int = 512) -> dict:
        """Return both human and AI probabilities.

        Args:
            text: Input passage to score (truncated to max_length tokens).
            max_length: Tokenizer truncation limit.

        Returns:
            Dict with "human_probability" and "ai_probability" (floats that
            should sum to ~1.0) plus "raw_probs" (softmax output as lists).
        """
        if self.model is None:
            self.load_model()  # lazy load on first use
        
        # Tokenize input
        tokens = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True
        )
        tokens = {k: v.to(self.device) for k, v in tokens.items()}
        
        with torch.no_grad():
            outputs = self.model(**tokens)
            probs = torch.softmax(outputs.logits, dim=-1)
            
            # NOTE(review): index 0 is assumed "human" and index 1 "AI" here.
            # For the openai-detector checkpoints the config's id2label has
            # been reported as {0: 'Fake', 1: 'Real'} — i.e. possibly the
            # reverse. Verify against the mapping logged in load_model().
            human_prob = float(probs[0][0].item())  # Class 0
            ai_prob = float(probs[0][1].item())     # Class 1
            
            # Debug logging
            logger.debug(f"Class 0 (Human): {human_prob:.4f}, Class 1 (AI): {ai_prob:.4f}")
            
            # Verify probabilities sum to ~1.0
            total = human_prob + ai_prob
            if abs(total - 1.0) > 0.01:
                logger.warning(f"Probabilities don't sum to 1.0: {total:.4f}")
            
            return {
                "human_probability": human_prob,
                "ai_probability": ai_prob,
                "raw_probs": probs.tolist()
            }

detector = AIDetector()

# ---------------- Pattern Detection ----------------
def detect_chatgpt_patterns(text: str) -> bool:
    """Check *text* (case-insensitively) for known ChatGPT boilerplate.

    Returns True as soon as any stock disclaimer phrase is found,
    False otherwise.
    """
    patterns = [
        "as an ai language model",
        "i am an ai model",
        "i cannot provide medical",
        "as a language model",
        "based on the information provided",
        "my training data",
        "i don't have personal experiences",
        "i don't have feelings",
        "as an artificial intelligence",
        "i don't have personal opinions"
    ]
    haystack = text.lower()
    # Grab the first matching phrase (if any) so it can be logged.
    pattern = next((p for p in patterns if p in haystack), None)
    if pattern is None:
        return False
    logging.getLogger("detector").debug(f"ChatGPT pattern detected: {pattern}")
    return True

# ---------------- Highlight / Chunked Scan ----------------
def analyze_sections(text: str, chunk_size: int = 40) -> List[dict]:
    """Score *text* chunk by chunk.

    Splits the text into windows of roughly ``chunk_size`` words, runs the
    detector on each window, and returns one result dict per window.
    Windows shorter than 20 characters are skipped.
    """
    log = logging.getLogger("detector")
    words = text.split()
    n_chunks = (len(words) + chunk_size - 1) // chunk_size

    log.info(f"Analyzing {len(words)} words in {n_chunks} chunks")

    results: List[dict] = []
    for start in range(0, len(words), chunk_size):
        piece = " ".join(words[start:start + chunk_size])
        if len(piece.strip()) < 20:
            continue

        # Model scores for this window.
        scores = detector.predict(piece)
        p_human = scores["human_probability"]
        p_ai = scores["ai_probability"]

        # Known ChatGPT boilerplate overrides the model toward "AI".
        pattern_hit = detect_chatgpt_patterns(piece)
        if pattern_hit:
            p_ai = max(p_ai, 0.9)
            p_human = 1 - p_ai

        preview = piece[:200] + "..." if len(piece) > 200 else piece
        results.append({
            "text": preview,
            "ai_probability": round(p_ai, 4),
            "human_probability": round(p_human, 4),
            "words": len(piece.split()),
            "has_chatgpt_pattern": pattern_hit
        })

    log.info(f"Generated {len(results)} sections for analysis")
    return results

def compute_overall_score(sections: List[dict], confidence_threshold: float = 0.3) -> dict:
    """Aggregate per-chunk scores into one overall verdict.

    Chunks whose AI probability sits within ``confidence_threshold`` of the
    0.5 "coin flip" point are treated as uncertain and excluded from the
    weighted average — unless that would exclude everything, in which case
    all chunks are used. Remaining chunks are averaged weighted by word
    count, and the verdict is bucketed into low/medium/high confidence.
    """
    if not sections:
        return {"ai_probability": 0.0, "human_probability": 1.0, "confidence": "low"}

    # Keep only chunks scoring far enough from the undecided midpoint.
    usable = [
        s for s in sections
        if abs(s["ai_probability"] - 0.5) >= confidence_threshold
    ]
    if not usable:
        # Nothing confident: fall back to averaging every chunk.
        usable = sections

    word_total = sum(s["words"] for s in usable)
    if word_total == 0:
        return {"ai_probability": 0.5, "human_probability": 0.5, "confidence": "low"}

    avg_ai = sum(s["ai_probability"] * s["words"] for s in usable) / word_total
    avg_human = sum(s["human_probability"] * s["words"] for s in usable) / word_total

    # Bucket the verdict by distance from the undecided midpoint.
    margin = abs(avg_ai - 0.5)
    if margin > 0.4:
        level = "high"
    elif margin > 0.2:
        level = "medium"
    else:
        level = "low"

    return {
        "ai_probability": round(avg_ai, 4),
        "human_probability": round(avg_human, 4),
        "confidence": level,
        "sections_analyzed": len(sections),
        "confident_sections": len(usable)
    }

# ---------------- API Endpoints ----------------
@app.on_event("startup")
async def startup():
    """Initialize the model on startup.

    Eagerly loads the detector so the first request doesn't pay the model
    load cost; re-raises on failure so the process exits rather than
    serving with a broken model.
    """
    # NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
    # favor of lifespan handlers — works here, but worth migrating.
    logger.info("Starting Detextly AI Detector API...")
    try:
        detector.load_model()
        logger.info("API startup complete")
    except Exception as e:
        logger.error(f"Failed to start API: {e}")
        raise

@app.get("/")
async def root():
    """Service banner: model, device, version and supported features."""
    banner = {
        "status": "online",
        "model": MODEL_NAME,
        "device": str(detector.device),
        "version": "2.1.0",
        "features": ["basic_scan", "highlight_scan", "chatgpt_pattern_detection"],
        "note": "Accepts both 'scan_type' and 'scanType' parameters",
    }
    return banner

@app.get("/health")
async def health():
    """Liveness/readiness probe: reports whether the model is loaded."""
    loaded = detector.model is not None
    return {
        "status": "healthy",
        "model_loaded": loaded,
        "model": MODEL_NAME,
        "timestamp": time.time(),
    }

@app.get("/debug/test")
async def debug_test():
    """Score a few fixed sentences to sanity-check the model's outputs."""
    samples = (
        "I went to the store yesterday to buy groceries.",
        "As an AI language model, I don't have personal experiences.",
        "The quick brown fox jumps over the lazy dog."
    )

    outcomes = []
    for sample in samples:
        scores = detector.predict(sample)
        preview = sample[:50] + "..." if len(sample) > 50 else sample
        outcomes.append({
            "text": preview,
            "human_probability": scores["human_probability"],
            "ai_probability": scores["ai_probability"]
        })

    return {
        "test_results": outcomes,
        "model_info": {
            "name": MODEL_NAME,
            "labels": detector.label_map,
            "device": str(detector.device)
        }
    }

def _highlight_result(text: str, chatgpt_detected: bool) -> dict:
    """Build the result payload for a chunked 'highlight' scan."""
    sections = analyze_sections(text, chunk_size=40)
    overall = compute_overall_score(sections)

    # Sections the UI should flag as likely AI-written.
    ai_sections = [
        {
            "text": s["text"],
            "ai_probability": s["ai_probability"],
            "human_probability": s["human_probability"],
            "words": s["words"]
        }
        for s in sections if s["ai_probability"] > 0.6
    ]

    return {
        "overall": overall["human_probability"],  # Human probability for backward compatibility
        "ai_probability": overall["ai_probability"],
        "human_probability": overall["human_probability"],
        "model": MODEL_NAME,
        "confidence": overall["confidence"],
        "chatgpt_detected": chatgpt_detected,
        "scan_type": "highlight",
        "section_count": len(sections),
        "ai_section_count": len(ai_sections),
        "sections_analyzed": overall["sections_analyzed"],
        "confident_sections": overall["confident_sections"],
        "ai_sections": ai_sections[:10]  # Limit payload to first 10
    }

def _basic_result(text: str, chatgpt_detected: bool) -> dict:
    """Build the result payload for a single-pass 'basic' scan."""
    probs = detector.predict(text)
    human_prob = probs["human_probability"]
    ai_prob = probs["ai_probability"]

    # Known ChatGPT boilerplate overrides the model toward "AI".
    if chatgpt_detected:
        ai_prob = max(ai_prob, 0.9)
        human_prob = 1 - ai_prob

    # Confidence buckets by distance from the undecided midpoint.
    distance_from_mid = abs(ai_prob - 0.5)
    confidence = "high" if distance_from_mid > 0.4 else "medium" if distance_from_mid > 0.2 else "low"

    return {
        "overall": human_prob,  # Human probability for backward compatibility
        "ai_probability": ai_prob,
        "human_probability": human_prob,
        "model": MODEL_NAME,
        "confidence": confidence,
        "chatgpt_detected": chatgpt_detected,
        "scan_type": "basic"
    }

@app.post("/api/scan", response_model=ScanResponse)
async def scan_text(request: ScanRequest):
    """Main scanning endpoint.

    Validates the input, routes to either the chunked 'highlight' scan or
    the single-pass 'basic' scan, and wraps the result in a ScanResponse.

    Raises:
        HTTPException(400): text missing or shorter than 10 characters.
        HTTPException(500): any unexpected failure during scanning.
    """
    start_time = time.time()

    try:
        # Validate input before doing any work.
        if not request.text or len(request.text.strip()) < 10:
            raise HTTPException(status_code=400, detail="Text must be at least 10 characters long.")

        # Get scan type (handles both scan_type and scanType)
        scan_type = request.get_scan_type()
        logger.info(f"Scan request: type={scan_type}, userId={request.userId}, text_length={len(request.text)}")

        # Limit text length for performance
        text = request.text[:5000]

        # Check for ChatGPT patterns once; both scan paths use the flag.
        chatgpt_detected = detect_chatgpt_patterns(text)

        if scan_type == "highlight":
            result = _highlight_result(text, chatgpt_detected)
        else:
            result = _basic_result(text, chatgpt_detected)

        # Calculate processing time
        processing_time = int((time.time() - start_time) * 1000)
        logger.info(f"Scan completed in {processing_time}ms: AI={result.get('ai_probability', 0):.2%}")

        return ScanResponse(
            success=True,
            result=result,
            processingTime=processing_time,
            credits={
                "basic": 5,
                "highlight": 1,
                "resetTime": "2024-12-31T23:59:59Z",
                "test_mode": False
            },
            test_mode=False
        )

    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Scan error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

@app.get("/api/credits")
async def get_credits(userId: Optional[str] = None):
    """Return static credit information (kept for worker compatibility)."""
    payload = {
        "basic": 5,
        "highlight": 1,
        "resetTime": "2024-12-31T23:59:59Z",
        "test_mode": False,
    }
    payload["userId"] = userId or "unknown"
    return payload

# ---------------- Main Entry Point ----------------
if __name__ == "__main__":
    import uvicorn
    # Bind on all interfaces; port 7860 — commonly the Hugging Face Spaces
    # default — presumably chosen for that deployment target; confirm.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        log_level="info",
        access_log=True
    )