File size: 14,663 Bytes
9fd3fe2
 
1583931
a1d03f0
bf58373
 
2c1a8bf
9fd3fe2
f7d6571
9fd3fe2
d9197a8
f7d6571
 
 
 
 
799764f
9fd3fe2
d9197a8
9fd3fe2
 
a2e7efa
f7d6571
9fd3fe2
 
799764f
9fd3fe2
 
 
 
 
 
 
 
1583931
9fd3fe2
 
f7d6571
 
9fd3fe2
f7d6571
1583931
 
 
 
 
 
 
f7d6571
1583931
f7d6571
1583931
 
f7d6571
9fd3fe2
 
 
 
 
 
2c1a8bf
9fd3fe2
1583931
799764f
9fd3fe2
 
 
 
 
 
f7d6571
9fd3fe2
799764f
9fd3fe2
799764f
 
 
 
 
 
ca5755e
f7d6571
ca5755e
 
f7d6571
 
 
 
799764f
d9197a8
f7d6571
 
799764f
 
 
 
f7d6571
1583931
9fd3fe2
 
ca5755e
f7d6571
799764f
 
 
 
bf58373
9fd3fe2
 
799764f
ca5755e
9fd3fe2
799764f
d9197a8
ca5755e
f7d6571
 
 
 
ca5755e
1583931
ca5755e
f7d6571
 
 
 
ca5755e
f7d6571
 
 
 
 
9fd3fe2
 
 
1583931
f7d6571
1583931
799764f
7a4073f
799764f
 
 
 
 
7a4073f
bf58373
f7d6571
 
dc6d9a4
799764f
f7d6571
 
 
 
 
799764f
d9197a8
 
f7d6571
bf58373
 
f7d6571
ca5755e
f7d6571
ca5755e
799764f
 
fd6022c
bf58373
ca5755e
f7d6571
 
 
 
ca5755e
f7d6571
 
 
 
ca5755e
 
bf58373
f7d6571
 
 
 
 
bf58373
ca5755e
f7d6571
bf58373
2c1a8bf
f7d6571
 
ca5755e
f7d6571
ca5755e
f7d6571
 
 
 
 
 
 
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
 
 
ca5755e
f7d6571
 
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca5755e
1583931
9fd3fe2
799764f
1583931
f7d6571
 
 
 
 
 
 
9fd3fe2
 
 
bf58373
 
 
 
f7d6571
 
1583931
f7d6571
 
 
 
1583931
f7d6571
 
 
 
 
 
 
 
1583931
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf58373
 
 
f7d6571
1583931
fd6022c
f7d6571
bf58373
f7d6571
 
 
ca5755e
1583931
f7d6571
 
ca5755e
f7d6571
 
ca5755e
f7d6571
 
 
 
 
d9197a8
f7d6571
ca5755e
f7d6571
 
 
 
 
 
 
 
 
 
ca5755e
fd6022c
f7d6571
 
 
fd6022c
f7d6571
 
fd6022c
 
f7d6571
 
 
 
fd6022c
f7d6571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd6022c
f7d6571
 
ca5755e
fd6022c
f7d6571
 
 
fd6022c
f7d6571
 
fd6022c
f7d6571
 
bf58373
 
 
fd6022c
799764f
 
 
 
 
 
bf58373
 
f7d6571
 
 
bf58373
ca5755e
f7d6571
 
 
 
1583931
f7d6571
 
 
 
 
 
 
bf58373
f7d6571
bf58373
 
f7d6571
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, field_validator, ValidationInfo
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import logging
from typing import Optional, List
import time
import sys

# ---------------- Logging ----------------
# Root-logger config: INFO level, timestamped lines, routed to stdout so
# container platforms (Docker/Spaces) capture the output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
# Named logger used throughout this module.
logger = logging.getLogger("detector")

# ---------------- FastAPI ----------------
app = FastAPI(
    title="Detextly AI Detector API",
    description="AI Detector with chunked scoring and low-confidence filter",
    version="2.1.0"
)

# CORS
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec for credentialed requests (browsers reject a
# wildcard origin when credentials are sent) — confirm whether credentials
# are actually required, or pin the allowed origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------------- Pydantic Models ----------------
class ScanRequest(BaseModel):
    """Request payload for /api/scan.

    Accepts both the canonical ``scan_type`` field and the legacy
    ``scanType`` alias so older clients keep working.
    """
    text: str
    scan_type: Optional[str] = None   # canonical field: "basic" or "highlight"
    scanType: Optional[str] = None    # legacy alias kept for old clients
    userId: Optional[str] = None      # optional caller identifier (logging only)

    @field_validator('scanType')
    @classmethod
    def map_scantype_to_scan_type(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Mapper to ensure backward compatibility with old 'scanType' parameter name."""
        if v is not None:
            # Best-effort copy of the legacy value into the canonical field.
            # NOTE: mutating info.data is not a documented Pydantic v2
            # contract for setting sibling fields, so get_scan_type() below
            # also consults scanType directly as a safety net.
            info.data['scan_type'] = v
        return v

    def get_scan_type(self) -> str:
        """Return the effective scan type, defaulting to 'basic'.

        ``scan_type`` takes precedence as the canonical field; the legacy
        ``scanType`` alias is used as a direct fallback in case the
        validator-based copy above did not take effect.
        """
        return self.scan_type or self.scanType or "basic"

class ScanResponse(BaseModel):
    """Response envelope returned by /api/scan."""
    success: bool                    # True when the scan completed without error
    result: dict                     # scan payload (probabilities, confidence, sections, ...)
    processingTime: int              # wall-clock processing time in milliseconds
    credits: Optional[dict] = None   # static credit info (mirrors /api/credits)
    test_mode: bool = False          # always False in this deployment

# ---------------- AI Detector Core ----------------
# Hugging Face checkpoint used for sequence classification.
MODEL_NAME = "openai-community/roberta-large-openai-detector"

class AIDetector:
    """Lazily-loading wrapper around the HF RoBERTa sequence classifier.

    The model and tokenizer are only loaded on the first call to
    load_model() (or implicitly via predict()), so constructing the
    object is cheap.
    """

    def __init__(self) -> None:
        self.model = None       # AutoModelForSequenceClassification, set by load_model()
        self.tokenizer = None   # AutoTokenizer, set by load_model()
        # Prefer GPU when available; inputs are moved to this device in predict().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.label_map = None   # id2label mapping from the model config (debug aid)
        logger.info(f"Using device: {self.device}")

    def load_model(self) -> None:
        """Load tokenizer and model once; no-op if already loaded.

        Raises:
            RuntimeError: if the model or tokenizer cannot be loaded.
        """
        if self.model is not None:
            return
        logger.info(f"Loading model: {MODEL_NAME}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
            
            # Store label mapping for debugging
            if hasattr(self.model.config, 'id2label'):
                self.label_map = self.model.config.id2label
                logger.info(f"Model label mapping: {self.label_map}")
            else:
                logger.warning("No label mapping found in model config")
                
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise RuntimeError(f"Failed to load model: {e}")
        
        self.model.to(self.device)
        self.model.eval()
        logger.info("Model loaded successfully.")

    def predict(self, text: str, max_length: int = 512) -> dict:
        """Return both human and AI probabilities.

        Args:
            text: Input passage to score (truncated to max_length tokens).
            max_length: Tokenizer truncation limit.

        Returns:
            Dict with "human_probability" and "ai_probability" (floats that
            should sum to ~1.0) plus "raw_probs" (softmax output as lists).
        """
        if self.model is None:
            self.load_model()  # lazy load on first use
        
        # Tokenize input
        tokens = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True
        )
        tokens = {k: v.to(self.device) for k, v in tokens.items()}
        
        with torch.no_grad():
            outputs = self.model(**tokens)
            probs = torch.softmax(outputs.logits, dim=-1)
            
            # NOTE(review): index 0 is assumed "human" and index 1 "AI" here.
            # For the openai-detector checkpoints the config's id2label has
            # been reported as {0: 'Fake', 1: 'Real'} — i.e. possibly the
            # reverse. Verify against the mapping logged in load_model().
            human_prob = float(probs[0][0].item())  # Class 0
            ai_prob = float(probs[0][1].item())     # Class 1
            
            # Debug logging
            logger.debug(f"Class 0 (Human): {human_prob:.4f}, Class 1 (AI): {ai_prob:.4f}")
            
            # Verify probabilities sum to ~1.0
            total = human_prob + ai_prob
            if abs(total - 1.0) > 0.01:
                logger.warning(f"Probabilities don't sum to 1.0: {total:.4f}")
            
            return {
                "human_probability": human_prob,
                "ai_probability": ai_prob,
                "raw_probs": probs.tolist()
            }

detector = AIDetector()

# ---------------- Pattern Detection ----------------
def detect_chatgpt_patterns(text: str) -> bool:
    """Check *text* (case-insensitively) for known ChatGPT boilerplate.

    Returns True as soon as any stock disclaimer phrase is found,
    False otherwise.
    """
    patterns = [
        "as an ai language model",
        "i am an ai model",
        "i cannot provide medical",
        "as a language model",
        "based on the information provided",
        "my training data",
        "i don't have personal experiences",
        "i don't have feelings",
        "as an artificial intelligence",
        "i don't have personal opinions"
    ]
    haystack = text.lower()
    # Grab the first matching phrase (if any) so it can be logged.
    pattern = next((p for p in patterns if p in haystack), None)
    if pattern is None:
        return False
    logging.getLogger("detector").debug(f"ChatGPT pattern detected: {pattern}")
    return True

# ---------------- Highlight / Chunked Scan ----------------
def analyze_sections(text: str, chunk_size: int = 40) -> List[dict]:
    """Score *text* chunk by chunk.

    Splits the text into windows of roughly ``chunk_size`` words, runs the
    detector on each window, and returns one result dict per window.
    Windows shorter than 20 characters are skipped.
    """
    log = logging.getLogger("detector")
    words = text.split()
    n_chunks = (len(words) + chunk_size - 1) // chunk_size

    log.info(f"Analyzing {len(words)} words in {n_chunks} chunks")

    results: List[dict] = []
    for start in range(0, len(words), chunk_size):
        piece = " ".join(words[start:start + chunk_size])
        if len(piece.strip()) < 20:
            continue

        # Model scores for this window.
        scores = detector.predict(piece)
        p_human = scores["human_probability"]
        p_ai = scores["ai_probability"]

        # Known ChatGPT boilerplate overrides the model toward "AI".
        pattern_hit = detect_chatgpt_patterns(piece)
        if pattern_hit:
            p_ai = max(p_ai, 0.9)
            p_human = 1 - p_ai

        preview = piece[:200] + "..." if len(piece) > 200 else piece
        results.append({
            "text": preview,
            "ai_probability": round(p_ai, 4),
            "human_probability": round(p_human, 4),
            "words": len(piece.split()),
            "has_chatgpt_pattern": pattern_hit
        })

    log.info(f"Generated {len(results)} sections for analysis")
    return results

def compute_overall_score(sections: List[dict], confidence_threshold: float = 0.3) -> dict:
    """Aggregate per-chunk scores into one overall verdict.

    Chunks whose AI probability sits within ``confidence_threshold`` of the
    0.5 "coin flip" point are treated as uncertain and excluded from the
    weighted average — unless that would exclude everything, in which case
    all chunks are used. Remaining chunks are averaged weighted by word
    count, and the verdict is bucketed into low/medium/high confidence.
    """
    if not sections:
        return {"ai_probability": 0.0, "human_probability": 1.0, "confidence": "low"}

    # Keep only chunks scoring far enough from the undecided midpoint.
    usable = [
        s for s in sections
        if abs(s["ai_probability"] - 0.5) >= confidence_threshold
    ]
    if not usable:
        # Nothing confident: fall back to averaging every chunk.
        usable = sections

    word_total = sum(s["words"] for s in usable)
    if word_total == 0:
        return {"ai_probability": 0.5, "human_probability": 0.5, "confidence": "low"}

    avg_ai = sum(s["ai_probability"] * s["words"] for s in usable) / word_total
    avg_human = sum(s["human_probability"] * s["words"] for s in usable) / word_total

    # Bucket the verdict by distance from the undecided midpoint.
    margin = abs(avg_ai - 0.5)
    if margin > 0.4:
        level = "high"
    elif margin > 0.2:
        level = "medium"
    else:
        level = "low"

    return {
        "ai_probability": round(avg_ai, 4),
        "human_probability": round(avg_human, 4),
        "confidence": level,
        "sections_analyzed": len(sections),
        "confident_sections": len(usable)
    }

# ---------------- API Endpoints ----------------
@app.on_event("startup")
async def startup():
    """Initialize the model on startup.

    Eagerly loads the detector so the first request doesn't pay the model
    load cost; re-raises on failure so the process exits rather than
    serving with a broken model.
    """
    # NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
    # favor of lifespan handlers — works here, but worth migrating.
    logger.info("Starting Detextly AI Detector API...")
    try:
        detector.load_model()
        logger.info("API startup complete")
    except Exception as e:
        logger.error(f"Failed to start API: {e}")
        raise

@app.get("/")
async def root():
    """Service banner: model, device, version and supported features."""
    banner = {
        "status": "online",
        "model": MODEL_NAME,
        "device": str(detector.device),
        "version": "2.1.0",
        "features": ["basic_scan", "highlight_scan", "chatgpt_pattern_detection"],
        "note": "Accepts both 'scan_type' and 'scanType' parameters",
    }
    return banner

@app.get("/health")
async def health():
    """Liveness/readiness probe: reports whether the model is loaded."""
    loaded = detector.model is not None
    return {
        "status": "healthy",
        "model_loaded": loaded,
        "model": MODEL_NAME,
        "timestamp": time.time(),
    }

@app.get("/debug/test")
async def debug_test():
    """Score a few fixed sentences to sanity-check the model's outputs."""
    samples = (
        "I went to the store yesterday to buy groceries.",
        "As an AI language model, I don't have personal experiences.",
        "The quick brown fox jumps over the lazy dog."
    )

    outcomes = []
    for sample in samples:
        scores = detector.predict(sample)
        preview = sample[:50] + "..." if len(sample) > 50 else sample
        outcomes.append({
            "text": preview,
            "human_probability": scores["human_probability"],
            "ai_probability": scores["ai_probability"]
        })

    return {
        "test_results": outcomes,
        "model_info": {
            "name": MODEL_NAME,
            "labels": detector.label_map,
            "device": str(detector.device)
        }
    }

def _highlight_result(text: str, chatgpt_detected: bool) -> dict:
    """Build the result payload for a chunked 'highlight' scan."""
    sections = analyze_sections(text, chunk_size=40)
    overall = compute_overall_score(sections)

    # Sections the UI should flag as likely AI-written.
    ai_sections = [
        {
            "text": s["text"],
            "ai_probability": s["ai_probability"],
            "human_probability": s["human_probability"],
            "words": s["words"]
        }
        for s in sections if s["ai_probability"] > 0.6
    ]

    return {
        "overall": overall["human_probability"],  # Human probability for backward compatibility
        "ai_probability": overall["ai_probability"],
        "human_probability": overall["human_probability"],
        "model": MODEL_NAME,
        "confidence": overall["confidence"],
        "chatgpt_detected": chatgpt_detected,
        "scan_type": "highlight",
        "section_count": len(sections),
        "ai_section_count": len(ai_sections),
        "sections_analyzed": overall["sections_analyzed"],
        "confident_sections": overall["confident_sections"],
        "ai_sections": ai_sections[:10]  # Limit payload to first 10
    }

def _basic_result(text: str, chatgpt_detected: bool) -> dict:
    """Build the result payload for a single-pass 'basic' scan."""
    probs = detector.predict(text)
    human_prob = probs["human_probability"]
    ai_prob = probs["ai_probability"]

    # Known ChatGPT boilerplate overrides the model toward "AI".
    if chatgpt_detected:
        ai_prob = max(ai_prob, 0.9)
        human_prob = 1 - ai_prob

    # Confidence buckets by distance from the undecided midpoint.
    distance_from_mid = abs(ai_prob - 0.5)
    confidence = "high" if distance_from_mid > 0.4 else "medium" if distance_from_mid > 0.2 else "low"

    return {
        "overall": human_prob,  # Human probability for backward compatibility
        "ai_probability": ai_prob,
        "human_probability": human_prob,
        "model": MODEL_NAME,
        "confidence": confidence,
        "chatgpt_detected": chatgpt_detected,
        "scan_type": "basic"
    }

@app.post("/api/scan", response_model=ScanResponse)
async def scan_text(request: ScanRequest):
    """Main scanning endpoint.

    Validates the input, routes to either the chunked 'highlight' scan or
    the single-pass 'basic' scan, and wraps the result in a ScanResponse.

    Raises:
        HTTPException(400): text missing or shorter than 10 characters.
        HTTPException(500): any unexpected failure during scanning.
    """
    start_time = time.time()

    try:
        # Validate input before doing any work.
        if not request.text or len(request.text.strip()) < 10:
            raise HTTPException(status_code=400, detail="Text must be at least 10 characters long.")

        # Get scan type (handles both scan_type and scanType)
        scan_type = request.get_scan_type()
        logger.info(f"Scan request: type={scan_type}, userId={request.userId}, text_length={len(request.text)}")

        # Limit text length for performance
        text = request.text[:5000]

        # Check for ChatGPT patterns once; both scan paths use the flag.
        chatgpt_detected = detect_chatgpt_patterns(text)

        if scan_type == "highlight":
            result = _highlight_result(text, chatgpt_detected)
        else:
            result = _basic_result(text, chatgpt_detected)

        # Calculate processing time
        processing_time = int((time.time() - start_time) * 1000)
        logger.info(f"Scan completed in {processing_time}ms: AI={result.get('ai_probability', 0):.2%}")

        return ScanResponse(
            success=True,
            result=result,
            processingTime=processing_time,
            credits={
                "basic": 5,
                "highlight": 1,
                "resetTime": "2024-12-31T23:59:59Z",
                "test_mode": False
            },
            test_mode=False
        )

    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Scan error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

@app.get("/api/credits")
async def get_credits(userId: Optional[str] = None):
    """Return static credit information (kept for worker compatibility)."""
    payload = {
        "basic": 5,
        "highlight": 1,
        "resetTime": "2024-12-31T23:59:59Z",
        "test_mode": False,
    }
    payload["userId"] = userId or "unknown"
    return payload

# ---------------- Main Entry Point ----------------
if __name__ == "__main__":
    import uvicorn
    # Bind on all interfaces; port 7860 — commonly the Hugging Face Spaces
    # default — presumably chosen for that deployment target; confirm.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        log_level="info",
        access_log=True
    )