Spaces:

RafzE
/

detextly-ai-detector

Running

App Files Community

RafzE committed 3 days ago

Commit

f7d6571

verified ·

1 Parent(s): ca5755e

Update app.py

Browse files

Files changed (1) hide show

app.py +246 -105

app.py CHANGED Viewed

@@ -1,21 +1,26 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import logging
 from typing import Optional, List
 import time
 # ---------------- Logging ----------------
-logging.basicConfig(level=logging.DEBUG)  # Changed to DEBUG
 logger = logging.getLogger("detector")
 # ---------------- FastAPI ----------------
 app = FastAPI(
     title="Detextly AI Detector API",
     description="AI Detector with chunked scoring and low-confidence filter",
-    version="2.0.1"  # Updated version
 )
 # CORS
@@ -30,8 +35,21 @@ app.add_middleware(
 # ---------------- Models ----------------
 class ScanRequest(BaseModel):
     text: str
-    scan_type: str = "basic"
     userId: Optional[str] = None
 class ScanResponse(BaseModel):
     success: bool
@@ -48,8 +66,8 @@ class AIDetector:
         self.model = None
         self.tokenizer = None
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Using device: {self.device}")
-        self.label_map = None  # Store label mapping
     def load_model(self):
         if self.model is not None:
@@ -59,27 +77,27 @@ class AIDetector:
             self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
             self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
-            # Check model configuration
-            logger.info(f"Model config: {self.model.config}")
-            logger.info(f"Model has {self.model.config.num_labels} labels")
-            # Try to get label mapping
             if hasattr(self.model.config, 'id2label'):
                 self.label_map = self.model.config.id2label
-                logger.info(f"Label mapping: {self.label_map}")
         except Exception as e:
             logger.error(f"Error loading model: {e}")
-            raise
         self.model.to(self.device)
         self.model.eval()
         logger.info("Model loaded successfully.")
-    def predict(self, text: str, max_length: int = 512) -> float:
-        """Return HUMAN probability"""
         if self.model is None:
             self.load_model()
         tokens = self.tokenizer(
             text,
             return_tensors="pt",
@@ -91,34 +109,33 @@ class AIDetector:
         with torch.no_grad():
             outputs = self.model(**tokens)
-            # Get all probabilities
             probs = torch.softmax(outputs.logits, dim=-1)
             # Debug logging
             logger.debug(f"Raw probabilities: {probs}")
-            # Check label mapping to understand what each class means
-            if self.label_map:
-                for i, prob in enumerate(probs[0]):
-                    logger.debug(f"Class {i} ({self.label_map.get(i, f'Unknown {i}')}): {prob:.4f}")
-            # IMPORTANT: Based on typical AI detectors:
-            # Class 0 = Human, Class 1 = AI
-            # But let's verify by testing with known text
-            # Return HUMAN probability (class 0)
-            human_prob = float(probs[0][0].item())
-            ai_prob = float(probs[0][1].item())
-            logger.info(f"Human: {human_prob:.2%}, AI: {ai_prob:.2%}")
-            return human_prob  # Return human probability
 detector = AIDetector()
 # ---------------- ChatGPT Pattern Detection ----------------
-def detect_chatgpt_patterns(text: str) -> float:
     patterns = [
         "as an ai language model",
         "i am an ai model",
@@ -128,74 +145,109 @@ def detect_chatgpt_patterns(text: str) -> float:
         "my training data",
         "i don't have personal experiences",
         "i don't have feelings",
     ]
     lower = text.lower()
-    found = any(p in lower for p in patterns)
-    return 0.95 if found else 0.0
 # ---------------- Highlight / Chunked Scan ----------------
 def analyze_sections(text: str, chunk_size: int = 40) -> List[dict]:
-    """Split text into smaller chunks and compute probabilities for each."""
     sections = []
     words = text.split()
-    logger.info(f"Analyzing {len(words)} words in {len(words)//chunk_size + 1} chunks")
     for i in range(0, len(words), chunk_size):
         chunk = " ".join(words[i:i+chunk_size])
         if len(chunk.strip()) < 20:
             continue
-        # Get HUMAN probability
-        human_prob = detector.predict(chunk)
-        ai_prob = 1 - human_prob
-        pattern_score = detect_chatgpt_patterns(chunk)
-        if pattern_score > 0:
-            ai_prob = max(ai_prob, pattern_score)
             human_prob = 1 - ai_prob
         sections.append({
-            "text": chunk[:150] + "..." if len(chunk) > 150 else chunk,
-            "ai_probability": ai_prob,
-            "human_probability": human_prob,
-            "words": len(chunk.split())
         })
-        logger.debug(f"Chunk {len(sections)}: AI={ai_prob:.2%}, Human={human_prob:.2%}")
     return sections
-def compute_overall_score(sections: List[dict]) -> float:
-    """Weighted average of section AI probabilities"""
     if not sections:
-        return 0.0
-    total_words = sum(s["words"] for s in sections)
-    weighted_sum = sum(s["ai_probability"] * s["words"] for s in sections)
-    return weighted_sum / total_words
-# ---------------- Test with Sample Text ----------------
-def test_model():
-    """Test the model with known human and AI text samples"""
-    test_human = "I went to the store yesterday to buy some groceries. The weather was nice so I walked. I bought apples, bread, and milk."
-    test_ai = "As an AI language model, I don't have personal experiences or emotions. Based on my training data, I can provide information on various topics."
-    logger.info("Testing with human text...")
-    human_prob = detector.predict(test_human)
-    logger.info(f"Human text score: Human={human_prob:.2%}, AI={1-human_prob:.2%}")
-    logger.info("Testing with AI text...")
-    ai_prob = detector.predict(test_ai)
-    logger.info(f"AI text score: Human={ai_prob:.2%}, AI={1-ai_prob:.2%}")
 # ---------------- Endpoints ----------------
 @app.on_event("startup")
 async def startup():
-    detector.load_model()
-    # Run test to verify model works correctly
-    test_model()
 @app.get("/")
 async def root():
@@ -203,60 +255,128 @@ async def root():
         "status": "online",
         "model": MODEL_NAME,
         "device": str(detector.device),
-        "version": "2.0.1",
-        "note": "Now returning HUMAN probability as primary score"
     }
 @app.post("/api/scan", response_model=ScanResponse)
-async def scan_text(req: ScanRequest):
     start_time = time.time()
     try:
-        if not req.text or len(req.text.strip()) < 10:
-            raise HTTPException(status_code=400, detail="Text too short.")
-        text = req.text[:5000]
-        # Get HUMAN probability
-        human_prob = detector.predict(text)
-        ai_prob = 1 - human_prob
-        pattern_prob = detect_chatgpt_patterns(text)
-        if pattern_prob > ai_prob:
-            ai_prob = pattern_prob
-            human_prob = 1 - ai_prob
-        if req.scan_type == "highlight":
             sections = analyze_sections(text, chunk_size=40)
-            overall_ai_score = compute_overall_score(sections)
-            overall_human_score = 1 - overall_ai_score
-            ai_sections = [s for s in sections if s["ai_probability"] > 0.5]
             result = {
-                "overall": overall_human_score,  # Human probability
-                "ai_probability": overall_ai_score,  # Explicit AI probability
-                "human_probability": overall_human_score,
                 "model": MODEL_NAME,
-                "confidence": "high" if overall_human_score > 0.75 or overall_human_score < 0.25 else "medium",
-                "chatgpt_detected": pattern_prob > 0,
                 "scan_type": "highlight",
                 "section_count": len(sections),
-                "ai_sections": ai_sections,
-                "label_mapping": detector.label_map  # Include for debugging
             }
-        else:  # Basic scan
             result = {
-                "overall": human_prob,  # Human probability
-                "ai_probability": ai_prob,  # Explicit AI probability
                 "human_probability": human_prob,
                 "model": MODEL_NAME,
-                "confidence": "high" if human_prob > 0.75 or human_prob < 0.25 else "medium",
-                "chatgpt_detected": pattern_prob > 0,
-                "scan_type": "basic",
-                "label_mapping": detector.label_map  # Include for debugging
             }
         processing_time = int((time.time() - start_time) * 1000)
         return ScanResponse(
             success=True,
             result=result,
@@ -269,10 +389,31 @@ async def scan_text(req: ScanRequest):
             },
             test_mode=False
         )
     except Exception as e:
         logger.error(f"Scan error: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="debug")

 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, validator
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import logging
 from typing import Optional, List
 import time
+import sys
 # ---------------- Logging ----------------
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    stream=sys.stdout
+)
 logger = logging.getLogger("detector")
 # ---------------- FastAPI ----------------
 app = FastAPI(
     title="Detextly AI Detector API",
     description="AI Detector with chunked scoring and low-confidence filter",
+    version="2.1.0"
 )
 # CORS
 # ---------------- Models ----------------
 class ScanRequest(BaseModel):
     text: str
+    # Accept both scan_type and scanType
+    scan_type: Optional[str] = None
+    scanType: Optional[str] = None
     userId: Optional[str] = None
+    @validator('scan_type', 'scanType', pre=True, always=True)
+    def determine_scan_type(cls, v, values, field):
+        if field.name == 'scanType' and v:
+            # Map scanType to scan_type for internal use
+            values['scan_type'] = v
+        return v
+    def get_scan_type(self) -> str:
+        """Get the scan type, defaulting to 'basic' if not provided"""
+        return self.scan_type or "basic"
 class ScanResponse(BaseModel):
     success: bool
         self.model = None
         self.tokenizer = None
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.label_map = None
         logger.info(f"Using device: {self.device}")
     def load_model(self):
         if self.model is not None:
             self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
             self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+            # Store label mapping for debugging
             if hasattr(self.model.config, 'id2label'):
                 self.label_map = self.model.config.id2label
+                logger.info(f"Model label mapping: {self.label_map}")
+            else:
+                logger.warning("No label mapping found in model config")
         except Exception as e:
             logger.error(f"Error loading model: {e}")
+            raise RuntimeError(f"Failed to load model: {e}")
         self.model.to(self.device)
         self.model.eval()
         logger.info("Model loaded successfully.")
+    def predict(self, text: str, max_length: int = 512) -> dict:
+        """Return both human and AI probabilities with debugging info"""
         if self.model is None:
             self.load_model()
+        # Tokenize input
         tokens = self.tokenizer(
             text,
             return_tensors="pt",
         with torch.no_grad():
             outputs = self.model(**tokens)
             probs = torch.softmax(outputs.logits, dim=-1)
+            # Get probabilities for both classes
+            human_prob = float(probs[0][0].item())  # Class 0
+            ai_prob = float(probs[0][1].item())     # Class 1
             # Debug logging
             logger.debug(f"Raw probabilities: {probs}")
+            logger.debug(f"Class 0 (Human): {human_prob:.4f}")
+            logger.debug(f"Class 1 (AI): {ai_prob:.4f}")
+            # Verify probabilities sum to ~1.0
+            total = human_prob + ai_prob
+            if abs(total - 1.0) > 0.01:
+                logger.warning(f"Probabilities don't sum to 1.0: {total:.4f}")
+            return {
+                "human_probability": human_prob,
+                "ai_probability": ai_prob,
+                "raw_probs": probs.tolist()
+            }
 detector = AIDetector()
 # ---------------- ChatGPT Pattern Detection ----------------
+def detect_chatgpt_patterns(text: str) -> bool:
+    """Return True if ChatGPT patterns are detected"""
     patterns = [
         "as an ai language model",
         "i am an ai model",
         "my training data",
         "i don't have personal experiences",
         "i don't have feelings",
+        "as an artificial intelligence",
+        "i don't have personal opinions"
     ]
     lower = text.lower()
+    for pattern in patterns:
+        if pattern in lower:
+            logger.debug(f"ChatGPT pattern detected: {pattern}")
+            return True
+    return False
 # ---------------- Highlight / Chunked Scan ----------------
 def analyze_sections(text: str, chunk_size: int = 40) -> List[dict]:
+    """Split text into smaller chunks and compute AI probability for each."""
     sections = []
     words = text.split()
+    total_chunks = (len(words) + chunk_size - 1) // chunk_size
+    logger.info(f"Analyzing {len(words)} words in {total_chunks} chunks")
     for i in range(0, len(words), chunk_size):
         chunk = " ".join(words[i:i+chunk_size])
         if len(chunk.strip()) < 20:
             continue
+        # Get probabilities from model
+        probs = detector.predict(chunk)
+        human_prob = probs["human_probability"]
+        ai_prob = probs["ai_probability"]
+        # Check for ChatGPT patterns
+        has_pattern = detect_chatgpt_patterns(chunk)
+        if has_pattern:
+            ai_prob = max(ai_prob, 0.9)  # Boost AI probability if pattern found
             human_prob = 1 - ai_prob
         sections.append({
+            "text": chunk[:200] + "..." if len(chunk) > 200 else chunk,
+            "ai_probability": round(ai_prob, 4),
+            "human_probability": round(human_prob, 4),
+            "words": len(chunk.split()),
+            "has_chatgpt_pattern": has_pattern
         })
+    logger.info(f"Generated {len(sections)} sections for analysis")
     return sections
+def compute_overall_score(sections: List[dict], confidence_threshold: float = 0.3) -> dict:
+    """Compute weighted average probabilities with confidence filtering."""
     if not sections:
+        return {"ai_probability": 0.0, "human_probability": 1.0, "confidence": "low"}
+    # Filter out low-confidence predictions (close to 0.5)
+    confident_sections = []
+    for section in sections:
+        ai_prob = section["ai_probability"]
+        confidence = abs(ai_prob - 0.5)  # Distance from uncertain (0.5)
+        if confidence >= confidence_threshold:
+            confident_sections.append(section)
+    if not confident_sections:
+        # If no confident sections, use all sections
+        confident_sections = sections
+    # Weighted average by word count
+    total_words = sum(s["words"] for s in confident_sections)
+    if total_words == 0:
+        return {"ai_probability": 0.5, "human_probability": 0.5, "confidence": "low"}
+    weighted_ai_sum = sum(s["ai_probability"] * s["words"] for s in confident_sections)
+    weighted_human_sum = sum(s["human_probability"] * s["words"] for s in confident_sections)
+    overall_ai = weighted_ai_sum / total_words
+    overall_human = weighted_human_sum / total_words
+    # Determine confidence level
+    distance_from_mid = abs(overall_ai - 0.5)
+    if distance_from_mid > 0.4:
+        confidence_level = "high"
+    elif distance_from_mid > 0.2:
+        confidence_level = "medium"
+    else:
+        confidence_level = "low"
+    return {
+        "ai_probability": round(overall_ai, 4),
+        "human_probability": round(overall_human, 4),
+        "confidence": confidence_level,
+        "sections_analyzed": len(sections),
+        "confident_sections": len(confident_sections)
+    }
 # ---------------- Endpoints ----------------
 @app.on_event("startup")
 async def startup():
+    """Initialize the model on startup"""
+    logger.info("Starting Detextly AI Detector API...")
+    try:
+        detector.load_model()
+        logger.info("API startup complete")
+    except Exception as e:
+        logger.error(f"Failed to start API: {e}")
+        raise
 @app.get("/")
 async def root():
         "status": "online",
         "model": MODEL_NAME,
         "device": str(detector.device),
+        "version": "2.1.0",
+        "features": ["basic_scan", "highlight_scan", "chatgpt_pattern_detection"],
+        "endpoints": ["POST /api/scan", "GET /health", "GET /debug/test"]
+    }
+@app.get("/health")
+async def health():
+    health_status = {
+        "status": "healthy",
+        "model_loaded": detector.model is not None,
+        "model": MODEL_NAME,
+        "timestamp": time.time()
+    }
+    return health_status
+@app.get("/debug/test")
+async def debug_test():
+    """Test endpoint to verify model is working correctly"""
+    test_texts = [
+        "I went to the store yesterday to buy groceries.",
+        "As an AI language model, I don't have personal experiences.",
+        "The quick brown fox jumps over the lazy dog."
+    ]
+    results = []
+    for text in test_texts:
+        probs = detector.predict(text)
+        results.append({
+            "text": text[:50] + "..." if len(text) > 50 else text,
+            "human_probability": probs["human_probability"],
+            "ai_probability": probs["ai_probability"]
+        })
+    return {
+        "test_results": results,
+        "model_info": {
+            "name": MODEL_NAME,
+            "labels": detector.label_map,
+            "device": str(detector.device)
+        }
     }
 @app.post("/api/scan", response_model=ScanResponse)
+async def scan_text(request: ScanRequest):
+    """Main scanning endpoint"""
     start_time = time.time()
     try:
+        # Validate input
+        if not request.text or len(request.text.strip()) < 10:
+            raise HTTPException(status_code=400, detail="Text must be at least 10 characters long.")
+        # Get scan type (handles both scan_type and scanType)
+        scan_type = request.get_scan_type()
+        logger.info(f"Scan request: type={scan_type}, userId={request.userId}, text_length={len(request.text)}")
+        # Limit text length for performance
+        text = request.text[:5000]
+        # Check for ChatGPT patterns
+        chatgpt_detected = detect_chatgpt_patterns(text)
+        if scan_type == "highlight":
+            # Chunked analysis
             sections = analyze_sections(text, chunk_size=40)
+            overall = compute_overall_score(sections)
+            # Identify AI-heavy sections
+            ai_sections = [
+                {
+                    "text": s["text"],
+                    "ai_probability": s["ai_probability"],
+                    "human_probability": s["human_probability"],
+                    "words": s["words"]
+                }
+                for s in sections if s["ai_probability"] > 0.6
+            ]
             result = {
+                "overall": overall["human_probability"],  # Human probability for backward compatibility
+                "ai_probability": overall["ai_probability"],
+                "human_probability": overall["human_probability"],
                 "model": MODEL_NAME,
+                "confidence": overall["confidence"],
+                "chatgpt_detected": chatgpt_detected,
                 "scan_type": "highlight",
                 "section_count": len(sections),
+                "ai_section_count": len(ai_sections),
+                "sections_analyzed": overall["sections_analyzed"],
+                "confident_sections": overall["confident_sections"],
+                "ai_sections": ai_sections[:10]  # Limit to first 10
             }
+        else:
+            # Basic scan (single analysis)
+            probs = detector.predict(text)
+            human_prob = probs["human_probability"]
+            ai_prob = probs["ai_probability"]
+            # Boost AI probability if ChatGPT patterns detected
+            if chatgpt_detected:
+                ai_prob = max(ai_prob, 0.9)
+                human_prob = 1 - ai_prob
+            # Determine confidence
+            distance_from_mid = abs(ai_prob - 0.5)
+            confidence = "high" if distance_from_mid > 0.4 else "medium" if distance_from_mid > 0.2 else "low"
             result = {
+                "overall": human_prob,  # Human probability for backward compatibility
+                "ai_probability": ai_prob,
                 "human_probability": human_prob,
                 "model": MODEL_NAME,
+                "confidence": confidence,
+                "chatgpt_detected": chatgpt_detected,
+                "scan_type": "basic"
             }
+        # Calculate processing time
         processing_time = int((time.time() - start_time) * 1000)
+        logger.info(f"Scan completed in {processing_time}ms: AI={result.get('ai_probability', 0):.2%}")
         return ScanResponse(
             success=True,
             result=result,
             },
             test_mode=False
         )
+    except HTTPException:
+        raise
     except Exception as e:
         logger.error(f"Scan error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+@app.get("/api/credits")
+async def get_credits(userId: Optional[str] = None):
+    """Get credits information (for compatibility with worker)"""
+    return {
+        "basic": 5,
+        "highlight": 1,
+        "resetTime": "2024-12-31T23:59:59Z",
+        "test_mode": False,
+        "userId": userId or "unknown"
+    }
+# ---------------- Main Entry Point ----------------
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=7860,
+        log_level="info",
+        access_log=True
+    )