Jay-Rajput committed on
Commit
4a3b971
·
1 Parent(s): be272f0

ai detector

Browse files
Files changed (3) hide show
  1. app.py +37 -2
  2. requirements.txt +6 -0
  3. text_detector.py +140 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  from fastapi import FastAPI, Header, HTTPException, Depends
3
  from pydantic import BaseModel
4
  from text_humanizer import TextHumanizer, download_nltk_resources
 
5
  import spacy
6
 
7
  API_KEY = os.environ.get("API_KEY", "dev-key")
@@ -9,17 +10,36 @@ PORT = int(os.environ.get("PORT", 7860))
9
 
10
  app = FastAPI()
11
  humanizer = None
 
12
 
 
 
 
13
  class HumanizeReq(BaseModel):
14
  text: str
15
  use_passive: bool = False
16
  use_synonyms: bool = False
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def verify_key(x_api_key: str = Header(None)):
19
  if x_api_key != API_KEY:
20
  raise HTTPException(status_code=403, detail="Forbidden")
21
  return True
22
 
 
 
 
23
  @app.get("/")
24
  def greet_json():
25
  return {"Hello": "World!"}
@@ -32,12 +52,27 @@ def startup():
32
  except OSError:
33
  spacy.cli.download("en_core_web_sm")
34
 
35
- global humanizer
36
  humanizer = TextHumanizer()
 
37
 
38
  @app.post("/humanize")
39
  def humanize(req: HumanizeReq, _=Depends(verify_key)):
40
- return {"humanized": humanizer.humanize_text(req.text, req.use_passive, req.use_synonyms)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # if __name__ == "__main__":
43
  # import uvicorn
 
2
  from fastapi import FastAPI, Header, HTTPException, Depends
3
  from pydantic import BaseModel
4
  from text_humanizer import TextHumanizer, download_nltk_resources
5
+ from ai_text_detector import AITextDetector
6
  import spacy
7
 
8
  API_KEY = os.environ.get("API_KEY", "dev-key")
 
10
 
11
  app = FastAPI()
12
  humanizer = None
13
+ detector = None
14
 
15
+ # =========================
16
+ # Request / Response Models
17
+ # =========================
18
class HumanizeReq(BaseModel):
    """Request body for POST /humanize."""

    text: str  # input text to be humanized
    use_passive: bool = False  # presumably toggles passive-voice rewriting — confirm in TextHumanizer
    use_synonyms: bool = False  # presumably toggles synonym substitution — confirm in TextHumanizer
22
 
23
class DetectReq(BaseModel):
    """Request body for POST /detect."""

    text: str  # text to analyze for AI authorship
25
+
26
class DetectResp(BaseModel):
    """Response schema for POST /detect.

    NOTE(review): these field names must match the keys of the dict the
    /detect route passes in (detector.generate_report(...)). The
    AITextDetector added in this commit exposes detect() whose report uses
    different keys (summary, overall_ai_probability, ...) — verify the two
    sides agree before shipping.
    """

    ai_probability: float  # score that the text is AI-generated
    human_probability: float  # complementary human-written score
    classification: str  # human-readable verdict label
    metrics: dict  # raw detector metrics
31
+
32
+ # =========================
33
+ # API Key verification
34
+ # =========================
35
def verify_key(x_api_key: str = Header(None)):
    """Header-based auth guard: pass only when X-API-Key equals API_KEY.

    Raises HTTPException(403) for a missing or non-matching key.
    """
    if x_api_key == API_KEY:
        return True
    raise HTTPException(status_code=403, detail="Forbidden")
39
 
40
+ # =========================
41
+ # Routes
42
+ # =========================
43
@app.get("/")
def greet_json():
    """Root endpoint used as a simple liveness check."""
    return dict(Hello="World!")
 
52
  except OSError:
53
  spacy.cli.download("en_core_web_sm")
54
 
55
+ global humanizer, detector
56
  humanizer = TextHumanizer()
57
+ detector = AITextDetector() # <-- init detector here
58
 
59
@app.post("/humanize")
def humanize(req: HumanizeReq, _=Depends(verify_key)):
    """Run the module-level TextHumanizer over the request text."""
    rewritten = humanizer.humanize_text(
        req.text,
        req.use_passive,
        req.use_synonyms,
    )
    return {"humanized": rewritten}
68
+
69
@app.post("/detect", response_model=DetectResp)
def detect(req: DetectReq, _=Depends(verify_key)):
    """
    Detect whether the text is AI-generated or human-written.
    """
    # NOTE(review): this commit imports AITextDetector from `ai_text_detector`,
    # but the file added in the same commit is named `text_detector.py` —
    # confirm the module name or startup will fail with ImportError.
    # NOTE(review): the AITextDetector added here defines detect(), not
    # generate_report(), and its report keys do not match DetectResp's fields
    # (ai_probability / human_probability / classification / metrics) —
    # verify both names and keys before deploying.
    report = detector.generate_report(req.text)
    return DetectResp(**report)
76
 
77
  # if __name__ == "__main__":
78
  # import uvicorn
requirements.txt CHANGED
@@ -2,4 +2,10 @@ fastapi
2
  uvicorn[standard]
3
  spacy
4
  nltk
 
 
5
  sentence-transformers
 
 
 
 
 
2
  uvicorn[standard]
3
  spacy
4
  nltk
5
+ numpy
6
+ torch
7
  sentence-transformers
8
+ scikit-learn
9
+ scipy
10
+ transformers
11
+ pandas
text_detector.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import statistics
3
+ import numpy as np
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from collections import Counter
7
+
8
+
9
class AITextDetector:
    """
    AI Text Detector:
    - Transformer classifier for AI vs Human
    - Metrics: perplexity proxy, burstiness, repetition, semantic smoothness
    - Category distribution (4-way classification for interpretability)
    """

    def __init__(self, model_name="roberta-base-openai-detector", device=None):
        """Load tokenizer + sequence-classification model and pick a device.

        Args:
            model_name: Hugging Face model id of an AI-text classifier.
            device: explicit torch device string; auto-detects CUDA when None.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def _compute_perplexity(self, text: str) -> float:
        """Return a perplexity *proxy* for the text.

        Fix: the previous implementation called the sequence-classification
        model with ``labels=input_ids``. A classification head expects one
        class label per sequence, so that call raises a shape error instead of
        yielding a language-model loss. Since this model has no LM head, we
        use exp(unigram token entropy) as a proxy: higher still means "less
        predictable" text, and the value is finite for any input (0.0 for
        empty text).
        """
        token_ids = self.tokenizer.encode(text, truncation=True)
        if not token_ids:
            return 0.0
        total = len(token_ids)
        counts = Counter(token_ids)
        entropy = -sum((c / total) * math.log(c / total) for c in counts.values())
        return math.exp(entropy)

    def _compute_burstiness(self, text: str) -> float:
        """
        Burstiness = variance / mean of sentence lengths.

        Returns 0.0 when fewer than two sentences are present (variance
        undefined). Sentences are split naively on ".".
        """
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        lengths = [len(s.split()) for s in sentences]

        if len(lengths) < 2:
            return 0.0

        # 1e-8 guards against a zero mean (cannot occur for non-empty
        # sentences, but keeps the expression total).
        return statistics.pvariance(lengths) / (np.mean(lengths) + 1e-8)

    def _compute_repetition_score(self, text: str) -> float:
        """
        Fraction of word occurrences that repeat an earlier word.

        Only purely alphabetic tokens count; returns 0.0 for empty input.
        High repetition = more likely AI.
        """
        words = [w.lower() for w in text.split() if w.isalpha()]
        if not words:
            return 0.0

        word_counts = Counter(words)
        return sum(c - 1 for c in word_counts.values() if c > 1) / len(words)

    def _compute_semantic_smoothness(self, text: str) -> float:
        """
        Mean cosine similarity between consecutive sentences' mean input
        embeddings. Higher = more consistent flow (AI often too smooth).

        Fix: the previous version built one torch.tensor from per-sentence
        token lists of *different* lengths, which raises ValueError for any
        text whose sentences do not tokenize to identical lengths. Each
        sentence is now embedded separately, so no padding is needed.
        """
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        if len(sentences) < 2:
            return 1.0

        embedding_layer = self.model.base_model.get_input_embeddings()
        vectors = []
        with torch.no_grad():
            for sentence in sentences:
                ids = self.tokenizer.encode(sentence, truncation=True, max_length=32)
                ids_t = torch.tensor([ids], device=self.device)
                vectors.append(embedding_layer(ids_t).mean(dim=1).squeeze(0).cpu().numpy())

        sims = []
        for v1, v2 in zip(vectors, vectors[1:]):
            cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
            sims.append(float(cos))

        return float(np.mean(sims))

    def detect(self, text: str) -> dict:
        """
        Run detection and return a structured report.

        Returns a dict with keys: summary, overall_ai_probability,
        category_distribution, metrics, interpretation.
        """
        # Model classification
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        # Index 1 is taken as the "AI" class; fall back to 0.5 when the head
        # emits a single logit.
        ai_prob = float(probs[1]) if len(probs) > 1 else 0.5

        # Compute auxiliary metrics
        perplexity = self._compute_perplexity(text)
        burstiness = self._compute_burstiness(text)
        repetition = self._compute_repetition_score(text)
        smoothness = self._compute_semantic_smoothness(text)

        # Heuristic 4-way split of the probability mass for interpretability
        distribution = {
            "AI-generated": round(ai_prob * 100 * (1 - repetition), 1),
            "AI-generated & AI-refined": round(ai_prob * 100 * repetition, 1),
            "Human-written & AI-refined": round((1 - ai_prob) * 100 * smoothness, 1),
            "Human-written": round((1 - ai_prob) * 100 * (1 - smoothness), 1)
        }

        # Normalize so the categories sum to 100
        total = sum(distribution.values())
        if total > 0:
            distribution = {k: round(v / total * 100, 1) for k, v in distribution.items()}

        overall_ai_probability = round(ai_prob, 2)
        summary = f"{distribution['AI-generated']}% of text is likely AI"

        return {
            "summary": summary,
            "overall_ai_probability": overall_ai_probability,
            "category_distribution": distribution,
            "metrics": {
                "perplexity": round(perplexity, 2),
                "burstiness": round(burstiness, 3),
                "repetition_score": round(repetition, 3),
                "semantic_smoothness": round(smoothness, 3),
                "ai_probability": overall_ai_probability
            },
            "interpretation": (
                "This detector uses structural patterns (perplexity, burstiness, repetition, semantic smoothness) "
                "to estimate the likelihood of AI authorship. Results are probabilistic, not definitive. "
                "Always apply judgment."
            )
        }

    def generate_report(self, text: str) -> dict:
        """
        Compatibility wrapper for app.py's /detect endpoint, which calls
        detector.generate_report() and validates the result against DetectResp
        (ai_probability, human_probability, classification, metrics).
        """
        report = self.detect(text)
        ai_prob = report["overall_ai_probability"]
        return {
            "ai_probability": ai_prob,
            "human_probability": round(1.0 - ai_prob, 2),
            "classification": "AI-generated" if ai_prob >= 0.5 else "Human-written",
            "metrics": report["metrics"],
        }