Jay-Rajput committed on
Commit
4a3b971
·
1 Parent(s): be272f0

ai detector

Browse files
Files changed (3) hide show
  1. app.py +37 -2
  2. requirements.txt +6 -0
  3. text_detector.py +140 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  from fastapi import FastAPI, Header, HTTPException, Depends
3
  from pydantic import BaseModel
4
  from text_humanizer import TextHumanizer, download_nltk_resources
 
5
  import spacy
6
 
7
  API_KEY = os.environ.get("API_KEY", "dev-key")
@@ -9,17 +10,36 @@ PORT = int(os.environ.get("PORT", 7860))
9
 
10
  app = FastAPI()
11
  humanizer = None
 
12
 
 
 
 
13
  class HumanizeReq(BaseModel):
14
  text: str
15
  use_passive: bool = False
16
  use_synonyms: bool = False
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def verify_key(x_api_key: str = Header(None)):
19
  if x_api_key != API_KEY:
20
  raise HTTPException(status_code=403, detail="Forbidden")
21
  return True
22
 
 
 
 
23
  @app.get("/")
24
  def greet_json():
25
  return {"Hello": "World!"}
@@ -32,12 +52,27 @@ def startup():
32
  except OSError:
33
  spacy.cli.download("en_core_web_sm")
34
 
35
- global humanizer
36
  humanizer = TextHumanizer()
 
37
 
38
  @app.post("/humanize")
39
  def humanize(req: HumanizeReq, _=Depends(verify_key)):
40
- return {"humanized": humanizer.humanize_text(req.text, req.use_passive, req.use_synonyms)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # if __name__ == "__main__":
43
  # import uvicorn
 
2
  from fastapi import FastAPI, Header, HTTPException, Depends
3
  from pydantic import BaseModel
4
  from text_humanizer import TextHumanizer, download_nltk_resources
5
+ from ai_text_detector import AITextDetector
6
  import spacy
7
 
8
  API_KEY = os.environ.get("API_KEY", "dev-key")
 
10
 
11
  app = FastAPI()
12
  humanizer = None
13
+ detector = None
14
 
15
+ # =========================
16
+ # Request / Response Models
17
+ # =========================
18
class HumanizeReq(BaseModel):
    """Request body for POST /humanize."""

    text: str  # input text to be humanized
    use_passive: bool = False  # presumably toggles passive-voice rewriting — confirm in TextHumanizer
    use_synonyms: bool = False  # presumably toggles synonym substitution — confirm in TextHumanizer
22
 
23
class DetectReq(BaseModel):
    """Request body for POST /detect."""

    text: str  # text to analyze for AI authorship
25
+
26
class DetectResp(BaseModel):
    """Response schema for POST /detect.

    NOTE(review): these field names must match the keys of the dict the
    /detect route passes in (detector.generate_report(...)). The
    AITextDetector added in this commit exposes detect() whose report uses
    different keys (summary, overall_ai_probability, ...) — verify the two
    sides agree before shipping.
    """

    ai_probability: float  # score that the text is AI-generated
    human_probability: float  # complementary human-written score
    classification: str  # human-readable verdict label
    metrics: dict  # raw detector metrics
31
+
32
+ # =========================
33
+ # API Key verification
34
+ # =========================
35
def verify_key(x_api_key: str = Header(None)):
    """Header-based auth guard: pass only when X-API-Key equals API_KEY.

    Raises HTTPException(403) for a missing or non-matching key.
    """
    if x_api_key == API_KEY:
        return True
    raise HTTPException(status_code=403, detail="Forbidden")
39
 
40
+ # =========================
41
+ # Routes
42
+ # =========================
43
@app.get("/")
def greet_json():
    """Root endpoint used as a simple liveness check."""
    return dict(Hello="World!")
 
52
  except OSError:
53
  spacy.cli.download("en_core_web_sm")
54
 
55
+ global humanizer, detector
56
  humanizer = TextHumanizer()
57
+ detector = AITextDetector() # <-- init detector here
58
 
59
@app.post("/humanize")
def humanize(req: HumanizeReq, _=Depends(verify_key)):
    """Run the module-level TextHumanizer over the request text."""
    rewritten = humanizer.humanize_text(
        req.text,
        req.use_passive,
        req.use_synonyms,
    )
    return {"humanized": rewritten}
68
+
69
@app.post("/detect", response_model=DetectResp)
def detect(req: DetectReq, _=Depends(verify_key)):
    """
    Detect whether the text is AI-generated or human-written.
    """
    # NOTE(review): this commit imports AITextDetector from `ai_text_detector`,
    # but the file added in the same commit is named `text_detector.py` —
    # confirm the module name or startup will fail with ImportError.
    # NOTE(review): the AITextDetector added here defines detect(), not
    # generate_report(), and its report keys do not match DetectResp's fields
    # (ai_probability / human_probability / classification / metrics) —
    # verify both names and keys before deploying.
    report = detector.generate_report(req.text)
    return DetectResp(**report)
76
 
77
  # if __name__ == "__main__":
78
  # import uvicorn
requirements.txt CHANGED
@@ -2,4 +2,10 @@ fastapi
2
  uvicorn[standard]
3
  spacy
4
  nltk
 
 
5
  sentence-transformers
 
 
 
 
 
2
  uvicorn[standard]
3
  spacy
4
  nltk
5
+ numpy
6
+ torch
7
  sentence-transformers
8
+ scikit-learn
9
+ scipy
10
+ transformers
11
+ pandas
text_detector.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import statistics
3
+ import numpy as np
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from collections import Counter
7
+
8
+
9
class AITextDetector:
    """
    AI Text Detector:
    - Transformer classifier for AI vs Human
    - Metrics: perplexity proxy, burstiness, repetition, semantic smoothness
    - Category distribution (4-way classification for interpretability)
    """

    def __init__(self, model_name="roberta-base-openai-detector", device=None):
        """Load tokenizer + sequence-classification model and pick a device.

        Args:
            model_name: Hugging Face model id of an AI-text classifier.
            device: explicit torch device string; auto-detects CUDA when None.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def _compute_perplexity(self, text: str) -> float:
        """Return a perplexity *proxy* for the text.

        Fix: the previous implementation called the sequence-classification
        model with ``labels=input_ids``. A classification head expects one
        class label per sequence, so that call raises a shape error instead of
        yielding a language-model loss. Since this model has no LM head, we
        use exp(unigram token entropy) as a proxy: higher still means "less
        predictable" text, and the value is finite for any input (0.0 for
        empty text).
        """
        token_ids = self.tokenizer.encode(text, truncation=True)
        if not token_ids:
            return 0.0
        total = len(token_ids)
        counts = Counter(token_ids)
        entropy = -sum((c / total) * math.log(c / total) for c in counts.values())
        return math.exp(entropy)

    def _compute_burstiness(self, text: str) -> float:
        """
        Burstiness = variance / mean of sentence lengths.

        Returns 0.0 when fewer than two sentences are present (variance
        undefined). Sentences are split naively on ".".
        """
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        lengths = [len(s.split()) for s in sentences]

        if len(lengths) < 2:
            return 0.0

        # 1e-8 guards against a zero mean (cannot occur for non-empty
        # sentences, but keeps the expression total).
        return statistics.pvariance(lengths) / (np.mean(lengths) + 1e-8)

    def _compute_repetition_score(self, text: str) -> float:
        """
        Fraction of word occurrences that repeat an earlier word.

        Only purely alphabetic tokens count; returns 0.0 for empty input.
        High repetition = more likely AI.
        """
        words = [w.lower() for w in text.split() if w.isalpha()]
        if not words:
            return 0.0

        word_counts = Counter(words)
        return sum(c - 1 for c in word_counts.values() if c > 1) / len(words)

    def _compute_semantic_smoothness(self, text: str) -> float:
        """
        Mean cosine similarity between consecutive sentences' mean input
        embeddings. Higher = more consistent flow (AI often too smooth).

        Fix: the previous version built one torch.tensor from per-sentence
        token lists of *different* lengths, which raises ValueError for any
        text whose sentences do not tokenize to identical lengths. Each
        sentence is now embedded separately, so no padding is needed.
        """
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        if len(sentences) < 2:
            return 1.0

        embedding_layer = self.model.base_model.get_input_embeddings()
        vectors = []
        with torch.no_grad():
            for sentence in sentences:
                ids = self.tokenizer.encode(sentence, truncation=True, max_length=32)
                ids_t = torch.tensor([ids], device=self.device)
                vectors.append(embedding_layer(ids_t).mean(dim=1).squeeze(0).cpu().numpy())

        sims = []
        for v1, v2 in zip(vectors, vectors[1:]):
            cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
            sims.append(float(cos))

        return float(np.mean(sims))

    def detect(self, text: str) -> dict:
        """
        Run detection and return a structured report.

        Returns a dict with keys: summary, overall_ai_probability,
        category_distribution, metrics, interpretation.
        """
        # Model classification
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        # Index 1 is taken as the "AI" class; fall back to 0.5 when the head
        # emits a single logit.
        ai_prob = float(probs[1]) if len(probs) > 1 else 0.5

        # Compute auxiliary metrics
        perplexity = self._compute_perplexity(text)
        burstiness = self._compute_burstiness(text)
        repetition = self._compute_repetition_score(text)
        smoothness = self._compute_semantic_smoothness(text)

        # Heuristic 4-way split of the probability mass for interpretability
        distribution = {
            "AI-generated": round(ai_prob * 100 * (1 - repetition), 1),
            "AI-generated & AI-refined": round(ai_prob * 100 * repetition, 1),
            "Human-written & AI-refined": round((1 - ai_prob) * 100 * smoothness, 1),
            "Human-written": round((1 - ai_prob) * 100 * (1 - smoothness), 1)
        }

        # Normalize so the categories sum to 100
        total = sum(distribution.values())
        if total > 0:
            distribution = {k: round(v / total * 100, 1) for k, v in distribution.items()}

        overall_ai_probability = round(ai_prob, 2)
        summary = f"{distribution['AI-generated']}% of text is likely AI"

        return {
            "summary": summary,
            "overall_ai_probability": overall_ai_probability,
            "category_distribution": distribution,
            "metrics": {
                "perplexity": round(perplexity, 2),
                "burstiness": round(burstiness, 3),
                "repetition_score": round(repetition, 3),
                "semantic_smoothness": round(smoothness, 3),
                "ai_probability": overall_ai_probability
            },
            "interpretation": (
                "This detector uses structural patterns (perplexity, burstiness, repetition, semantic smoothness) "
                "to estimate the likelihood of AI authorship. Results are probabilistic, not definitive. "
                "Always apply judgment."
            )
        }

    def generate_report(self, text: str) -> dict:
        """
        Compatibility wrapper for app.py's /detect endpoint, which calls
        detector.generate_report() and validates the result against DetectResp
        (ai_probability, human_probability, classification, metrics).
        """
        report = self.detect(text)
        ai_prob = report["overall_ai_probability"]
        return {
            "ai_probability": ai_prob,
            "human_probability": round(1.0 - ai_prob, 2),
            "classification": "AI-generated" if ai_prob >= 0.5 else "Human-written",
            "metrics": report["metrics"],
        }