RafzE committed
Commit 799764f · verified · 1 Parent(s): 8c2a49d

Update app.py

Files changed (1): app.py (+134 -155)

app.py CHANGED
Deletions (old side of the split diff view):

@@ -6,19 +6,23 @@ import torch
  import logging
  from typing import Optional, List
  import time
- import re

- # Set up logging
  logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)

  app = FastAPI(
      title="Detextly AI Detector API",
-     description="AI Content Detection API using RoBERTa",
-     version="2.0.0"  # Updated version
  )

- # CORS middleware
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -27,7 +31,9 @@ app.add_middleware(
      allow_headers=["*"],
  )

- # Request/Response models
  class ScanRequest(BaseModel):
      text: str
      scan_type: str = "basic"
@@ -40,8 +46,10 @@ class ScanResponse(BaseModel):
      credits: Optional[dict] = None
      test_mode: bool = False

- # Load RoBERTa AI Detector model
- MODEL_NAME = "roberta-base-openai-detector"

  class AIDetector:
      def __init__(self):
@@ -49,127 +57,118 @@ class AIDetector:
          self.tokenizer = None
          self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          logger.info(f"Using device: {self.device}")
-
      def load_model(self):
-         """Lazy load the model"""
-         if self.model is None:
-             logger.info("Loading RoBERTa AI Detector model...")
-             try:
-                 self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
-                 self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-                 logger.info(f"Loaded {MODEL_NAME}")
-             except Exception as e:
-                 logger.error(f"Failed to load model: {e}")
-                 raise
-
-         self.model.to(self.device)
-         self.model.eval()
-         logger.info("Model loaded successfully")
-
-     def predict(self, text: str, max_length: int = 512):
-         """Predict AI probability"""
          if self.model is None:
              self.load_model()
-
-         # Tokenize
-         inputs = self.tokenizer(
-             text,
-             return_tensors="pt",
-             truncation=True,
              max_length=max_length,
              padding=True
          )
-
-         # Move to device
-         inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-         # Predict
          with torch.no_grad():
-             outputs = self.model(**inputs)
-             probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
-             ai_probability = probabilities[0][0].item()  # Index 1 is AI
-
          return ai_probability

- # Initialize detector
  detector = AIDetector()

  def detect_chatgpt_patterns(text: str) -> float:
-     """Detect specific ChatGPT/AI assistant patterns"""
-     lower_text = text.lower()
-
-     # ChatGPT specific phrases
-     chatgpt_phrases = [
          "as an ai language model",
          "i don't have personal experiences",
          "i don't have feelings",
-         "i'm an ai assistant",
-         "based on the information provided",
-         "i cannot provide medical",
-         "i cannot provide legal",
-         "i cannot provide financial",
-         "keep in mind that",
-         "please note that",
-         "my responses are generated"
      ]
-
-     # Count ChatGPT phrases
-     chatgpt_count = sum(1 for phrase in chatgpt_phrases if phrase in lower_text)
-
-     # If ChatGPT phrases detected, increase AI probability
-     if chatgpt_count >= 1:
-         logger.info(f"ChatGPT patterns detected: {chatgpt_count}")
-         return 0.95  # Force high AI probability
-
-     return 0.0
-
- def analyze_sections_roberta(text: str, overall_score: float) -> List[dict]:
-     """Split text into sections with AI scores for highlight scan"""
      sections = []
      words = text.split()
-     section_length = 100
-
-     for i in range(0, len(words), section_length):
-         section_text = " ".join(words[i:i+section_length])
-         if len(section_text.strip()) < 50:
              continue
-
-         # Get section-specific prediction
-         section_score = detector.predict(section_text) if len(section_text) > 20 else overall_score
-
-         # Check for ChatGPT patterns in each section
-         chatgpt_adjustment = detect_chatgpt_patterns(section_text)
-         if chatgpt_adjustment > 0:
-             section_score = max(section_score, chatgpt_adjustment)
-
          sections.append({
-             "text": section_text[:150] + "..." if len(section_text) > 150 else section_text,
              "score": section_score,
-             "words": len(section_text.split()),
              "ai_probability": section_score,
-             "human_probability": 1 - section_score
          })
-
-         # Limit to 10 sections max
          if len(sections) >= 10:
              break
-
      return sections

  @app.on_event("startup")
- async def startup_event():
-     """Pre-load model on startup"""
      detector.load_model()

  @app.get("/")
  async def root():
      return {
          "status": "online",
-         "service": "Detextly AI Detector",
-         "version": "3.0.0",
          "model": MODEL_NAME,
          "device": str(detector.device),
-         "features": ["basic_scan", "highlight_scan", "chatgpt_detection"]
      }

  @app.get("/health")
@@ -177,75 +176,61 @@ async def health():
      return {"status": "healthy", "model": MODEL_NAME}

  @app.post("/api/scan", response_model=ScanResponse)
- async def scan_text(request: ScanRequest):
-     """Main scan endpoint"""
-     start_time = time.time()
-
      try:
-         if not request.text or len(request.text.strip()) < 10:
-             raise HTTPException(status_code=400, detail="Text too short")
-
-         # Limit text length for performance
-         text = request.text[:2000]
-
-         # Get prediction from RoBERTa
-         ai_probability = detector.predict(text)
-
-         # Check for ChatGPT patterns (OVERRIDE if detected)
-         chatgpt_probability = detect_chatgpt_patterns(text)
-         if chatgpt_probability > 0:
-             ai_probability = chatgpt_probability
-             logger.info(f"ChatGPT detected, overriding to {ai_probability}")
-
-         # Prepare result
          result = {
-             "overall": ai_probability,
-             "processing_time_ms": int((time.time() - start_time) * 1000),
-             "simulated": False,
-             "details": {
-                 "ai_probability": ai_probability,
-                 "human_probability": 1 - ai_probability,
-                 "model": MODEL_NAME,
-                 "confidence": "high" if ai_probability > 0.7 or ai_probability < 0.3 else "medium",
-                 "chatgpt_detected": chatgpt_probability > 0
-             }
          }
-
-         # For highlight scans, add section analysis
-         if request.scan_type == "highlight":
-             sections = analyze_sections_roberta(text, ai_probability)
              result["sections"] = sections
-             result["scan_type"] = "highlight"
              result["section_count"] = len(sections)
-             logger.info(f"Highlight scan completed: {len(sections)} sections analyzed")
          else:
-             result["scan_type"] = request.scan_type
-
-         processing_time = int((time.time() - start_time) * 1000)
-
-         # NORMAL credits (5 basic, 1 highlight daily)
-         credits = {
-             "basic": 5,
-             "highlight": 1,
-             "resetTime": "2024-12-31T23:59:59Z",
-             "test_mode": False
-         }
-
         return ScanResponse(
              success=True,
              result=result,
-             processingTime=processing_time,
-             credits=credits,
              test_mode=False
          )
-
      except Exception as e:
          logger.error(f"Scan error: {e}")
-         raise HTTPException(status_code=500, detail=f"Scan failed: {str(e)}")

  @app.get("/api/credits")
- async def get_credits(userId: str):
-     """Get user credits"""
      return {
          "basic": 5,
          "highlight": 1,
@@ -255,10 +240,4 @@ async def get_credits(userId: str):

  if __name__ == "__main__":
      import uvicorn
-     uvicorn.run(
-         app,
-         host="0.0.0.0",
-         port=7860,
-         log_level="info"
-     )
-
Additions (new side of the split diff view):

@@ -6,19 +6,23 @@ import torch
  import logging
  from typing import Optional, List
  import time

+ # -------------------------------------------------------
+ # Logging
+ # -------------------------------------------------------
  logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("detector")

+ # -------------------------------------------------------
+ # FastAPI App
+ # -------------------------------------------------------
  app = FastAPI(
      title="Detextly AI Detector API",
+     description="AI Content Detection API using RoBERTa-Large",
+     version="3.1.0"
  )

+ # CORS
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],

@@ -27,7 +31,9 @@ app.add_middleware(
      allow_headers=["*"],
  )

+ # -------------------------------------------------------
+ # Request / Response Models
+ # -------------------------------------------------------
  class ScanRequest(BaseModel):
      text: str
      scan_type: str = "basic"

@@ -40,8 +46,10 @@ class ScanResponse(BaseModel):
      credits: Optional[dict] = None
      test_mode: bool = False

+ # -------------------------------------------------------
+ # Model
+ # -------------------------------------------------------
+ MODEL_NAME = "openai-community/roberta-large-openai-detector"

  class AIDetector:
      def __init__(self):

@@ -49,127 +57,118 @@ class AIDetector:
          self.tokenizer = None
          self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          logger.info(f"Using device: {self.device}")
+
      def load_model(self):
+         if self.model is not None:
+             return
+
+         logger.info(f"Loading model: {MODEL_NAME}")
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+             self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+         except Exception as e:
+             logger.error(f"Error loading {MODEL_NAME}: {e}")
+             raise
+
+         self.model.to(self.device)
+         self.model.eval()
+         logger.info("Model loaded successfully.")
+
+     def predict(self, text: str, max_length: int = 512) -> float:
+         """Return AI probability using class index 1 (correct)."""
          if self.model is None:
              self.load_model()
+
+         tokens = self.tokenizer(
+             text,
+             return_tensors="pt",
+             truncation=True,
              max_length=max_length,
              padding=True
          )
+
+         tokens = {k: v.to(self.device) for k, v in tokens.items()}
+
          with torch.no_grad():
+             outputs = self.model(**tokens)
+             probabilities = torch.softmax(outputs.logits, dim=-1)
+             # FIX: index 1 = AI-written
+             ai_probability = float(probabilities[0][1].item())
+
          return ai_probability

+ # Init detector
  detector = AIDetector()

+ # -------------------------------------------------------
+ # ChatGPT Pattern Detector
+ # -------------------------------------------------------
  def detect_chatgpt_patterns(text: str) -> float:
+     """Return 0.95 if strong GPT-patterns are detected."""
+     patterns = [
          "as an ai language model",
+         "i am an ai model",
+         "i cannot provide medical",
+         "as a language model",
+         "based on the information provided",
+         "my training data",
          "i don't have personal experiences",
          "i don't have feelings",
      ]
+
+     lower = text.lower()
+     found = any(p in lower for p in patterns)
+
+     return 0.95 if found else 0.0
+
+ # -------------------------------------------------------
+ # Highlight Scan - Split into Sections
+ # -------------------------------------------------------
+ def analyze_sections(text: str, overall_score: float) -> List[dict]:
      sections = []
      words = text.split()
+     chunk_size = 100
+
+     for i in range(0, len(words), chunk_size):
+         chunk = " ".join(words[i:i+chunk_size])
+
+         if len(chunk) < 40:
              continue
+
+         section_score = detector.predict(chunk)
+         pattern_score = detect_chatgpt_patterns(chunk)
+
+         if pattern_score > 0:
+             section_score = max(section_score, pattern_score)
+
          sections.append({
+             "text": chunk[:150] + "..." if len(chunk) > 150 else chunk,
              "score": section_score,
              "ai_probability": section_score,
+             "human_probability": 1 - section_score,
+             "words": len(chunk.split())
          })
+
          if len(sections) >= 10:
              break
+
      return sections

+ # -------------------------------------------------------
+ # Endpoints
+ # -------------------------------------------------------
  @app.on_event("startup")
+ async def startup():
      detector.load_model()

  @app.get("/")
  async def root():
      return {
          "status": "online",
          "model": MODEL_NAME,
          "device": str(detector.device),
+         "version": "3.1.0",
+         "features": ["basic_scan", "highlight_scan", "chatgpt_pattern_detection"]
      }

  @app.get("/health")

@@ -177,75 +176,61 @@ async def health():
      return {"status": "healthy", "model": MODEL_NAME}

  @app.post("/api/scan", response_model=ScanResponse)
+ async def scan_text(req: ScanRequest):
+     start = time.time()
+
      try:
+         if not req.text or len(req.text.strip()) < 10:
+             raise HTTPException(status_code=400, detail="Text too short.")
+
+         text = req.text[:3000]  # safe CPU limit
+
+         # Base prediction
+         ai_prob = detector.predict(text)
+
+         # Pattern override
+         pattern_prob = detect_chatgpt_patterns(text)
+         if pattern_prob > ai_prob:
+             ai_prob = pattern_prob
+
+         # Build base result
          result = {
+             "overall": ai_prob,
+             "human_probability": 1 - ai_prob,
+             "model": MODEL_NAME,
+             "confidence": "high" if ai_prob > 0.75 or ai_prob < 0.25 else "medium",
+             "chatgpt_detected": pattern_prob > 0
          }
+
+         # Highlight scan
+         if req.scan_type == "highlight":
+             sections = analyze_sections(text, ai_prob)
              result["sections"] = sections
              result["section_count"] = len(sections)
+             result["scan_type"] = "highlight"
          else:
+             result["scan_type"] = "basic"
+
+         # Return response
          return ScanResponse(
              success=True,
              result=result,
+             processingTime=int((time.time() - start) * 1000),
+             credits={
+                 "basic": 5,
+                 "highlight": 1,
+                 "resetTime": "2024-12-31T23:59:59Z",
+                 "test_mode": False
+             },
              test_mode=False
          )
+
      except Exception as e:
          logger.error(f"Scan error: {e}")
+         raise HTTPException(status_code=500, detail=str(e))

  @app.get("/api/credits")
+ async def credits(userId: str):
      return {
          "basic": 5,
          "highlight": 1,

@@ -255,10 +240,4 @@ async def get_credits(userId: str):

  if __name__ == "__main__":
      import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
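
Note: the core change in this commit is reading the AI probability from logit index 1 instead of index 0. The safest way to confirm that this index matches whichever checkpoint is actually deployed is to read the label mapping from the model config rather than assume it; a minimal sketch (run separately, not part of app.py, using the MODEL_NAME from the commit):

    # Check which class index the detector checkpoint treats as machine-generated text.
    # predict() should use whichever id2label entry denotes the AI/fake class.
    from transformers import AutoConfig

    MODEL_NAME = "openai-community/roberta-large-openai-detector"

    config = AutoConfig.from_pretrained(MODEL_NAME)
    print(config.id2label)  # inspect the mapping before hard-coding an index in predict()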
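For reference, the updated /api/scan endpoint can be exercised like this once the app is running; a sketch assuming a local run on port 7860, with an illustrative base URL and sample text (field names follow the ScanRequest/ScanResponse models in the diff):

    # Example client call against the updated scan endpoint (assumes localhost:7860).
    import requests

    resp = requests.post(
        "http://localhost:7860/api/scan",
        json={"text": "Paste the passage to score here...", "scan_type": "highlight"},
        timeout=60,
    )
    resp.raise_for_status()
    data = resp.json()
    print(data["result"]["overall"])           # overall AI probability
    print(data["result"].get("sections", []))  # per-chunk scores for highlight scans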