mahmoudsaber0 committed on
Commit
18e2e5a
·
verified ·
1 Parent(s): 0dbeec3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +502 -163
app.py CHANGED
@@ -1,83 +1,57 @@
1
  import os
2
  import re
3
- import shutil
4
  import torch
5
- from fastapi import FastAPI
 
 
 
 
6
  from pydantic import BaseModel
7
-
8
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
  from tokenizers.normalizers import Sequence, Replace, Strip
10
  from tokenizers import Regex
11
- import atexit
12
 
13
  # =====================================================
14
- # Safe Cache Setup (Runtime)
15
  # =====================================================
16
# Runtime cache directory shared by Hugging Face and PyTorch.
CACHE_DIR = "/tmp/huggingface"

# Start from an empty cache on every launch so stale downloads cannot
# accumulate (the original comment cites a 50G limit).
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR, ignore_errors=True)
os.makedirs(CACHE_DIR, exist_ok=True)

# Point every cache-related environment variable at CACHE_DIR.
os.environ.update(dict.fromkeys(
    (
        "HF_HOME",
        "TRANSFORMERS_CACHE",
        "HF_DATASETS_CACHE",
        "HF_HUB_CACHE",
        "TORCH_HOME",
        "XDG_CACHE_HOME",
        "TORCHINDUCTOR_CACHE_DIR",
    ),
    CACHE_DIR,
))
os.environ["TORCH_LOGS"] = "off"
34
 
35
- # Auto cleanup on shutdown
36
def cleanup_cache():
    """Remove the runtime cache directory; errors during removal are ignored."""
    shutil.rmtree(CACHE_DIR, ignore_errors=True)


# Run the cleanup when the interpreter shuts down.
atexit.register(cleanup_cache)
39
 
40
  # =====================================================
41
- # Model Setup
42
  # =====================================================
43
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer of the base checkpoint; shared by all three classifiers.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Weight sources: one local checkpoint file and two remote checkpoints.
model1_path = "modernbert.bin"
model2_url = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
model3_url = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
51
-
52
def load_model(base_path=None, url=None):
    """Build a 41-label ModernBERT classifier and load fine-tuned weights into it.

    Args:
        base_path: Path to a local state-dict file; used when ``url`` is falsy.
        url: URL of a remote state-dict; takes precedence over ``base_path``.

    Returns:
        The model, moved to ``device`` and switched to eval mode.

    Raises:
        ValueError: If neither ``base_path`` nor ``url`` is given.
    """
    # Fail fast, before the base checkpoint is instantiated, instead of
    # crashing later inside torch.load(None).
    if not url and base_path is None:
        raise ValueError("load_model requires either base_path or url")

    model = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base", num_labels=41
    )
    if url:
        state_dict = torch.hub.load_state_dict_from_url(
            url, map_location=device, progress=False, check_hash=False
        )
    else:
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        state_dict = torch.load(base_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device).eval()
    return model
74
-
75
# The three ensemble members: one local checkpoint, two remote ones.
model_1 = load_model(base_path=model1_path)
model_2 = load_model(url=model2_url)
model_3 = load_model(url=model3_url)
78
 
79
  # =====================================================
80
- # Label Mapping
81
  # =====================================================
82
  label_mapping = {
83
  0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
@@ -93,128 +67,493 @@ label_mapping = {
93
  }
94
 
95
  # =====================================================
96
- # Text Cleaning & Tokenizer Normalization
97
  # =====================================================
98
def clean_text(text: str) -> str:
    """Collapse whitespace runs to one space and drop whitespace before punctuation."""
    collapsed = re.sub(r'\s{2,}', ' ', text)
    return re.sub(r'\s+([,.;:?!])', r'\1', collapsed)
102
-
103
# Extra normalisation steps appended after the tokenizer's own normalizer:
# re-join words split by a hyphen at a line break, turn remaining line breaks
# into single spaces, then strip surrounding whitespace.
join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
tokenizer.backend_tokenizer.normalizer = Sequence(
    [
        tokenizer.backend_tokenizer.normalizer,
        join_hyphen_break,
        newline_to_space,
        Strip(),
    ]
)
111
-
112
- # =====================================================
113
- # ✅ Analysis Logic
114
- # =====================================================
115
def analyze_text_block(text: str):
    """Score one block of text with the three-model ensemble.

    Returns a dict with ``human_written_score`` and ``ai_generated_score``
    (fractions rounded to 4 places) and ``predicted_model``, the label of the
    most probable non-human class.
    """
    encoded = tokenizer(
        clean_text(text), return_tensors="pt", truncation=True, padding=True
    ).to(device)

    with torch.no_grad():
        member_probs = [
            torch.softmax(member(**encoded).logits, dim=1)
            for member in (model_1, model_2, model_3)
        ]

    # Soft voting: average the per-model class probabilities.
    avg_probs = (member_probs[0] + member_probs[1] + member_probs[2]) / 3
    probs = avg_probs[0]

    # The code treats index 24 as the human class; all other indices are AI.
    human_prob = probs[24].item()
    ai_probs = probs.clone()
    ai_probs[24] = 0
    ai_total_prob = ai_probs.sum().item()

    total = human_prob + ai_total_prob
    human_percentage = (human_prob / total) * 100
    ai_percentage = (ai_total_prob / total) * 100
    ai_model_index = torch.argmax(ai_probs).item()

    return {
        "human_written_score": round(human_percentage / 100, 4),
        "ai_generated_score": round(ai_percentage / 100, 4),
        "predicted_model": label_mapping[ai_model_index]
    }
146
 
147
def split_into_paragraphs(text: str):
    """Split text on blank lines and return the non-empty, stripped paragraphs."""
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text.strip()))
    return [chunk for chunk in stripped_chunks if chunk]
149
-
150
  # =====================================================
151
- # FastAPI Setup
152
  # =====================================================
153
# ASGI application object that the route decorators below attach to.
app = FastAPI(
    title="ModernBERT AI Text Detector",
)
154
-
155
class InputText(BaseModel):
    """Request body for ``/analyze``."""

    # Raw text to classify.
    text: str
157
-
158
@app.get("/health")
async def health():
    """Liveness probe: always reports ``ok``."""
    payload = {"status": "ok"}
    return payload
161
-
162
-
163
-
164
-
165
-
166
-
167
 
 
 
 
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
 
 
 
 
 
 
170
 
171
@app.post("/analyze")
async def analyze(data: InputText):
    """Classify the submitted text as AI-generated or human-written.

    The whole text is scored first. When it scores above 50% AI, every
    paragraph is scored separately and the overall percentage becomes the
    word-weighted average of the paragraph scores.

    Returns a dict with ``success``, ``code``, ``message`` and, on success,
    a ``data`` payload. Empty input yields a ``code`` 400 payload.
    """
    text = data.text.strip()
    if not text:
        return {"success": False, "code": 400, "message": "Empty input text"}

    total_words = len(text.split())
    full_result = analyze_text_block(text)
    fake_percentage = round(full_result["ai_generated_score"] * 100, 2)
    ai_words = int(total_words * (fake_percentage / 100))
    results = []

    if fake_percentage > 50:
        weighted_ai_words, total_words = 0.0, 0
        for p in split_into_paragraphs(text):
            res = analyze_text_block(p)
            wc = len(p.split())
            total_words += wc
            weighted_ai_words += wc * res["ai_generated_score"]
            results.append({
                "paragraph": p,
                "ai_generated_score": res["ai_generated_score"],
                "human_written_score": res["human_written_score"],
                "predicted_model": res["predicted_model"]
            })
        # Guard the division; keep the whole-text score if nothing was counted.
        if total_words > 0:
            fake_percentage = round((weighted_ai_words / total_words) * 100, 2)
        # aiWords is an integer count on both code paths (it used to be a
        # float on this one).
        ai_words = int(weighted_ai_words)

    feedback = (
        "Most of Your Text is AI/GPT Generated"
        if fake_percentage > 50
        else "Most of Your Text Appears Human-Written"
    )

    return {
        "success": True,
        "code": 200,
        "message": "analysis completed",
        "data": {
            "fakePercentage": fake_percentage,
            "isHuman": round(100 - fake_percentage, 2),
            "textWords": total_words,
            "aiWords": ai_words,
            "paragraphs": results,
            "predicted_model": full_result["predicted_model"],
            "feedback": feedback,
            "input_text": text,
            "detected_language": "en"
        }
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
 
3
  import torch
4
+ import logging
5
+ import gc
6
+ import sys
7
+ from fastapi import FastAPI, HTTPException
8
+ from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel
10
+ from typing import Dict, List, Optional
11
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  from tokenizers.normalizers import Sequence, Replace, Strip
13
  from tokenizers import Regex
 
14
 
15
# =====================================================
# 🔧 Environment configuration and settings
# =====================================================
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Cache directory shared by Hugging Face and PyTorch downloads.
CACHE_DIR = "/tmp/huggingface_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Hugging Face / PyTorch environment variables.
# NOTE(review): these are assigned after `transformers` has already been
# imported at the top of the file; any library that reads them at import time
# may not see these values — confirm. The loaders in this file also pass
# cache_dir=CACHE_DIR explicitly.
os.environ.update(dict.fromkeys(
    (
        "HF_HOME",
        "TRANSFORMERS_CACHE",
        "HF_DATASETS_CACHE",
        "HUGGINGFACE_HUB_CACHE",
        "TORCH_HOME",
    ),
    CACHE_DIR,
))
os.environ.update(
    TOKENIZERS_PARALLELISM="false",  # avoid tokenizer threading problems
    TRANSFORMERS_OFFLINE="0",        # allow downloads from the internet
)

# PyTorch memory settings (GPU only).
if torch.cuda.is_available():
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.backends.cudnn.benchmark = True

# =====================================================
# 🚀 Device selection (GPU or CPU)
# =====================================================
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
logger.info(f"🖥️ Using device: {device}")
if device.type == 'cuda':
    logger.info(f"🎮 CUDA Device: {torch.cuda.get_device_name(0)}")
    logger.info(f"💾 CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # =====================================================
54
+ # 📊 خريطة الموديلات
55
  # =====================================================
56
  label_mapping = {
57
  0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
 
67
  }
68
 
69
  # =====================================================
70
+ # 🤖 Model Manager - إدارة الموديلات
71
  # =====================================================
72
class ModelManager:
    """Own the tokenizer and the ensemble of ModernBERT classifiers.

    The manager loads a tokenizer plus up to ``max_models`` 41-label sequence
    classifiers (an optional local checkpoint followed by remote checkpoints)
    and averages their softmax outputs to score a text.
    """

    # Stop loading further models once this many GiB of GPU memory are allocated.
    GPU_MEMORY_LIMIT_GB = 6

    def __init__(self):
        self.tokenizer = None
        self.models = []
        self.models_loaded = False
        self.model_urls = [
            "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
            "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
        ]

    def load_tokenizer(self):
        """Load the tokenizer and install the custom normalizer.

        Returns:
            True on success, False when the tokenizer could not be loaded.
            A failure to install the custom normalizer is only logged.
        """
        try:
            logger.info("📝 Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                "answerdotai/ModernBERT-base",
                cache_dir=CACHE_DIR,
                use_fast=True,
                trust_remote_code=False
            )

            # Text normalisation: re-join hyphenated line breaks, replace
            # newlines with spaces, strip surrounding whitespace.
            try:
                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
                self.tokenizer.backend_tokenizer.normalizer = Sequence([
                    self.tokenizer.backend_tokenizer.normalizer,
                    join_hyphen_break,
                    newline_to_space,
                    Strip()
                ])
            except Exception as e:
                logger.warning(f"⚠️ Could not set custom normalizer: {e}")

            logger.info("✅ Tokenizer loaded successfully")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to load tokenizer: {e}")
            return False

    def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
        """Build one classifier and load fine-tuned weights into it.

        Args:
            model_url: URL of a remote state-dict; used when ``model_path`` is
                missing or does not exist.
            model_path: Path of a local state-dict file; preferred when present.
            model_name: Label used in log messages.

        Returns:
            The model on ``device`` in eval mode, or None when no fine-tuned
            weights could be loaded. A model with a randomly initialised
            classification head is never returned, because its predictions
            would be meaningless inside the ensemble.
        """
        try:
            logger.info(f"🤖 Loading {model_name}...")

            # Base architecture; half precision on GPU to save memory.
            base_model = AutoModelForSequenceClassification.from_pretrained(
                "answerdotai/ModernBERT-base",
                num_labels=41,
                cache_dir=CACHE_DIR,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True,
                trust_remote_code=False
            )

            if model_path and os.path.exists(model_path):
                logger.info(f"📁 Loading from local file: {model_path}")
                state_dict = torch.load(model_path, map_location=device, weights_only=True)
            elif model_url:
                logger.info(f"🌐 Downloading weights from: {model_url}")
                # Name the cached file after the URL, not the model ordinal, so
                # a cached download can never be reused for a different URL.
                from urllib.parse import urlparse
                url_basename = os.path.basename(urlparse(model_url).path) or model_name
                state_dict = torch.hub.load_state_dict_from_url(
                    model_url,
                    map_location=device,
                    progress=True,
                    check_hash=False,
                    file_name=f"{url_basename}.pt"
                )
            else:
                logger.error(f"❌ No weights available for {model_name}")
                return None

            # strict=False tolerates key mismatches; report them instead of
            # hiding them.
            load_result = base_model.load_state_dict(state_dict, strict=False)
            if load_result.missing_keys or load_result.unexpected_keys:
                logger.warning(
                    f"⚠️ {model_name}: {len(load_result.missing_keys)} missing and "
                    f"{len(load_result.unexpected_keys)} unexpected keys in state dict"
                )
            del state_dict

            model = base_model.to(device)
            model.eval()

            # Release temporary memory held by the load.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            logger.info(f"✅ {model_name} loaded successfully")
            return model

        except Exception as e:
            logger.error(f"❌ Failed to load {model_name}: {e}")
            return None

    def load_models(self, max_models=2):
        """Load the tokenizer and up to ``max_models`` classifiers.

        The local file ``modernbert.bin`` is tried first when it exists, then
        the remote checkpoints in order until the limit is reached.

        Returns:
            True when at least one model is loaded (or models were already
            loaded), False otherwise.
        """
        if self.models_loaded:
            logger.info("✨ Models already loaded")
            return True

        if not self.load_tokenizer():
            return False

        logger.info(f"🚀 Loading up to {max_models} models...")

        local_model_path = "modernbert.bin"
        if os.path.exists(local_model_path):
            model = self.load_single_model(
                model_path=local_model_path,
                model_name="Model 1 (Local)"
            )
            if model is not None:
                self.models.append(model)

        # Walk every URL so a failed download can be replaced by the next one.
        for url in self.model_urls:
            if len(self.models) >= max_models:
                break

            model = self.load_single_model(
                model_url=url,
                model_name=f"Model {len(self.models) + 1}"
            )
            if model is not None:
                self.models.append(model)

            if torch.cuda.is_available():
                mem_allocated = torch.cuda.memory_allocated() / 1024**3
                mem_reserved = torch.cuda.memory_reserved() / 1024**3
                logger.info(f"💾 GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

                # Stop before the GPU runs out of memory.
                if mem_allocated > self.GPU_MEMORY_LIMIT_GB:
                    logger.warning("⚠️ Memory limit reached, stopping model loading")
                    break

        if len(self.models) > 0:
            self.models_loaded = True
            logger.info(f"✅ Successfully loaded {len(self.models)} models")
            return True

        logger.error("❌ No models could be loaded")
        return False

    def classify_text(self, text: str) -> Dict:
        """Score ``text`` with every loaded model and average the results.

        Returns:
            Dict with ``human_percentage`` and ``ai_percentage`` (0-100,
            rounded to 2 places), ``predicted_model``, ``top_5_predictions``,
            ``is_human`` and ``models_used``.

        Raises:
            ValueError: When no models are loaded, the cleaned text is empty,
                tokenisation fails, or every model fails to predict.
        """
        if not self.models_loaded or len(self.models) == 0:
            raise ValueError("No models loaded")

        cleaned_text = clean_text(text)
        if not cleaned_text.strip():
            raise ValueError("Empty text after cleaning")

        try:
            inputs = self.tokenizer(
                cleaned_text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(device)
        except Exception as e:
            logger.error(f"Tokenization error: {e}")
            raise ValueError(f"Failed to tokenize text: {e}") from e

        all_probabilities = []

        with torch.no_grad():
            for i, model in enumerate(self.models):
                try:
                    logits = model(**inputs).logits
                    # Models may run in float16 on GPU; do the softmax and the
                    # later averaging in float32 to avoid precision loss.
                    probs = torch.softmax(logits.float(), dim=1)
                    all_probabilities.append(probs)
                except Exception as e:
                    logger.warning(f"Model {i+1} prediction failed: {e}")
                    continue

        if not all_probabilities:
            raise ValueError("All models failed to make predictions")

        # Soft voting: mean of the per-model probability vectors.
        averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
        probabilities = averaged_probs[0]

        # The code treats index 24 as the human class; everything else is AI.
        human_prob = probabilities[24].item()
        ai_probs = probabilities.clone()
        ai_probs[24] = 0
        ai_total_prob = ai_probs.sum().item()

        # Normalise so the two percentages add up to 100.
        total = human_prob + ai_total_prob
        if total > 0:
            human_percentage = (human_prob / total) * 100
            ai_percentage = (ai_total_prob / total) * 100
        else:
            human_percentage = 50
            ai_percentage = 50

        # Most probable non-human class.
        ai_model_idx = torch.argmax(ai_probs).item()
        predicted_model = label_mapping.get(ai_model_idx, "Unknown")

        # Five most probable classes overall (the human class may be included).
        top_5_probs, top_5_indices = torch.topk(probabilities, 5)
        top_5_results = [
            {
                "model": label_mapping.get(idx.item(), "Unknown"),
                "probability": round(prob.item() * 100, 2)
            }
            for prob, idx in zip(top_5_probs, top_5_indices)
        ]

        return {
            "human_percentage": round(human_percentage, 2),
            "ai_percentage": round(ai_percentage, 2),
            "predicted_model": predicted_model,
            "top_5_predictions": top_5_results,
            "is_human": human_percentage > ai_percentage,
            "models_used": len(all_probabilities)
        }
302
 
 
 
 
303
  # =====================================================
304
+ # 🧹 دوال التنظيف والمعالجة
305
  # =====================================================
306
def clean_text(text: str) -> str:
    """Collapse whitespace runs, drop whitespace before punctuation, and trim."""
    collapsed = re.sub(r'\s{2,}', ' ', text)
    tightened = re.sub(r'\s+([,.;:?!])', r'\1', collapsed)
    return tightened.strip()
 
 
 
 
 
 
 
 
 
311
 
312
def split_into_paragraphs(text: str) -> List[str]:
    """Split text on blank lines and return the non-empty, stripped paragraphs."""
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text.strip()))
    return [chunk for chunk in stripped_chunks if chunk]
316
 
317
+ # =====================================================
318
+ # 🌐 FastAPI Application
319
+ # =====================================================
320
app = FastAPI(
    title="ModernBERT AI Text Detector",
    description="كشف النصوص المكتوبة بواسطة الذكاء الاصطناعي",
    version="2.0.0"
)

# CORS so the API can be called from a browser on any origin.
# Credentials stay disabled: wildcard origins combined with
# allow_credentials=True would let any website send credentialed requests,
# and nothing in this API uses cookies or HTTP authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single shared manager; models are loaded at startup or on first use.
model_manager = ModelManager()
337
 
338
+ # =====================================================
339
+ # 📝 نماذج البيانات (Pydantic Models)
340
+ # =====================================================
341
class TextInput(BaseModel):
    """Request body for ``/analyze``."""

    # Raw text to classify.
    text: str
    # When true, high-AI texts are additionally scored paragraph by paragraph.
    analyze_paragraphs: Optional[bool] = False
344
 
345
class SimpleTextInput(BaseModel):
    """Request body for ``/analyze-simple``."""

    # Raw text to classify.
    text: str
 
 
 
347
 
348
class DetectionResult(BaseModel):
    """Response envelope returned by ``/analyze``."""

    # Whether the analysis completed.
    success: bool
    # Status code carried in the body (200, 400, 500 or 503 in this file).
    code: int
    # Short human-readable status message.
    message: str
    # Analysis payload; empty on failure.
    data: Dict
353
 
354
+ # =====================================================
355
+ # 🎯 API Endpoints
356
+ # =====================================================
357
@app.on_event("startup")
async def startup_event():
    """Log runtime versions and load the models when the application starts."""
    banner = "=" * 50
    logger.info(banner)
    logger.info("🚀 Starting ModernBERT AI Detector...")
    logger.info(f"🐍 Python version: {sys.version}")
    logger.info(f"🔥 PyTorch version: {torch.__version__}")
    logger.info(banner)

    # The model limit comes from the MAX_MODELS environment variable (default 2).
    model_limit = int(os.environ.get("MAX_MODELS", "2"))
    if model_manager.load_models(max_models=model_limit):
        logger.info("✅ Application ready!")
    else:
        logger.error("⚠️ Failed to load models - API will return errors")
374
 
375
@app.get("/")
async def root():
    """Landing endpoint: service status and the list of available routes."""
    status = "online" if model_manager.models_loaded else "initializing"
    endpoints = {
        "analyze": "/analyze",
        "simple": "/analyze-simple",
        "health": "/health",
        "docs": "/docs"
    }
    return {
        "message": "ModernBERT AI Text Detector API",
        "status": status,
        "models_loaded": len(model_manager.models),
        "device": str(device),
        "endpoints": endpoints
    }
390
 
391
@app.get("/health")
async def health_check():
    """Health probe: model availability, device, and GPU memory when on CUDA."""
    cuda_ok = torch.cuda.is_available()

    # GPU memory figures are reported only when CUDA is available.
    memory_info = {}
    if cuda_ok:
        gib = 1024**3
        memory_info = {
            "gpu_allocated_gb": round(torch.cuda.memory_allocated() / gib, 2),
            "gpu_reserved_gb": round(torch.cuda.memory_reserved() / gib, 2)
        }

    status = "healthy" if model_manager.models_loaded else "unhealthy"
    return {
        "status": status,
        "models_loaded": len(model_manager.models),
        "device": str(device),
        "cuda_available": cuda_ok,
        "memory_info": memory_info
    }
408
+
409
@app.post("/analyze", response_model=DetectionResult)
async def analyze_text(data: TextInput):
    """Analyze text for AI authorship.

    The whole text is scored first. When ``analyze_paragraphs`` is set and
    the text scores above 50% AI, up to 10 paragraphs are scored separately
    and the overall percentages are replaced by their word-weighted average.

    Failures are reported in the response body (``success=False`` with
    ``code`` 400, 503 or 500) rather than raised.
    """
    max_paragraphs = 10  # cap on per-paragraph model calls per request
    try:
        text = data.text.strip()
        if not text:
            return DetectionResult(
                success=False,
                code=400,
                message="Empty input text",
                data={}
            )

        # Load the models on demand when startup loading did not succeed.
        if not model_manager.models_loaded:
            if not model_manager.load_models():
                return DetectionResult(
                    success=False,
                    code=503,
                    message="Models not available",
                    data={}
                )

        total_words = len(text.split())

        result = model_manager.classify_text(text)

        ai_percentage = result["ai_percentage"]
        human_percentage = result["human_percentage"]
        ai_words = int(total_words * (ai_percentage / 100))

        paragraphs_analysis = []
        if data.analyze_paragraphs and ai_percentage > 50:
            recalc_ai_words = 0
            recalc_total_words = 0

            for para in split_into_paragraphs(text)[:max_paragraphs]:
                try:
                    para_result = model_manager.classify_text(para)
                    para_words = len(para.split())
                    recalc_total_words += para_words
                    recalc_ai_words += para_words * (para_result["ai_percentage"] / 100)

                    paragraphs_analysis.append({
                        "paragraph": para[:200] + "..." if len(para) > 200 else para,
                        "ai_generated_score": para_result["ai_percentage"] / 100,
                        "human_written_score": para_result["human_percentage"] / 100,
                        "predicted_model": para_result["predicted_model"]
                    })
                except Exception as e:
                    # One bad paragraph must not fail the whole request.
                    logger.warning(f"Failed to analyze paragraph: {e}")

            # Recompute the overall figures from the paragraph scores.
            if recalc_total_words > 0:
                ai_fraction = recalc_ai_words / recalc_total_words
                ai_percentage = round(ai_fraction * 100, 2)
                human_percentage = round(100 - ai_percentage, 2)
                # Scale to the full word count: only the analysed paragraphs
                # (at most max_paragraphs, minus failures) feed ai_fraction,
                # but textWords reports the whole text.
                ai_words = int(total_words * ai_fraction)

        if ai_percentage > 50:
            feedback = "Most of Your Text is AI/GPT Generated"
        else:
            feedback = "Most of Your Text Appears Human-Written"

        # Same response layout as the previous version of this endpoint.
        return DetectionResult(
            success=True,
            code=200,
            message="analysis completed",
            data={
                "fakePercentage": ai_percentage,
                "isHuman": human_percentage,
                "textWords": total_words,
                "aiWords": ai_words,
                "paragraphs": paragraphs_analysis,
                "predicted_model": result["predicted_model"],
                "feedback": feedback,
                "input_text": text[:500] + "..." if len(text) > 500 else text,
                "detected_language": "en",
                "top_5_predictions": result.get("top_5_predictions", []),
                "models_used": result.get("models_used", 1)
            }
        )

    except Exception as e:
        logger.error(f"Analysis error: {e}", exc_info=True)
        return DetectionResult(
            success=False,
            code=500,
            message=f"Analysis failed: {str(e)}",
            data={}
        )
512
+
513
@app.post("/analyze-simple")
async def analyze_simple(data: SimpleTextInput):
    """Simplified analysis that returns only the headline scores.

    Raises HTTPException 400 for empty text, 503 when no models can be
    loaded, and 500 for any other failure.
    """
    try:
        stripped = data.text.strip()
        if not stripped:
            raise HTTPException(status_code=400, detail="Empty text")

        # Load on demand when startup loading did not succeed.
        models_ready = model_manager.models_loaded or model_manager.load_models()
        if not models_ready:
            raise HTTPException(status_code=503, detail="Models not available")

        outcome = model_manager.classify_text(stripped)
        ai_score = outcome["ai_percentage"]
        human_score = outcome["human_percentage"]
        flagged_as_ai = ai_score > 50

        return {
            "is_ai": flagged_as_ai,
            "ai_score": ai_score,
            "human_score": human_score,
            # A source model is reported only when the text is judged AI-written.
            "detected_model": outcome["predicted_model"] if flagged_as_ai else None,
            "confidence": max(ai_score, human_score)
        }

    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as e:
        logger.error(f"Simple analysis error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
542
+
543
+ # =====================================================
544
+ # 🏃 تشغيل التطبيق
545
+ # =====================================================
546
+ if __name__ == "__main__":
547
+ import uvicorn
548
+
549
+ # الحصول على الإعدادات من البيئة
550
+ port = int(os.environ.get("PORT", 8000))
551
+ host = os.environ.get("HOST", "0.0.0.0")
552
+ workers = int(os.environ.get("WORKERS", 1))
553
+
554
+ logger.info("=" * 50)
555
+ logger.info(f"🌐 Starting server on {host}:{port}")
556
+ logger.info(f"👷 Workers: {workers}")
557
+ logger.info(f"📚 Documentation: http://{host}:{port}/docs")
558
+ logger.info("=" * 50)
559
+