mahmoudsaber0 committed
Commit 31a862d · verified · 1 Parent(s): 18e2e5a

Update app.py

Files changed (1)
  1. app.py +157 -68
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import logging
 import gc
 import sys
+import pwd  # Added for monkey patch
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -11,6 +12,35 @@ from typing import Dict, List, Optional
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
+from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
+
+# =====================================================
+# 🛠️ Monkey Patch for Docker/Container UID Issue
+# =====================================================
+# Fix for 'getpwuid(): uid not found: 1000' in containerized environments
+def patched_getpwuid(uid_num):
+    try:
+        return original_getpwuid(uid_num)
+    except KeyError:
+        if uid_num == os.getuid():
+            # Build a synthetic passwd entry for the current UID
+            # (pwd.struct_passwd is constructed from a 7-item sequence)
+            return pwd.struct_passwd((
+                'dockeruser',    # pw_name
+                'x',             # pw_passwd
+                uid_num,         # pw_uid
+                os.getgid(),     # pw_gid
+                'Docker User',   # pw_gecos
+                '/tmp',          # pw_dir
+                '/bin/sh'        # pw_shell
+            ))
+        raise
+
+original_getpwuid = pwd.getpwuid
+pwd.getpwuid = patched_getpwuid
+
+# Set fallback env vars to avoid user-dependent paths
+os.environ.setdefault('HOME', '/tmp')
+os.environ.setdefault('USER', 'dockeruser')
 
 # =====================================================
 # 🔧 Environment configuration and settings
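The patch above matters because code that resolves a home directory (`os.path.expanduser`, `getpass.getuser`, and the Hugging Face cache logic built on them) calls `pwd.getpwuid` on POSIX systems, which raises `KeyError` when the container runs with an arbitrary, unmapped UID. A minimal sanity check, assuming app.py (which applies the patch at import time) has already been imported:

    import os, pwd

    # With an unmapped UID this now returns the synthetic entry instead of raising
    entry = pwd.getpwuid(os.getuid())
    print(entry.pw_name, entry.pw_dir)  # e.g. "dockeruser /tmp" inside such a container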
@@ -78,93 +108,133 @@ class ModelManager:
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
+        self.base_model_id = "answerdotai/ModernBERT-base"  # Primary
+        self.fallback_model_id = "bert-base-uncased"  # Fallback if ModernBERT fails
+        self.using_fallback = False
 
     def load_tokenizer(self):
-        """Load the tokenizer with error handling"""
+        """Load the tokenizer with a fallback"""
         try:
-            logger.info("📝 Loading tokenizer...")
+            logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
             self.tokenizer = AutoTokenizer.from_pretrained(
-                "answerdotai/ModernBERT-base",
+                self.base_model_id,
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )
+            logger.info("✅ Primary tokenizer loaded successfully")
 
-            # Set up the text normalizer
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load primary tokenizer: {e}")
             try:
-                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
-                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-                self.tokenizer.backend_tokenizer.normalizer = Sequence([
-                    self.tokenizer.backend_tokenizer.normalizer,
-                    join_hyphen_break,
-                    newline_to_space,
-                    Strip()
-                ])
-            except Exception as e:
-                logger.warning(f"⚠️ Could not set custom normalizer: {e}")
-
-            logger.info("✅ Tokenizer loaded successfully")
-            return True
-
+                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.fallback_model_id,
+                    cache_dir=CACHE_DIR,
+                    use_fast=True,
+                    trust_remote_code=False
+                )
+                self.using_fallback = True
+                logger.info("✅ Fallback tokenizer loaded successfully")
+            except Exception as fallback_e:
+                logger.error(f"❌ Failed to load fallback tokenizer: {fallback_e}")
+                return False
+
+        # Set up the text normalizer
+        try:
+            newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
+            join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
+            self.tokenizer.backend_tokenizer.normalizer = Sequence([
+                self.tokenizer.backend_tokenizer.normalizer,
+                join_hyphen_break,
+                newline_to_space,
+                Strip()
+            ])
         except Exception as e:
-            logger.error(f"Failed to load tokenizer: {e}")
-            return False
+            logger.warning(f"⚠️ Could not set custom normalizer: {e}")
+
+        return True
 
     def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
-        """Load a single model with comprehensive error handling"""
+        """Load a single model with a fallback and comprehensive error handling"""
+        base_model = None
         try:
-            logger.info(f"🤖 Loading {model_name}...")
+            logger.info(f"🤖 Loading base {model_name} from {self.base_model_id}...")
 
-            # Create the base model
+            # Try to load the primary base model
             base_model = AutoModelForSequenceClassification.from_pretrained(
-                "answerdotai/ModernBERT-base",
+                self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
            )
+            logger.info("✅ Primary base model loaded")
 
-            # Try to load the weights
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load primary base model: {e}")
+            try:
+                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                base_model = AutoModelForSequenceClassification.from_pretrained(
+                    self.fallback_model_id,
+                    num_labels=41,
+                    cache_dir=CACHE_DIR,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=False
+                )
+                self.using_fallback = True
+                logger.info("✅ Fallback base model loaded (note: weights may not be compatible)")
+            except Exception as fallback_e:
+                logger.error(f"❌ Failed to load fallback base model: {fallback_e}")
+                return None
+
+        # Try to load the weights (only if not using the fallback, or if compatible)
+        try:
             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
-                logger.info(f"🌐 Downloading weights from: {model_url}")
-                try:
-                    state_dict = torch.hub.load_state_dict_from_url(
-                        model_url,
-                        map_location=device,
-                        progress=True,
-                        check_hash=False,
-                        file_name=f"{model_name}.pt"
-                    )
-                    base_model.load_state_dict(state_dict, strict=False)
-                except Exception as url_error:
-                    logger.warning(f"⚠️ Could not load weights from URL: {url_error}")
-                    logger.info("📊 Using model with random initialization")
+                # Use hf_hub_download instead of torch.hub for HF repos
+                logger.info("🌐 Downloading weights from HF repo...")
+                repo_id = "mihalykiss/modernbert_2"
+                filename = model_url.split('/')[-1]  # Extract filename like "Model_groups_3class_seed12"
+                pt_file = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=filename,
+                    cache_dir=CACHE_DIR,
+                    local_dir_use_symlinks=False
+                )
+                state_dict = torch.load(pt_file, map_location=device, weights_only=True)
+
+                # Load the weights only when not in fallback mode
+                # (ModernBERT weights may not be compatible with standard BERT)
+                if not self.using_fallback:
+                    base_model.load_state_dict(state_dict, strict=False)
+                    logger.info("✅ Weights loaded successfully")
+                else:
+                    logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
             else:
                 logger.info("📊 Using model with random initialization")
-
-            # Move the model to the appropriate device
-            model = base_model.to(device)
-            model.eval()
-
-            # Clean up memory
-            if 'state_dict' in locals():
-                del state_dict
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            logger.info(f"✅ {model_name} loaded successfully")
-            return model
-
-        except Exception as e:
-            logger.error(f"❌ Failed to load {model_name}: {e}")
-            return None
-
+        except Exception as weight_error:
+            logger.warning(f"⚠️ Could not load weights: {weight_error}")
+            logger.info("📊 Continuing with base model (random or pre-trained init)")
+
+        # Move the model to the appropriate device
+        model = base_model.to(device)
+        model.eval()
+
+        # Clean up memory
+        if 'state_dict' in locals():
+            del state_dict
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
+        return model
+
     def load_models(self, max_models=2):
         """Load the models with a memory cap"""
         if self.models_loaded:
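A note on the new download path: `hf_hub_download` resolves files through the Hub's local cache rather than `torch.hub`'s, so repeated startups reuse the cached weights. The commit hardcodes `repo_id = "mihalykiss/modernbert_2"`; if more repos were ever added to `model_urls`, both pieces could instead be derived from the URL itself. A hypothetical helper (not part of the commit) for `https://huggingface.co/{repo_id}/resolve/{revision}/{filename}` URLs:

    from urllib.parse import urlparse

    def parse_hf_resolve_url(url):
        # path looks like: /{org}/{repo}/resolve/{revision}/{filename...}
        parts = urlparse(url).path.strip("/").split("/")
        repo_id = "/".join(parts[:2])
        filename = "/".join(parts[4:])  # keeps nested paths after the revision
        return repo_id, filename

    parse_hf_resolve_url(
        "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
    )  # -> ('mihalykiss/modernbert_2', 'Model_groups_3class_seed12')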
@@ -173,6 +243,7 @@ class ModelManager:
 
         # Load the tokenizer first
         if not self.load_tokenizer():
+            logger.error("❌ Tokenizer load failed - cannot proceed")
             return False
 
         # Load the models
@@ -188,13 +259,14 @@ class ModelManager:
             if model is not None:
                 self.models.append(model)
 
-        # Load the models from URLs
-        for i, url in enumerate(self.model_urls[:max_models - len(self.models)]):
+        # Load the models from URLs (filenames are extracted downstream)
+        for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
             if len(self.models) >= max_models:
                 break
 
+            # Pass full_url as-is; load_single_model extracts the filename
             model = self.load_single_model(
-                model_url=url,
+                model_url=full_url,
                 model_name=f"Model {len(self.models) + 1}"
             )
             if model is not None:
@@ -214,7 +286,7 @@ class ModelManager:
         # Verify the load succeeded
         if len(self.models) > 0:
             self.models_loaded = True
-            logger.info(f"✅ Successfully loaded {len(self.models)} models")
+            logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
         else:
             logger.error("❌ No models could be loaded")
@@ -230,13 +302,14 @@ class ModelManager:
         if not cleaned_text.strip():
             raise ValueError("Empty text after cleaning")
 
-        # Tokenization
+        # Tokenization (BERT fallback and ModernBERT share the 512-token cap)
+        max_len = 512  # bert-base-uncased cannot go beyond 512 positions
         try:
             inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=512,
+                max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
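Both branches of the new cap resolve to 512: `bert-base-uncased` cannot exceed its 512 learned position embeddings, while ModernBERT (which natively handles much longer sequences) is simply truncated to the same limit so the two paths behave identically. A quick, hypothetical check of the truncation behaviour (not part of the commit):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    ids = tok("word " * 2000, truncation=True, max_length=512)["input_ids"]
    assert len(ids) <= 512  # anything longer is cut to the shared cap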
@@ -297,7 +370,8 @@ class ModelManager:
             "predicted_model": predicted_model,
             "top_5_predictions": top_5_results,
             "is_human": human_percentage > ai_percentage,
-            "models_used": len(all_probabilities)
+            "models_used": len(all_probabilities),
+            "using_fallback": self.using_fallback
         }
 
 # =====================================================
@@ -320,7 +394,7 @@ def split_into_paragraphs(text: str) -> List[str]:
 app = FastAPI(
     title="ModernBERT AI Text Detector",
     description="Detects texts written by AI",
-    version="2.0.0"
+    version="2.2.0"  # Updated version with the UID fix
 )
 
 # Add CORS to allow use from the browser
@@ -361,6 +435,9 @@ async def startup_event():
     logger.info("🚀 Starting ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
+    import transformers
+    logger.info(f"🔧 Transformers version: {transformers.__version__}")
+    logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
     logger.info("=" * 50)
 
     # Try to load the models
@@ -368,9 +445,10 @@ async def startup_event():
     success = model_manager.load_models(max_models=max_models)
 
     if success:
-        logger.info("✅ Application ready!")
+        logger.info("✅ Application ready! (Fallback mode: %s)", model_manager.using_fallback)
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
+        logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")
 
 @app.get("/")
 async def root():
@@ -379,6 +457,7 @@ async def root():
         "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
         "models_loaded": len(model_manager.models),
+        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
@@ -401,6 +480,7 @@ async def health_check():
     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
         "models_loaded": len(model_manager.models),
+        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
@@ -430,7 +510,7 @@ async def analyze_text(data: TextInput):
         return DetectionResult(
             success=False,
             code=503,
-            message="Models not available",
+            message="Models not available. Check logs for details.",
             data={}
         )
 
@@ -497,7 +577,8 @@
                 "input_text": text[:500] + "..." if len(text) > 500 else text,
                 "detected_language": "en",
                 "top_5_predictions": result.get("top_5_predictions", []),
-                "models_used": result.get("models_used", 1)
+                "models_used": result.get("models_used", 1),
+                "using_fallback": result.get("using_fallback", False)
             }
         )
 
@@ -531,7 +612,8 @@ async def analyze_simple(data: SimpleTextInput):
             "ai_score": result["ai_percentage"],
             "human_score": result["human_percentage"],
             "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
-            "confidence": max(result["ai_percentage"], result["human_percentage"])
+            "confidence": max(result["ai_percentage"], result["human_percentage"]),
+            "using_fallback": result.get("using_fallback", False)
         }
 
     except HTTPException:
@@ -557,3 +639,10 @@ if __name__ == "__main__":
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)
 
+    uvicorn.run(
+        "app:app",  # Import string for this file (app.py)
+        host=host,
+        port=port,
+        workers=workers,
+        reload=False  # Set to True for dev
+    )
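With the explicit `uvicorn.run` call in place, `python app.py` starts the server directly. An illustrative client call against the simple endpoint (this assumes the Hugging Face Space default port 7860 and that `SimpleTextInput`, which is defined outside this diff, exposes a single `text` field; both are assumptions):

    import json, urllib.request

    req = urllib.request.Request(
        "http://localhost:7860/analyze_simple",
        data=json.dumps({"text": "Sample paragraph to classify."}).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.load(resp))
    # Response fields per the handler above:
    # {"ai_score": ..., "human_score": ..., "detected_model": ..., "confidence": ..., "using_fallback": ...}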