mahmoudsaber0 committed
Commit 31a862d · verified · 1 Parent(s): 18e2e5a

Update app.py

Files changed (1)
  1. app.py +157 -68
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import logging
 import gc
 import sys
+import pwd  # Added for monkey patch
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -11,6 +12,35 @@ from typing import Dict, List, Optional
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
+from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
+
+# =====================================================
+# 🛠️ Monkey Patch for Docker/Container UID Issue
+# =====================================================
+# Fix for 'getpwuid(): uid not found: 1000' in containerized environments
+def patched_getpwuid(uid_num):
+    try:
+        return original_getpwuid(uid_num)
+    except KeyError:
+        if uid_num == os.getuid():
+            # Build a synthetic passwd entry for the current UID
+            # (pwd.struct_passwd is constructed from a 7-item sequence)
+            return pwd.struct_passwd((
+                'dockeruser',    # pw_name
+                'x',             # pw_passwd
+                uid_num,         # pw_uid
+                os.getgid(),     # pw_gid
+                'Docker User',   # pw_gecos
+                '/tmp',          # pw_dir
+                '/bin/sh'        # pw_shell
+            ))
+        raise
+
+original_getpwuid = pwd.getpwuid
+pwd.getpwuid = patched_getpwuid
+
+# Set fallback env vars to avoid user-dependent paths
+os.environ.setdefault('HOME', '/tmp')
+os.environ.setdefault('USER', 'dockeruser')
 
 # =====================================================
 # 🔧 Environment configuration and settings
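The patch above matters because code that resolves a home directory (`os.path.expanduser`, `getpass.getuser`, and the Hugging Face cache logic built on them) calls `pwd.getpwuid` on POSIX systems, which raises `KeyError` when the container runs with an arbitrary, unmapped UID. A minimal sanity check, assuming app.py (which applies the patch at import time) has already been imported:

    import os, pwd

    # With an unmapped UID this now returns the synthetic entry instead of raising
    entry = pwd.getpwuid(os.getuid())
    print(entry.pw_name, entry.pw_dir)  # e.g. "dockeruser /tmp" inside such a container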
@@ -78,93 +108,133 @@ class ModelManager:
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
+        self.base_model_id = "answerdotai/ModernBERT-base"  # Primary
+        self.fallback_model_id = "bert-base-uncased"  # Fallback if ModernBERT fails
+        self.using_fallback = False
 
     def load_tokenizer(self):
-        """Load the tokenizer with error handling"""
+        """Load the tokenizer with a fallback"""
         try:
-            logger.info("📝 Loading tokenizer...")
+            logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
             self.tokenizer = AutoTokenizer.from_pretrained(
-                "answerdotai/ModernBERT-base",
+                self.base_model_id,
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )
+            logger.info("✅ Primary tokenizer loaded successfully")
 
-            # Set up the text normalizer
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load primary tokenizer: {e}")
             try:
-                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
-                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-                self.tokenizer.backend_tokenizer.normalizer = Sequence([
-                    self.tokenizer.backend_tokenizer.normalizer,
-                    join_hyphen_break,
-                    newline_to_space,
-                    Strip()
-                ])
-            except Exception as e:
-                logger.warning(f"⚠️ Could not set custom normalizer: {e}")
-
-            logger.info("✅ Tokenizer loaded successfully")
-            return True
-
+                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.fallback_model_id,
+                    cache_dir=CACHE_DIR,
+                    use_fast=True,
+                    trust_remote_code=False
+                )
+                self.using_fallback = True
+                logger.info("✅ Fallback tokenizer loaded successfully")
+            except Exception as fallback_e:
+                logger.error(f"❌ Failed to load fallback tokenizer: {fallback_e}")
+                return False
+
+        # Set up the text normalizer
+        try:
+            newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
+            join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
+            self.tokenizer.backend_tokenizer.normalizer = Sequence([
+                self.tokenizer.backend_tokenizer.normalizer,
+                join_hyphen_break,
+                newline_to_space,
+                Strip()
+            ])
         except Exception as e:
-            logger.error(f"Failed to load tokenizer: {e}")
-            return False
+            logger.warning(f"⚠️ Could not set custom normalizer: {e}")
+
+        return True
 
     def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
-        """Load a single model with comprehensive error handling"""
+        """Load a single model with a fallback and comprehensive error handling"""
+        base_model = None
         try:
-            logger.info(f"🤖 Loading {model_name}...")
+            logger.info(f"🤖 Loading base {model_name} from {self.base_model_id}...")
 
-            # Create the base model
+            # Try to load the primary base model
             base_model = AutoModelForSequenceClassification.from_pretrained(
-                "answerdotai/ModernBERT-base",
+                self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
            )
+            logger.info("✅ Primary base model loaded")
 
-            # Try to load the weights
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load primary base model: {e}")
+            try:
+                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                base_model = AutoModelForSequenceClassification.from_pretrained(
+                    self.fallback_model_id,
+                    num_labels=41,
+                    cache_dir=CACHE_DIR,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=False
+                )
+                self.using_fallback = True
+                logger.info("✅ Fallback base model loaded (note: weights may not be compatible)")
+            except Exception as fallback_e:
+                logger.error(f"❌ Failed to load fallback base model: {fallback_e}")
+                return None
+
+        # Try to load the weights (only if not using the fallback, or if compatible)
+        try:
             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
-                logger.info(f"🌐 Downloading weights from: {model_url}")
-                try:
-                    state_dict = torch.hub.load_state_dict_from_url(
-                        model_url,
-                        map_location=device,
-                        progress=True,
-                        check_hash=False,
-                        file_name=f"{model_name}.pt"
-                    )
-                    base_model.load_state_dict(state_dict, strict=False)
-                except Exception as url_error:
-                    logger.warning(f"⚠️ Could not load weights from URL: {url_error}")
-                    logger.info("📊 Using model with random initialization")
+                # Use hf_hub_download instead of torch.hub for HF repos
+                logger.info("🌐 Downloading weights from HF repo...")
+                repo_id = "mihalykiss/modernbert_2"
+                filename = model_url.split('/')[-1]  # Extract filename like "Model_groups_3class_seed12"
+                pt_file = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=filename,
+                    cache_dir=CACHE_DIR,
+                    local_dir_use_symlinks=False
+                )
+                state_dict = torch.load(pt_file, map_location=device, weights_only=True)
+
+                # Load the weights only when not in fallback mode
+                # (ModernBERT weights may not be compatible with standard BERT)
+                if not self.using_fallback:
+                    base_model.load_state_dict(state_dict, strict=False)
+                    logger.info("✅ Weights loaded successfully")
+                else:
+                    logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
             else:
                 logger.info("📊 Using model with random initialization")
-
-            # Move the model to the appropriate device
-            model = base_model.to(device)
-            model.eval()
-
-            # Clean up memory
-            if 'state_dict' in locals():
-                del state_dict
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            logger.info(f"✅ {model_name} loaded successfully")
-            return model
-
-        except Exception as e:
-            logger.error(f"❌ Failed to load {model_name}: {e}")
-            return None
-
+        except Exception as weight_error:
+            logger.warning(f"⚠️ Could not load weights: {weight_error}")
+            logger.info("📊 Continuing with base model (random or pre-trained init)")
+
+        # Move the model to the appropriate device
+        model = base_model.to(device)
+        model.eval()
+
+        # Clean up memory
+        if 'state_dict' in locals():
+            del state_dict
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
+        return model
+
     def load_models(self, max_models=2):
         """Load the models with a memory cap"""
         if self.models_loaded:
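A note on the new download path: `hf_hub_download` resolves files through the Hub's local cache rather than `torch.hub`'s, so repeated startups reuse the cached weights. The commit hardcodes `repo_id = "mihalykiss/modernbert_2"`; if more repos were ever added to `model_urls`, both pieces could instead be derived from the URL itself. A hypothetical helper (not part of the commit) for `https://huggingface.co/{repo_id}/resolve/{revision}/{filename}` URLs:

    from urllib.parse import urlparse

    def parse_hf_resolve_url(url):
        # path looks like: /{org}/{repo}/resolve/{revision}/{filename...}
        parts = urlparse(url).path.strip("/").split("/")
        repo_id = "/".join(parts[:2])
        filename = "/".join(parts[4:])  # keeps nested paths after the revision
        return repo_id, filename

    parse_hf_resolve_url(
        "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
    )  # -> ('mihalykiss/modernbert_2', 'Model_groups_3class_seed12')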
@@ -173,6 +243,7 @@ class ModelManager:
 
         # Load the tokenizer first
         if not self.load_tokenizer():
+            logger.error("❌ Tokenizer load failed - cannot proceed")
             return False
 
         # Load the models
@@ -188,13 +259,14 @@ class ModelManager:
             if model is not None:
                 self.models.append(model)
 
-        # Load the models from URLs
-        for i, url in enumerate(self.model_urls[:max_models - len(self.models)]):
+        # Load the models from URLs (filenames are extracted downstream)
+        for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
             if len(self.models) >= max_models:
                 break
 
+            # Pass full_url as-is; load_single_model extracts the filename
             model = self.load_single_model(
-                model_url=url,
+                model_url=full_url,
                 model_name=f"Model {len(self.models) + 1}"
             )
             if model is not None:
@@ -214,7 +286,7 @@ class ModelManager:
         # Verify the load succeeded
         if len(self.models) > 0:
             self.models_loaded = True
-            logger.info(f"✅ Successfully loaded {len(self.models)} models")
+            logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
         else:
             logger.error("❌ No models could be loaded")
@@ -230,13 +302,14 @@ class ModelManager:
         if not cleaned_text.strip():
             raise ValueError("Empty text after cleaning")
 
-        # Tokenization
+        # Tokenization (BERT fallback and ModernBERT share the 512-token cap)
+        max_len = 512  # bert-base-uncased cannot go beyond 512 positions
         try:
             inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=512,
+                max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
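Both branches of the new cap resolve to 512: `bert-base-uncased` cannot exceed its 512 learned position embeddings, while ModernBERT (which natively handles much longer sequences) is simply truncated to the same limit so the two paths behave identically. A quick, hypothetical check of the truncation behaviour (not part of the commit):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    ids = tok("word " * 2000, truncation=True, max_length=512)["input_ids"]
    assert len(ids) <= 512  # anything longer is cut to the shared cap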
@@ -297,7 +370,8 @@ class ModelManager:
             "predicted_model": predicted_model,
             "top_5_predictions": top_5_results,
             "is_human": human_percentage > ai_percentage,
-            "models_used": len(all_probabilities)
+            "models_used": len(all_probabilities),
+            "using_fallback": self.using_fallback
         }
 
 # =====================================================
@@ -320,7 +394,7 @@ def split_into_paragraphs(text: str) -> List[str]:
 app = FastAPI(
     title="ModernBERT AI Text Detector",
     description="Detects texts written by AI",
-    version="2.0.0"
+    version="2.2.0"  # Updated version with the UID fix
 )
 
 # Add CORS to allow use from the browser
@@ -361,6 +435,9 @@ async def startup_event():
     logger.info("🚀 Starting ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
+    import transformers
+    logger.info(f"🔧 Transformers version: {transformers.__version__}")
+    logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
     logger.info("=" * 50)
 
     # Try to load the models
@@ -368,9 +445,10 @@ async def startup_event():
     success = model_manager.load_models(max_models=max_models)
 
     if success:
-        logger.info("✅ Application ready!")
+        logger.info("✅ Application ready! (Fallback mode: %s)", model_manager.using_fallback)
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
+        logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")
 
 @app.get("/")
 async def root():
@@ -379,6 +457,7 @@ async def root():
         "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
         "models_loaded": len(model_manager.models),
+        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
@@ -401,6 +480,7 @@ async def health_check():
     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
         "models_loaded": len(model_manager.models),
+        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
@@ -430,7 +510,7 @@ async def analyze_text(data: TextInput):
         return DetectionResult(
             success=False,
             code=503,
-            message="Models not available",
+            message="Models not available. Check logs for details.",
             data={}
         )
 
@@ -497,7 +577,8 @@
                 "input_text": text[:500] + "..." if len(text) > 500 else text,
                 "detected_language": "en",
                 "top_5_predictions": result.get("top_5_predictions", []),
-                "models_used": result.get("models_used", 1)
+                "models_used": result.get("models_used", 1),
+                "using_fallback": result.get("using_fallback", False)
             }
         )
 
@@ -531,7 +612,8 @@ async def analyze_simple(data: SimpleTextInput):
             "ai_score": result["ai_percentage"],
             "human_score": result["human_percentage"],
             "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
-            "confidence": max(result["ai_percentage"], result["human_percentage"])
+            "confidence": max(result["ai_percentage"], result["human_percentage"]),
+            "using_fallback": result.get("using_fallback", False)
         }
 
     except HTTPException:
@@ -557,3 +639,10 @@ if __name__ == "__main__":
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)
 
+    uvicorn.run(
+        "app:app",  # Import string for this file (app.py)
+        host=host,
+        port=port,
+        workers=workers,
+        reload=False  # Set to True for dev
+    )
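With the explicit `uvicorn.run` call in place, `python app.py` starts the server directly. An illustrative client call against the simple endpoint (this assumes the Hugging Face Space default port 7860 and that `SimpleTextInput`, which is defined outside this diff, exposes a single `text` field; both are assumptions):

    import json, urllib.request

    req = urllib.request.Request(
        "http://localhost:7860/analyze_simple",
        data=json.dumps({"text": "Sample paragraph to classify."}).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.load(resp))
    # Response fields per the handler above:
    # {"ai_score": ..., "human_score": ..., "detected_model": ..., "confidence": ..., "using_fallback": ...}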