Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import torch | |
| import logging | |
| import gc | |
| import sys | |
| import numpy as np | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from typing import Dict, List, Optional | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| AutoModelForCausalLM, | |
| pipeline | |
| ) | |
| from tokenizers.normalizers import Sequence, Replace, Strip | |
| from tokenizers import Regex | |
| import math | |
| from collections import Counter | |
| # ===================================================== | |
| # 🔧 تكوين البيئة والإعدادات | |
| # ===================================================== | |
# Process-wide logging: timestamped INFO-level records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Memory and cache settings: all downloads share one scratch directory.
CACHE_DIR = "/tmp/huggingface_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Environment variables for Hugging Face / PyTorch caches.
# NOTE(review): these are assigned after `torch` and `transformers` were
# imported at the top of the file; values a library reads at import time may
# not be picked up -- confirm.
os.environ.update(dict.fromkeys(
    (
        "HF_HOME",
        "TRANSFORMERS_CACHE",
        "HF_DATASETS_CACHE",
        "HUGGINGFACE_HUB_CACHE",
        "TORCH_HOME",
    ),
    CACHE_DIR,
))
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    TRANSFORMERS_OFFLINE="0",
)

# PyTorch memory settings, applied only when a GPU is visible.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# =====================================================
# 🚀 Device selection (GPU or CPU)
# =====================================================
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
logger.info(f"🖥️ Using device: {device}")
if device.type == 'cuda':
    logger.info(f"🎮 CUDA Device: {torch.cuda.get_device_name(0)}")
    logger.info(f"💾 CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
| # ===================================================== | |
| # 📊 خريطة الموديلات | |
| # ===================================================== | |
# Class index -> source label for the 41-way ModernBERT classification head.
# Position in the tuple is the class index; index 24 is the 'human' class.
label_mapping = dict(enumerate((
    '13B', '30B', '65B', '7B', 'GLM130B', 'bloom_7b',
    'bloomz', 'cohere', 'davinci', 'dolly', 'dolly-v2-12b',
    'flan_t5_base', 'flan_t5_large', 'flan_t5_small',
    'flan_t5_xl', 'flan_t5_xxl', 'gemma-7b-it', 'gemma2-9b-it',
    'gpt-3.5-turbo', 'gpt-35', 'gpt4', 'gpt4o',
    'gpt_j', 'gpt_neox', 'human', 'llama3-70b', 'llama3-8b',
    'mixtral-8x7b', 'opt_1.3b', 'opt_125m', 'opt_13b',
    'opt_2.7b', 'opt_30b', 'opt_350m', 'opt_6.7b',
    'opt_iml_30b', 'opt_iml_max_1.3b', 't0_11b', 't0_3b',
    'text-davinci-002', 'text-davinci-003',
)))
| # ===================================================== | |
| # 📈 حسابات Perplexity و Burstiness | |
| # ===================================================== | |
class TextMetrics:
    """Statistical text metrics used as extra evidence next to the models.

    None of the helpers use instance state, so they are static methods and can
    be called either on the class or on an instance.
    """

    @staticmethod
    def calculate_perplexity(text: str, model=None, tokenizer=None):
        """Return a perplexity estimate for ``text``, capped at 1000.

        Without a model/tokenizer pair the value is an approximation:
        ``2 ** H`` where ``H`` is the Shannon entropy of the lower-cased,
        whitespace-split word distribution. With both supplied, it is
        ``exp(loss)`` of the model scored against the text itself.

        Args:
            text: Text to score.
            model: Optional language model called as
                ``model(**inputs, labels=input_ids)``.
            tokenizer: Optional tokenizer matching ``model``.

        Returns:
            The perplexity value, or 50.0 if anything fails.
        """
        try:
            if model is None or tokenizer is None:
                # Approximation from word frequencies only.
                words = text.lower().split()
                total_words = len(words)
                entropy = 0.0
                for count in Counter(words).values():
                    probability = count / total_words
                    entropy -= probability * math.log2(probability)
                return min(2 ** entropy, 1000)  # Cap at 1000

            # Real computation with a language model.
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs, labels=inputs["input_ids"])
            return min(torch.exp(outputs.loss).item(), 1000)
        except Exception as e:
            logger.warning(f"Error calculating perplexity: {e}")
            return 50.0  # Default value

    @staticmethod
    def calculate_burstiness(text: str):
        """Return the variation of sentence length in ``text``.

        Sentences are split on runs of ``.``, ``!`` and ``?``; the result is
        the standard deviation of the per-sentence word counts divided by
        their mean, rounded to 4 decimals.

        Returns:
            0.0 when fewer than two sentences are found, 0.5 if the
            computation fails, otherwise the ratio as a plain ``float``.
        """
        try:
            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
            if len(sentences) < 2:
                return 0.0

            sentence_lengths = [len(s.split()) for s in sentences]
            mean_length = np.mean(sentence_lengths)
            std_length = np.std(sentence_lengths)

            burstiness = std_length / mean_length if mean_length > 0 else 0.0
            # Convert the numpy scalar so callers get a built-in float.
            return float(round(burstiness, 4))
        except Exception as e:
            logger.warning(f"Error calculating burstiness: {e}")
            return 0.5

    @staticmethod
    def calculate_vocabulary_diversity(text: str):
        """Return unique words / total words (lower-cased), rounded to 4 decimals.

        Returns 0 for text without any words.
        """
        words = text.lower().split()
        if not words:
            return 0
        return round(len(set(words)) / len(words), 4)

    @staticmethod
    def detect_ai_patterns(text: str):
        """Count how many of the stock "AI-style" phrases occur in ``text``.

        Each pattern counts at most once, however often it appears.
        """
        ai_patterns = [
            r"it['\s]+s important to note",
            r"in conclusion",
            r"furthermore",
            r"comprehensive understanding",
            r"it is worth noting",
            r"however, it should be noted",
            r"on the other hand",
            r"in summary",
            r"to begin with",
            r"first and foremost"
        ]
        lowered = text.lower()  # lower-case once instead of once per pattern
        return sum(1 for pattern in ai_patterns if re.search(pattern, lowered))

    @staticmethod
    def detect_human_patterns(text: str):
        """Count how many of the informal "human-style" patterns occur in ``text``.

        Each pattern counts at most once, however often it appears.
        """
        human_patterns = [
            r"kinda|sorta|gonna|wanna|gotta",
            r"tbh|idk|lol|omg|btw",
            r"!{2,}|\?{2,}|\.{3,}",
            r"i think|i feel|i believe",
            r"like,|you know,|i mean,",
            r"anyway|anyhow|whatever"
        ]
        lowered = text.lower()  # lower-case once instead of once per pattern
        return sum(1 for pattern in human_patterns if re.search(pattern, lowered))
| # ===================================================== | |
| # 🤖 Model Manager - إدارة الموديلات المحسنة | |
| # ===================================================== | |
class EnhancedModelManager:
    """Load the detector models and merge their outputs into one verdict.

    Two families of models are handled: ModernBERT checkpoints sharing a
    41-class head (see ``label_mapping``) and additional detectors, stored
    either as ``transformers`` pipelines or as raw models with a tokenizer.
    """

    def __init__(self):
        """Create empty registries; nothing is downloaded here."""
        self.modernbert_tokenizer = None
        self.modernbert_models = []
        # name -> pipeline or raw model. Raw models also get an entry in
        # ``additional_tokenizers``; that entry is what tells them apart.
        self.additional_models = {}
        self.additional_tokenizers = {}
        self.models_loaded = False
        self.metrics = TextMetrics()
        # ModernBERT URLs
        self.modernbert_urls = [
            "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
            "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
        ]
        # Additional models to try
        self.additional_model_configs = [
            {
                "name": "chatgpt-detector-roberta",
                "model_id": "Hello-SimpleAI/chatgpt-detector-roberta",
                "type": "classification"
            },
            {
                "name": "openai-detector",
                "model_id": "roberta-base-openai-detector",
                "type": "classification"
            },
            {
                "name": "ai-content-detector",
                "model_id": "PirateXX/AI-Content-Detector",
                "type": "classification"
            }
        ]

    def load_modernbert_tokenizer(self):
        """Load the ModernBERT tokenizer and extend its normalizer.

        Returns:
            True on success, False if the tokenizer could not be loaded. A
            failure to install the custom normalizer is only logged.
        """
        try:
            logger.info("📝 Loading ModernBERT tokenizer...")
            self.modernbert_tokenizer = AutoTokenizer.from_pretrained(
                "answerdotai/ModernBERT-base",
                cache_dir=CACHE_DIR,
                use_fast=True,
                trust_remote_code=False
            )
            # Text pre-processing: re-join hyphenated line breaks, turn
            # newlines into spaces, strip the ends.
            try:
                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
                self.modernbert_tokenizer.backend_tokenizer.normalizer = Sequence([
                    self.modernbert_tokenizer.backend_tokenizer.normalizer,
                    join_hyphen_break,
                    newline_to_space,
                    Strip()
                ])
            except Exception as e:
                logger.warning(f"⚠️ Could not set custom normalizer: {e}")
            logger.info("✅ ModernBERT tokenizer loaded")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to load tokenizer: {e}")
            return False

    def load_modernbert_model(self, model_url=None, model_path=None, model_name="ModernBERT"):
        """Build one ModernBERT classifier and load fine-tuned weights into it.

        Args:
            model_url: URL of a state dict, used when no local file applies.
            model_path: Local state-dict file; takes precedence if it exists.
            model_name: Name used in log messages and for the download file.

        Returns:
            The model in eval mode on ``device``, or None if construction
            failed. If only the weight download fails, the model is still
            returned with whatever weights ``from_pretrained`` produced.
        """
        try:
            logger.info(f"🤖 Loading {model_name}...")
            base_model = AutoModelForSequenceClassification.from_pretrained(
                "answerdotai/ModernBERT-base",
                num_labels=41,
                cache_dir=CACHE_DIR,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True,
                trust_remote_code=False
            )
            state_dict = None
            if model_path and os.path.exists(model_path):
                logger.info(f"📁 Loading from local file: {model_path}")
                state_dict = torch.load(model_path, map_location=device, weights_only=True)
                base_model.load_state_dict(state_dict, strict=False)
            elif model_url:
                logger.info(f"🌐 Downloading weights from URL...")
                try:
                    state_dict = torch.hub.load_state_dict_from_url(
                        model_url,
                        map_location=device,
                        progress=True,
                        check_hash=False,
                        file_name=f"{model_name}.pt"
                    )
                    base_model.load_state_dict(state_dict, strict=False)
                except Exception as e:
                    logger.warning(f"⚠️ Could not load weights: {e}")
                    logger.info("📊 Using model with random initialization")
            model = base_model.to(device)
            model.eval()
            # Drop the extra copy of the weights before the next model loads.
            del state_dict
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            logger.info(f"✅ {model_name} loaded")
            return model
        except Exception as e:
            logger.error(f"❌ Failed to load {model_name}: {e}")
            return None

    def load_additional_model(self, model_config):
        """Load one extra detector, as a pipeline if possible, else manually.

        Args:
            model_config: Dict with at least ``"name"`` and ``"model_id"``.

        Returns:
            True if the model was registered, False otherwise.
        """
        try:
            model_name = model_config["name"]
            model_id = model_config["model_id"]
            logger.info(f"🔧 Loading {model_name}...")
            # Try loading as a pipeline first (easier)
            try:
                classifier = pipeline(
                    "text-classification",
                    model=model_id,
                    device=0 if torch.cuda.is_available() else -1,
                    model_kwargs={"cache_dir": CACHE_DIR}
                )
                self.additional_models[model_name] = classifier
                logger.info(f"✅ {model_name} loaded as pipeline")
                return True
            except Exception as pipeline_error:
                # Not a bare ``except``: KeyboardInterrupt/SystemExit must
                # still be able to stop a long download.
                logger.info(f"Pipeline load failed for {model_name} ({pipeline_error}); loading manually")
                tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    cache_dir=CACHE_DIR
                )
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_id,
                    cache_dir=CACHE_DIR,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
                ).to(device)
                model.eval()
                self.additional_tokenizers[model_name] = tokenizer
                self.additional_models[model_name] = model
                logger.info(f"✅ {model_name} loaded manually")
                return True
        except Exception as e:
            logger.warning(f"⚠️ Could not load {model_config['name']}: {e}")
            return False

    def load_all_models(self, max_modernbert=2, load_additional=True):
        """Load the tokenizer, ModernBERT models and optional extra detectors.

        Args:
            max_modernbert: Upper bound on ModernBERT models kept in memory.
            load_additional: Whether to also load the extra detectors.

        Returns:
            True if at least one model is available (or was already loaded),
            False if the tokenizer or every model failed to load.
        """
        if self.models_loaded:
            logger.info("✨ Models already loaded")
            return True
        # Load ModernBERT tokenizer
        if not self.load_modernbert_tokenizer():
            return False
        # Load ModernBERT models
        logger.info(f"🚀 Loading up to {max_modernbert} ModernBERT models...")
        # Try local file first
        local_path = "modernbert.bin"
        if os.path.exists(local_path):
            model = self.load_modernbert_model(
                model_path=local_path,
                model_name="ModernBERT-Local"
            )
            if model is not None:
                self.modernbert_models.append(model)
        # Load from URLs
        for i, url in enumerate(self.modernbert_urls[:max_modernbert - len(self.modernbert_models)]):
            if len(self.modernbert_models) >= max_modernbert:
                break
            model = self.load_modernbert_model(
                model_url=url,
                model_name=f"ModernBERT-{i+1}"
            )
            if model is not None:
                self.modernbert_models.append(model)
        # Load additional models
        if load_additional:
            logger.info("🎯 Loading additional AI detection models...")
            for config in self.additional_model_configs:
                self.load_additional_model(config)
        # Check success
        total_models = len(self.modernbert_models) + len(self.additional_models)
        if total_models > 0:
            self.models_loaded = True
            logger.info(f"✅ Loaded {len(self.modernbert_models)} ModernBERT + {len(self.additional_models)} additional models")
            return True
        logger.error("❌ No models could be loaded")
        return False

    def classify_with_modernbert(self, text: str, model_index: int):
        """Score ``text`` with one ModernBERT model.

        Class 24 is the human class; the probabilities of all other classes
        are summed into the AI score.

        Returns:
            Dict with ``model_name``, ``human_score``, ``ai_score``
            (percentages), ``predicted_model`` (most probable non-human
            label) and ``confidence``; None for a bad index or on error.
        """
        try:
            if model_index >= len(self.modernbert_models):
                return None
            model = self.modernbert_models[model_index]
            cleaned_text = clean_text(text)
            inputs = self.modernbert_tokenizer(
                cleaned_text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(device)
            with torch.no_grad():
                logits = model(**inputs).logits
            probs = torch.softmax(logits[0], dim=0)
            human_prob = probs[24].item()
            ai_probs = probs.clone()
            ai_probs[24] = 0  # leave only the AI classes
            ai_total = ai_probs.sum().item()
            total = human_prob + ai_total
            if total > 0:
                human_pct = (human_prob / total) * 100
                ai_pct = (ai_total / total) * 100
            else:
                human_pct = ai_pct = 50
            ai_model_idx = torch.argmax(ai_probs).item()
            return {
                "model_name": f"ModernBERT-{model_index+1}",
                "human_score": round(human_pct, 2),
                "ai_score": round(ai_pct, 2),
                "predicted_model": label_mapping.get(ai_model_idx, "Unknown"),
                "confidence": round(max(human_pct, ai_pct), 2)
            }
        except Exception as e:
            logger.error(f"Error in ModernBERT {model_index}: {e}")
            return None

    def classify_with_additional(self, text: str, model_name: str):
        """Score ``text`` with one of the additional detectors.

        A model registered together with a tokenizer is run manually; any
        other entry is treated as a pipeline. (Checking ``__call__`` cannot
        tell them apart because raw models are callable too.)

        Returns:
            Dict with ``model_name``, ``human_score``, ``ai_score``,
            ``predicted_model`` ("AI" or "Human") and ``confidence``; None
            for an unknown model name or on error.
        """
        try:
            if model_name not in self.additional_models:
                return None
            model = self.additional_models[model_name]
            tokenizer = self.additional_tokenizers.get(model_name)

            if tokenizer is None:
                # It's a pipeline
                result = model(text, truncation=True, max_length=512)
                # Parse results based on model output format
                ai_score = 0
                human_score = 0
                for item in result:
                    label = item['label'].lower()
                    score = item['score'] * 100
                    if 'fake' in label or 'ai' in label or 'gpt' in label:
                        ai_score = max(ai_score, score)
                    elif 'real' in label or 'human' in label:
                        human_score = max(human_score, score)
                if ai_score == 0 and human_score == 0:
                    # No recognisable label: stay neutral.
                    ai_score = human_score = 50
                elif human_score == 0:
                    # Only the top label was returned; for these two-class
                    # detectors the other class holds the remainder.
                    human_score = 100 - ai_score
                elif ai_score == 0:
                    ai_score = 100 - human_score
            else:
                # It's a raw model, use its tokenizer
                inputs = tokenizer(
                    text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding=True
                ).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                probs = torch.softmax(outputs.logits[0], dim=0)
                # Assuming binary classification (index 0 human, index 1 AI)
                if len(probs) == 2:
                    human_score = probs[0].item() * 100
                    ai_score = probs[1].item() * 100
                else:
                    # Multi-class head with unknown label order: stay neutral.
                    ai_score = human_score = 50

            return {
                "model_name": model_name,
                "human_score": round(human_score, 2),
                "ai_score": round(ai_score, 2),
                "predicted_model": "AI" if ai_score > human_score else "Human",
                "confidence": round(max(ai_score, human_score), 2)
            }
        except Exception as e:
            logger.warning(f"Error in {model_name}: {e}")
            return None

    def comprehensive_analysis(self, text: str):
        """Run every model and metric on ``text`` and build the ensemble.

        Returns:
            Dict with ``individual_models`` (per-model results),
            ``metrics``, ``pattern_analysis`` and ``ensemble_result``.
            ``ensemble_result`` stays an empty dict when no model produced a
            result.

        Raises:
            ValueError: If no models have been loaded.
        """
        if not self.models_loaded:
            raise ValueError("No models loaded")
        results = {
            "individual_models": [],
            "ensemble_result": {},
            "metrics": {},
            "pattern_analysis": {}
        }
        # The metric helpers use no instance state. Calling them through the
        # class works whether they are declared static or as plain functions.
        metric_fns = type(self.metrics)

        # 1. Calculate text metrics
        logger.info("📊 Calculating text metrics...")
        # Count only non-empty segments so trailing punctuation does not add
        # a phantom sentence.
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        results["metrics"] = {
            "perplexity": metric_fns.calculate_perplexity(text),
            "burstiness": metric_fns.calculate_burstiness(text),
            "vocabulary_diversity": metric_fns.calculate_vocabulary_diversity(text),
            "text_length": len(text.split()),
            "sentence_count": len(sentences)
        }
        # 2. Pattern detection
        results["pattern_analysis"] = {
            "ai_patterns_found": metric_fns.detect_ai_patterns(text),
            "human_patterns_found": metric_fns.detect_human_patterns(text)
        }
        # 3. Run ModernBERT models
        modernbert_results = []
        for i in range(len(self.modernbert_models)):
            result = self.classify_with_modernbert(text, i)
            if result:
                results["individual_models"].append(result)
                modernbert_results.append(result)
        # 4. Run additional models
        for model_name in self.additional_models:
            result = self.classify_with_additional(text, model_name)
            if result:
                results["individual_models"].append(result)
        # 5. Calculate ensemble result (weighted average)
        if results["individual_models"]:
            total_ai = 0
            total_human = 0
            weights_sum = 0
            for i, result in enumerate(results["individual_models"]):
                # ModernBERT results come first in the list and weigh more.
                weight = 1.5 if i < len(modernbert_results) else 1.0
                total_ai += result["ai_score"] * weight
                total_human += result["human_score"] * weight
                weights_sum += weight
            if weights_sum > 0:
                ensemble_ai = total_ai / weights_sum
                ensemble_human = total_human / weights_sum
            else:
                ensemble_ai = ensemble_human = 50
            # Adjust based on metrics
            # High perplexity suggests human text
            if results["metrics"]["perplexity"] > 100:
                ensemble_human += 5
                ensemble_ai -= 5
            elif results["metrics"]["perplexity"] < 30:
                ensemble_ai += 5
                ensemble_human -= 5
            # High burstiness suggests human text
            if results["metrics"]["burstiness"] > 0.8:
                ensemble_human += 5
                ensemble_ai -= 5
            elif results["metrics"]["burstiness"] < 0.3:
                ensemble_ai += 5
                ensemble_human -= 5
            # Pattern analysis adjustment
            pattern_adjustment = (results["pattern_analysis"]["ai_patterns_found"] -
                                  results["pattern_analysis"]["human_patterns_found"]) * 3
            ensemble_ai += pattern_adjustment
            ensemble_human -= pattern_adjustment
            # The adjustments can push a side below zero; clamp first so the
            # normalised shares (and the confidence) stay within 0..100.
            ensemble_ai = max(ensemble_ai, 0)
            ensemble_human = max(ensemble_human, 0)
            # Normalize to 100%
            total = ensemble_ai + ensemble_human
            if total > 0:
                ensemble_ai = (ensemble_ai / total) * 100
                ensemble_human = (ensemble_human / total) * 100
            else:
                ensemble_ai = ensemble_human = 50
            # Determine most likely AI model
            if ensemble_ai > ensemble_human and modernbert_results:
                predicted_model = modernbert_results[0]["predicted_model"]
            else:
                predicted_model = "Human"
            results["ensemble_result"] = {
                "ai_percentage": round(min(max(ensemble_ai, 0), 100), 2),
                "human_percentage": round(min(max(ensemble_human, 0), 100), 2),
                "predicted_model": predicted_model,
                "confidence": round(max(ensemble_ai, ensemble_human), 2),
                "is_human": ensemble_human > ensemble_ai,
                "models_used": len(results["individual_models"])
            }
        return results
| # ===================================================== | |
| # 🧹 دوال التنظيف والمعالجة | |
| # ===================================================== | |
def clean_text(text: str) -> str:
    """Tidy whitespace: collapse runs and remove gaps before punctuation.

    Runs of two or more whitespace characters become one space, whitespace
    directly before ``, . ; : ? !`` is removed, and both ends are stripped.
    """
    whitespace_run = re.compile(r'\s{2,}')
    gap_before_punctuation = re.compile(r'\s+([,.;:?!])')

    collapsed = whitespace_run.sub(' ', text)
    tightened = gap_before_punctuation.sub(r'\1', collapsed)
    return tightened.strip()
def split_into_paragraphs(text: str) -> List[str]:
    """Split ``text`` on blank lines and return the non-empty, stripped parts."""
    paragraphs = []
    for chunk in re.split(r'\n\s*\n', text.strip()):
        trimmed = chunk.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
| # ===================================================== | |
| # 🌐 FastAPI Application | |
| # ===================================================== | |
app = FastAPI(
    title="Enhanced ModernBERT AI Detector",
    description="Advanced AI detection with multiple models, perplexity, and burstiness analysis",
    version="3.0.0"
)

# CORS: the API is open to every origin, method and header. Credentials are
# disabled because the FastAPI CORS documentation states that wildcard
# origins/methods/headers cannot be combined with allow_credentials=True;
# nothing in this file uses cookies or authentication headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single shared model manager used by every endpoint.
model_manager = EnhancedModelManager()
| # ===================================================== | |
| # 📝 نماذج البيانات (Pydantic Models) | |
| # ===================================================== | |
# Request body for the full analysis endpoint.
class TextInput(BaseModel):
    # Text to analyse.
    text: str
    # When true, up to the first 10 paragraphs are also scored one by one.
    analyze_paragraphs: Optional[bool] = False
    # When true, each model's own scores are included in the response.
    return_individual_scores: Optional[bool] = True
# Request body for the simple analysis endpoint: just the text to score.
class SimpleTextInput(BaseModel):
    text: str
# Response envelope of the full analysis endpoint.
class EnhancedDetectionResult(BaseModel):
    # False when validation, model loading or the analysis itself failed.
    success: bool
    # HTTP-style status code carried in the body (200, 400, 500, 503).
    code: int
    # Human-readable outcome description.
    message: str
    # Analysis payload; an empty dict on failure.
    data: Dict
| # ===================================================== | |
| # 🎯 API Endpoints | |
| # ===================================================== | |
@app.on_event("startup")
async def startup_event():
    """Load the models when the application starts.

    Registered as a startup hook; without the registration the function is
    never called. Reads two environment variables:

    * ``MAX_MODERNBERT_MODELS`` (default ``"2"``) -- cap on ModernBERT models.
    * ``LOAD_ADDITIONAL_MODELS`` (default ``"true"``) -- load extra detectors.
    """
    logger.info("=" * 50)
    logger.info("🚀 Starting Enhanced ModernBERT AI Detector...")
    logger.info(f"🐍 Python version: {sys.version}")
    logger.info(f"🔥 PyTorch version: {torch.__version__}")
    logger.info("=" * 50)
    # Load models
    max_modernbert = int(os.environ.get("MAX_MODERNBERT_MODELS", "2"))
    load_additional = os.environ.get("LOAD_ADDITIONAL_MODELS", "true").lower() == "true"
    success = model_manager.load_all_models(
        max_modernbert=max_modernbert,
        load_additional=load_additional
    )
    if success:
        logger.info("✅ Application ready with enhanced features!")
    else:
        # The endpoints retry loading on demand, so startup is not aborted.
        logger.error("⚠️ Failed to load models - API will return errors")
@app.get("/")
async def root():
    """Home page: report service status, loaded models and endpoints."""
    models_info = {
        "modernbert_models": len(model_manager.modernbert_models),
        "additional_models": list(model_manager.additional_models.keys())
    }
    return {
        "message": "Enhanced ModernBERT AI Text Detector API",
        "status": "online" if model_manager.models_loaded else "initializing",
        "models": models_info,
        "device": str(device),
        "features": [
            "Multiple AI detection models",
            "Perplexity analysis",
            "Burstiness analysis",
            "Pattern detection",
            "Individual model scores",
            "Ensemble predictions"
        ],
        "endpoints": {
            "analyze": "/analyze",
            "simple": "/analyze-simple",
            "health": "/health",
            "docs": "/docs"
        }
    }
@app.get("/health")
async def health_check():
    """Health check: model counts, device and, on CUDA, GPU memory usage.

    ``status`` is "healthy" once the model manager reports its models as
    loaded and "unhealthy" otherwise. ``memory_info`` is empty without CUDA.
    """
    memory_info = {}
    if torch.cuda.is_available():
        memory_info = {
            "gpu_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
            "gpu_reserved_gb": round(torch.cuda.memory_reserved() / 1024**3, 2)
        }
    return {
        "status": "healthy" if model_manager.models_loaded else "unhealthy",
        "modernbert_models": len(model_manager.modernbert_models),
        "additional_models": len(model_manager.additional_models),
        "total_models": len(model_manager.modernbert_models) + len(model_manager.additional_models),
        "device": str(device),
        "cuda_available": torch.cuda.is_available(),
        "memory_info": memory_info
    }
@app.post("/analyze", response_model=EnhancedDetectionResult)
async def analyze_text_enhanced(data: TextInput):
    """Enhanced analysis with multiple models and metrics.

    Failures are reported inside the response envelope (``success=False``
    with code 400, 503 or 500) rather than raised.

    Args:
        data: Request body. ``analyze_paragraphs`` adds per-paragraph scores
            for at most the first 10 paragraphs; ``return_individual_scores``
            adds each model's own result.

    Returns:
        An ``EnhancedDetectionResult`` whose ``data`` holds the ensemble
        verdict, metrics, pattern counts and a preview of the input text.
    """
    try:
        # Validate input
        text = data.text.strip()
        if not text:
            return EnhancedDetectionResult(
                success=False,
                code=400,
                message="Empty input text",
                data={}
            )
        # Ensure models are loaded (retry if startup loading failed)
        if not model_manager.models_loaded:
            if not model_manager.load_all_models():
                return EnhancedDetectionResult(
                    success=False,
                    code=503,
                    message="Models not available",
                    data={}
                )
        # Comprehensive analysis
        analysis_result = model_manager.comprehensive_analysis(text)
        # Basic stats
        total_words = len(text.split())
        ai_percentage = analysis_result["ensemble_result"]["ai_percentage"]
        human_percentage = analysis_result["ensemble_result"]["human_percentage"]
        ai_words = int(total_words * (ai_percentage / 100))
        # Paragraph analysis if requested
        paragraphs_analysis = []
        if data.analyze_paragraphs:
            paragraphs = split_into_paragraphs(text)
            for para in paragraphs[:10]:
                if para.strip():
                    try:
                        para_result = model_manager.comprehensive_analysis(para)
                        paragraphs_analysis.append({
                            "paragraph": para[:200] + "..." if len(para) > 200 else para,
                            "ai_generated_score": para_result["ensemble_result"]["ai_percentage"] / 100,
                            "human_written_score": para_result["ensemble_result"]["human_percentage"] / 100,
                            "predicted_model": para_result["ensemble_result"]["predicted_model"],
                            "metrics": {
                                "perplexity": para_result["metrics"]["perplexity"],
                                "burstiness": para_result["metrics"]["burstiness"]
                            }
                        })
                    except Exception as e:
                        # One bad paragraph must not fail the whole request.
                        logger.warning(f"Failed to analyze paragraph: {e}")
        # Prepare response
        response_data = {
            "fakePercentage": ai_percentage,
            "isHuman": human_percentage,
            "textWords": total_words,
            "aiWords": ai_words,
            "predicted_model": analysis_result["ensemble_result"]["predicted_model"],
            "feedback": "Most of Your Text is AI/GPT Generated" if ai_percentage > 50 else "Most of Your Text Appears Human-Written",
            "confidence": analysis_result["ensemble_result"]["confidence"],
            "models_used": analysis_result["ensemble_result"]["models_used"],
            # Metrics
            "metrics": analysis_result["metrics"],
            # Pattern analysis
            "pattern_analysis": analysis_result["pattern_analysis"],
            # Paragraphs if requested
            "paragraphs": paragraphs_analysis,
            # Text preview
            "input_text": text[:500] + "..." if len(text) > 500 else text,
            "detected_language": "en"
        }
        # Add individual model scores if requested
        if data.return_individual_scores:
            response_data["individual_models"] = analysis_result["individual_models"]
        return EnhancedDetectionResult(
            success=True,
            code=200,
            message="Enhanced analysis completed",
            data=response_data
        )
    except Exception as e:
        logger.error(f"Analysis error: {e}", exc_info=True)
        return EnhancedDetectionResult(
            success=False,
            code=500,
            message=f"Analysis failed: {str(e)}",
            data={}
        )
@app.post("/analyze-simple")
async def analyze_simple(data: SimpleTextInput):
    """Simple analysis - returns basic results only.

    Args:
        data: Request body carrying the text to score.

    Returns:
        Dict with ``is_ai``, ``ai_score``, ``human_score``,
        ``detected_model``, ``confidence``, ``perplexity`` and
        ``burstiness``.

    Raises:
        HTTPException: 400 for empty text, 503 when no model can be loaded,
            500 for any other failure.
    """
    try:
        text = data.text.strip()
        if not text:
            raise HTTPException(status_code=400, detail="Empty text")
        if not model_manager.models_loaded:
            if not model_manager.load_all_models():
                raise HTTPException(status_code=503, detail="Models not available")
        result = model_manager.comprehensive_analysis(text)
        ensemble = result["ensemble_result"]
        return {
            "is_ai": ensemble["ai_percentage"] > 50,
            "ai_score": ensemble["ai_percentage"],
            "human_score": ensemble["human_percentage"],
            "detected_model": ensemble["predicted_model"],
            "confidence": ensemble["confidence"],
            "perplexity": result["metrics"]["perplexity"],
            "burstiness": result["metrics"]["burstiness"]
        }
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Simple analysis error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
| # ===================================================== | |
| # 🏃 تشغيل التطبيق | |
| # ===================================================== | |
if __name__ == "__main__":
    import uvicorn

    # Server settings come from the environment, with development defaults.
    env = os.environ
    port = int(env.get("PORT", 8000))
    host = env.get("HOST", "0.0.0.0")
    workers = int(env.get("WORKERS", 1))

    banner = "=" * 50
    for line in (
        banner,
        f"🌐 Starting enhanced server on {host}:{port}",
        f"👷 Workers: {workers}",
        f"📚 Documentation: http://{host}:{port}/docs",
        banner,
    ):
        logger.info(line)

    # The app is passed as an import string ("module:attribute").
    uvicorn.run(
        "app_enhanced:app",
        host=host,
        port=port,
        workers=workers,
        reload=False,
        log_level="info",
    )