Spaces:
Running
Running
| """ | |
| مصنف المحتوى الطبي والعلمي (Medical & Scientific Content Classifier) | |
| ===================================================================== | |
| نظام تصنيف احتمالي متخصص في المحتوى الطبي والعلمي. | |
| التصنيفات المدعومة: | |
| - orthopedic: جراحة العظام والمفاصل | |
| - cardiology: أمراض القلب والأوعية الدموية | |
| - neurology: الأمراض العصبية | |
| - general_surgery: الجراحة العامة | |
| - radiology: الأشعة والتصوير الطبي | |
| - pathology: علم الأمراض | |
| - pharmacology: علم الأدوية | |
| - research: أبحاث علمية | |
| - medical_admin: إدارة طبية وتقارير | |
| - engineering: هندسة وتقنية | |
| - general: عام (غير مصنف) | |
| الاستخدام: | |
| from modules.core.classifier import MedicalClassifier | |
| clf = MedicalClassifier() | |
| result = clf.classify("المريض يعاني من كسر في عظم الفخذ") | |
| """ | |
| import json | |
| import logging | |
| import os | |
| import re | |
| from typing import Optional, Dict, Any, List | |
| logger = logging.getLogger(__name__) | |
| class MedicalClassifier: | |
| """ | |
| مصنف المحتوى الطبي والعلمي — يعتمد على الكلمات المفتاحية مع نظام أوزان احتمالية. | |
| كل تصنيف له كلمات مفتاحية بأوزان مختلفة: | |
| - weight=3: كلمات حاسمة (تحدد التصنيف بقوة) | |
| - weight=2: كلمات مهمة (تدعم التصنيف) | |
| - weight=1: كلمات مساعدة (تزيد الثقة) | |
| """ | |
| # التصنيفات الافتراضية مع الكلمات المفتاحية وأوزانها | |
| _DEFAULT_CATEGORIES: Dict[str, Dict[str, List[str]]] = { | |
| "orthopedic": { | |
| "critical": [ | |
| "كسر", " fracture", "عظم", " bone", "مفصل", " joint", | |
| "عمود فقري", " spine", "فقرات", " vertebrae", | |
| "الركبة", " knee", "الحوض", " pelvis", | |
| "الكتف", " shoulder", "الكاحل", " ankle", | |
| "الرسغ", " wrist", "المرفق", " elbow", | |
| "عظم العضد", " humerus", "عظم الساعد", " forearm", | |
| "الساق", " leg", "الفخذ", " femur", | |
| "الظنبوب", " tibia", "الشظية", " fibula", | |
| "الترقوة", " clavicle", "ضلع", " rib", | |
| "تثبيت", " fixation", "مسمار", " screw", | |
| "صفيحة", " plate", "سلك", " wire", | |
| "مسامير", " pins", "دعامة", " implant", | |
| "بدلة مفصل", " prosthesis", "مفصل صناعي", | |
| "خشونة", " osteoarthritis", "التهاب المفاصل", " arthritis", | |
| "انزلاق غضروفي", " disc herniation", "ديسك", | |
| "التهاب", " tendonitis", "وترة", " tendon", | |
| "رباط", " ligament", "غضروف", " cartilage", | |
| "تمزق", " tear", "تمزق الرباط الصليبي", " ACL", | |
| "إصابة رياضية", " sports injury", | |
| "استئصال", " excision", "تنظير", " arthroscopy", | |
| "تقويم", " orthopedic", "جراحة عظام", " orthopaedic", | |
| "شبكة عظمية", " bone graft", "زراعة عظم", | |
| "تقويم عظام", " osteotomy", "ربط", " fusion", | |
| "استبدال المفصل", " joint replacement", "arthroplasty", | |
| "التهاب العظم والنقي", " osteomyelitis", | |
| "التهاب المفاصل الروماتويدي", " rheumatoid", | |
| "النقرس", " gout", "هشاشة العظام", " osteoporosis", | |
| "انزلاق", " spondylolisthesis", "جنف", " scoliosis", | |
| "تحدب", " kyphosis", "قدم مسطحة", " flat foot", | |
| "التهاب الكيس", " bursitis", "التهاب الجراب", | |
| ], | |
| "important": [ | |
| "مريض", " patient", "عملية", " surgery", "جراحة", | |
| "تشخيص", " diagnosis", "علاج", " treatment", | |
| "أشعة سينية", " x-ray", "رنين مغناطيسي", " MRI", | |
| "مقطعية", " CT scan", "تصوير", " imaging", | |
| "تخدير", " anesthesia", "مضاد حيوي", " antibiotic", | |
| "مستشفى", " hospital", "عيادة", " clinic", | |
| "جبس", " cast", "حزام", " brace", "رباط طبي", | |
| "تأهيل", " rehabilitation", "فيزيوترابي", " physiotherapy", | |
| "علاج طبيعي", " physical therapy", | |
| ], | |
| "supporting": [ | |
| "دراسة", " study", "بحث", " research", "تحليل", | |
| "متابعة", " follow-up", "مراجعة", " review", | |
| "حالة", " case", "تقرير", " report", | |
| "توصيات", " recommendations", "خطة علاجية", | |
| ], | |
| }, | |
| "cardiology": { | |
| "critical": [ | |
| "القلب", " heart", "شريان", " artery", "وريد", " vein", | |
| "أزمة قلبية", " myocardial infarction", "سكتة قلبية", | |
| "ذبحة", " angina", "قصور قلبي", " heart failure", | |
| "صمام", " valve", "رجفان أذيني", " atrial fibrillation", | |
| "تصلب شرايين", " atherosclerosis", "جلطة", " clot", | |
| "خثرة", " thrombus", "انسداد", " occlusion", | |
| "ضغط الدم", " blood pressure", "الكولسترول", " cholesterol", | |
| "ترقق الشرايين", "aneurysm", "قسطرة", " catheter", | |
| "دعامة قلبية", " stent", "مجازة", " bypass", | |
| "نظم قلبي", " pacemaker", "صدمة قلبية", " cardiac shock", | |
| "التهاب التامور", " pericarditis", "التهاب عضلة القلب", " myocarditis", | |
| ], | |
| "important": [ | |
| "تخطيط قلب", " ECG", "إيكو قلب", " echocardiography", | |
| "أشعة قلب", " coronary angiography", | |
| "مريض", " patient", "عملية", " surgery", | |
| "علاج", " treatment", "تشخيص", " diagnosis", | |
| ], | |
| "supporting": [ | |
| "متابعة", " follow-up", "تقرير", " report", | |
| "خطر", " risk", "مضاعفات", " complications", | |
| ], | |
| }, | |
| "neurology": { | |
| "critical": [ | |
| "الجهاز العصبي", " nervous system", "الدماغ", " brain", | |
| "الحبل الشوكي", " spinal cord", "عصب", " nerve", | |
| "صرع", " epilepsy", "تصلب متعدد", " multiple sclerosis", | |
| "باركنسون", " Parkinson", "زهايمر", " Alzheimer", | |
| "سكتة دماغية", " stroke", "شلل", " paralysis", | |
| "ألم عصبي", " neuropathy", "صداع نصفي", " migraine", | |
| "ورم دماغي", " brain tumor", "التهاب السحايا", " meningitis", | |
| "اعتلال الأعصاب", " neuropathy", "ضمور عضلي", " muscular dystrophy", | |
| ], | |
| "important": [ | |
| "تشخيص عصبي", " neurological diagnosis", | |
| "رنين مغناطيسي دماغي", " brain MRI", | |
| "تخطيط كهربائي", " EEG", "علاج", " treatment", | |
| ], | |
| "supporting": [ | |
| "متابعة", " follow-up", "تقرير", " report", | |
| "حالة", " case", "بحث", " research", | |
| ], | |
| }, | |
| "general_surgery": { | |
| "critical": [ | |
| "استئصال", " excision", "appendectomy", "استئصال زائدة", | |
| "استئصال مرارة", " cholecystectomy", "فتق", " hernia", | |
| "جراحة", " surgery", "عملية جراحية", " surgical operation", | |
| "شق", " incision", "غلق", " closure", "خياطة", " suture", | |
| "تنظير", " laparoscopy", "منظار البطن", | |
| "استئصال طحال", " splenectomy", | |
| "استئصال غدة", " gland excision", "درن", " thyroid", | |
| "استئصال ثدي", " mastectomy", | |
| ], | |
| "important": [ | |
| "مريض", " patient", "تخدير", " anesthesia", | |
| "مضاد حيوي", " antibiotic", "عناية مركزة", " ICU", | |
| "مستشفى", " hospital", "تعقيم", " sterilization", | |
| ], | |
| "supporting": [ | |
| "تقرير", " report", "متابعة", " follow-up", | |
| "مضاعفات", " complications", "تشخيص", " diagnosis", | |
| ], | |
| }, | |
| "radiology": { | |
| "critical": [ | |
| "أشعة", " radiology", "x-ray", "تصوير طبي", | |
| "رنين مغناطيسي", " MRI", "مقطعية", " CT", | |
| "سونار", " ultrasound", "موجات فوق صوتية", | |
| "أشعة مقطعية", " CT scan", "تصوير وعائي", " angiography", | |
| "تصوير الصدر", " chest imaging", "ماموجرام", " mammogram", | |
| "تصوير نخاعي", " myelography", "بيتيد", " PET scan", | |
| "فلوروسكوبي", " fluoroscopy", "ديكسا", " DEXA", | |
| ], | |
| "important": [ | |
| "صورة شعاعية", " radiograph", "تقرير أشعة", | |
| "ظل", " opacity", "ارتشاح", " infiltration", | |
| "ورم", " tumor", "كتلة", " mass", "آفة", " lesion", | |
| ], | |
| "supporting": [ | |
| "تشخيص", " diagnosis", "مقارنة", " comparison", | |
| "توصيات", " recommendations", "متابعة", " follow-up", | |
| ], | |
| }, | |
| "pathology": { | |
| "critical": [ | |
| "علم الأمراض", " pathology", "فحص نسيجي", " biopsy", | |
| "خزعة", " biopsy", "فحص مجهري", " microscopic", | |
| "خلايا سرطانية", " cancer cells", "ورم خبيث", " malignant", | |
| "ورم حميد", " benign", "سرطان", " cancer", | |
| "نسيج", " tissue", "خلية", " cell", | |
| "درجة ورمية", " tumor grade", "مرحلة", " stage", | |
| "انتشار", " metastasis", "غدة لمفاوية", " lymph node", | |
| "نتائج فحص", " lab results", "تحليل مخبري", " lab analysis", | |
| ], | |
| "important": [ | |
| "تشخيص نهائي", " definitive diagnosis", | |
| "تقرير مرضي", " pathology report", | |
| "ملون هيماتوكسيلين", " H&E stain", | |
| ], | |
| "supporting": [ | |
| "توصيات", " recommendations", "متابعة", " follow-up", | |
| "دراسة", " study", "بحث", " research", | |
| ], | |
| }, | |
| "pharmacology": { | |
| "critical": [ | |
| "دواء", " drug", "medicine", "عقار", " pharmaceutical", | |
| "جرعة", " dose", "dosage", "تركيبة", " formulation", | |
| "تأثير جانبي", " side effect", "تفاعل دوائي", " drug interaction", | |
| "مضاد حيوي", " antibiotic", "مسكن", " analgesic", | |
| "مضاد التهاب", " anti-inflammatory", "كورتيزون", " corticosteroid", | |
| "أدوية القلب", " cardiac drugs", "أدوية الضغط", " antihypertensive", | |
| "سيولة الدم", " anticoagulant", "أدوية السكري", " antidiabetic", | |
| "علاج كيميائي", " chemotherapy", "إشعاعي", " radiotherapy", | |
| "دراسة سريرية", " clinical trial", | |
| ], | |
| "important": [ | |
| "صيدلية", " pharmacy", "وصفة طبية", " prescription", | |
| "موانع استعمال", " contraindication", "تحذير", " warning", | |
| ], | |
| "supporting": [ | |
| "تعليمات", " instructions", "معلومات", " information", | |
| "بحث", " research", "دراسة", " study", | |
| ], | |
| }, | |
| "research": { | |
| "critical": [ | |
| "فرضية", " hypothesis", "منهجية", " methodology", | |
| "عينة", " sample", "متغير", " variable", | |
| "دلالة إحصائية", " statistical significance", | |
| "p-value", "confidence interval", "فترة ثقة", | |
| "انحراف معياري", " standard deviation", | |
| "تحليل انحدار", " regression analysis", | |
| "مجلة علمية", " journal", "نشر", " publication", | |
| "مراجعة الأقران", " peer review", "مستخلص", " abstract", | |
| "مرجع", " reference", "استشهاد", " citation", | |
| ], | |
| "important": [ | |
| "بحث", " research", "study", "تحليل", " analysis", | |
| "نتائج", " results", "استنتاج", " conclusion", | |
| "مناقشة", " discussion", "مقدمة", " introduction", | |
| ], | |
| "supporting": [ | |
| "توصيات", " recommendations", "حدود الدراسة", " limitations", | |
| "عمل مستقبلي", " future work", "شكر وتقدير", " acknowledgments", | |
| ], | |
| }, | |
| "engineering": { | |
| "critical": [ | |
| "خوارزمية", " algorithm", "نظام", " system", | |
| "برمجة", " programming", "شبكة عصبية", " neural network", | |
| "تعلم عميق", " deep learning", "تعلم آلي", " machine learning", | |
| "ذكاء اصطناعي", " artificial intelligence", | |
| "واجهة", " interface", "تطبيق", " application", | |
| "قاعدة بيانات", " database", "سيرفر", " server", | |
| "API", "endpoint", "إطار عمل", " framework", | |
| "نموذج", " model", "تدريب", " training", | |
| ], | |
| "important": [ | |
| "Python", "JavaScript", "React", "Node.js", | |
| "Docker", "Kubernetes", "Git", "Linux", | |
| "تصميم", " design", "بنية", " architecture", | |
| "أداء", " performance", "تحسين", " optimization", | |
| ], | |
| "supporting": [ | |
| "كود", " code", "تطوير", " development", | |
| "اختبار", " testing", "نشر", " deployment", | |
| ], | |
| }, | |
| "medical_admin": { | |
| "critical": [ | |
| "سجل طبي", " medical record", "تقرير طبي", " medical report", | |
| "إذن دخول", " admission", "خروج", " discharge", | |
| "إحالة", " referral", "تحويل", " transfer", | |
| "تأمين طبي", " health insurance", "فاتورة", " bill", | |
| "موعد", " appointment", "عيادة", " clinic", | |
| "كشف", " examination", "فحص سريري", " clinical examination", | |
| "السوابق المرضية", " medical history", "التاريخ المرضي", | |
| ], | |
| "important": [ | |
| "مريض", " patient", "طبيب", " doctor", "physician", | |
| "تمريض", " nursing", "مستشفى", " hospital", | |
| "قسم", " department", "جناح", " ward", | |
| ], | |
| "supporting": [ | |
| "ملاحظات", " notes", "متابعة", " follow-up", | |
| "تعليمات", " instructions", "توقيع", " signature", | |
| ], | |
| }, | |
| } | |
| def __init__(self, lexicon_path: Optional[str] = None): | |
| """ | |
| تهيئة مصنف المحتوى الطبي. | |
| Args: | |
| lexicon_path: مسار ملف المعجم الإضافي (JSON) | |
| """ | |
| self.categories: Dict[str, Dict[str, List[str]]] = {} | |
| # تحميل التصنيفات الافتراضية | |
| for cat, data in self._DEFAULT_CATEGORIES.items(): | |
| self.categories[cat] = { | |
| "critical": list(data.get("critical", [])), | |
| "important": list(data.get("important", [])), | |
| "supporting": list(data.get("supporting", [])), | |
| } | |
| # تحميل المعجم الإضافي إذا وُجد | |
| if lexicon_path and os.path.exists(lexicon_path): | |
| self._load_lexicon(lexicon_path) | |
| # تحميل المعجم الجراحي الافتراضي | |
| default_lexicon = os.path.join( | |
| os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), | |
| "data", "ortho_lexicon.json" | |
| ) | |
| if os.path.exists(default_lexicon): | |
| self._load_lexicon(default_lexicon) | |
| # تجميع أنماط regex لكل فئة | |
| self._patterns: Dict[str, Dict[str, List[re.Pattern]]] = {} | |
| self._compile_patterns() | |
| logger.info("تم تهيئة مصنف المحتوى الطبي (%d تصنيف)", len(self.categories)) | |
| def _load_lexicon(self, path: str): | |
| """تحميل معجم إضافي من ملف JSON.""" | |
| try: | |
| with open(path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| if isinstance(data, dict): | |
| for category, keywords in data.items(): | |
| if category not in self.categories: | |
| self.categories[category] = { | |
| "critical": [], "important": [], "supporting": [] | |
| } | |
| if isinstance(keywords, list): | |
| self.categories[category]["critical"].extend(keywords) | |
| elif isinstance(keywords, dict): | |
| for level, words in keywords.items(): | |
| if level in self.categories[category]: | |
| self.categories[category][level].extend(words) | |
| logger.info("تم تحميل المعجم الإضافي: %s", path) | |
| except Exception as e: | |
| logger.warning("فشل تحميل المعجم %s: %s", path, e) | |
| def _compile_patterns(self): | |
| """تحويل الكلمات المفتاحية إلى أنماط regex.""" | |
| for category, levels in self.categories.items(): | |
| self._patterns[category] = {} | |
| for level, keywords in levels.items(): | |
| patterns = [] | |
| for kw in keywords: | |
| try: | |
| patterns.append(re.compile(re.escape(kw), re.IGNORECASE)) | |
| except re.error: | |
| continue | |
| self._patterns[category][level] = patterns | |
| def classify(self, text: str) -> Dict[str, Any]: | |
| """ | |
| تصنيف النص إلى فئة طبية/علمية. | |
| Args: | |
| text: النص المراد تصنيفه | |
| Returns: | |
| قاموس يحتوي على: | |
| - category: الفئة الأساسية | |
| - confidence: مستوى الثقة (0-1) | |
| - scores: درجات جميع الفئات | |
| - keywords_found: الكلمات المفتاحية المكتشفة | |
| - top_keywords: أهم الكلمات المكتشفة | |
| """ | |
| if not text or not text.strip(): | |
| return { | |
| "category": "general", | |
| "confidence": 0.0, | |
| "scores": {}, | |
| "keywords_found": {}, | |
| "top_keywords": [], | |
| } | |
| text_lower = text.lower() | |
| scores: Dict[str, float] = {} | |
| found_keywords: Dict[str, List[str]] = {} | |
| # الأوزان لكل مستوى أهمية | |
| weights = {"critical": 3.0, "important": 2.0, "supporting": 1.0} | |
| for category, levels in self._patterns.items(): | |
| cat_score = 0.0 | |
| cat_keywords: List[str] = [] | |
| for level, patterns in levels.items(): | |
| for pattern in patterns: | |
| matches = pattern.findall(text_lower) | |
| if matches: | |
| weight = weights.get(level, 1.0) | |
| cat_score += len(matches) * weight | |
| cat_keywords.append(pattern.pattern) | |
| if cat_score > 0: | |
| scores[category] = cat_score | |
| found_keywords[category] = cat_keywords | |
| if not scores: | |
| return { | |
| "category": "general", | |
| "confidence": 0.0, | |
| "scores": {}, | |
| "keywords_found": {}, | |
| "top_keywords": [], | |
| } | |
| # تطبيع الدرجات | |
| max_score = max(scores.values()) | |
| normalized = { | |
| k: round(v / max_score, 4) for k, v in scores.items() | |
| } | |
| # اختيار الفئة الأعلى | |
| top_category = max(normalized, key=normalized.get) | |
| top_confidence = normalized[top_category] | |
| # أعلى 10 كلمات مفتاحية | |
| top_keywords = [] | |
| if top_category in found_keywords: | |
| top_keywords = found_keywords[top_category][:10] | |
| return { | |
| "category": top_category, | |
| "confidence": min(round(top_confidence, 4), 1.0), | |
| "scores": normalized, | |
| "keywords_found": found_keywords, | |
| "top_keywords": top_keywords, | |
| } | |
| def classify_with_fallback( | |
| self, | |
| text: str, | |
| min_confidence: float = 0.15 | |
| ) -> Dict[str, Any]: | |
| """ | |
| تصنيف مع مستوى ثقة أدنى. إذا كان أقل من الحد، يُصنف كـ "general". | |
| Args: | |
| text: النص المراد تصنيفه | |
| min_confidence: الحد الأدنى للثقة | |
| Returns: | |
| نتيجة التصنيف | |
| """ | |
| result = self.classify(text) | |
| if result["confidence"] < min_confidence: | |
| result["category"] = "general" | |
| return result | |
| def get_categories(self) -> List[str]: | |
| """عرض قائمة التصنيفات المتاحة.""" | |
| return list(self.categories.keys()) + ["general"] | |
| def add_category( | |
| self, | |
| category: str, | |
| critical: List[str] = None, | |
| important: List[str] = None, | |
| supporting: List[str] = None | |
| ): | |
| """ | |
| إضافة تصنيف جديد. | |
| Args: | |
| category: اسم التصنيف | |
| critical: كلمات حاسمة (وزن 3) | |
| important: كلمات مهمة (وزن 2) | |
| supporting: كلمات مساعدة (وزن 1) | |
| """ | |
| self.categories[category] = { | |
| "critical": critical or [], | |
| "important": important or [], | |
| "supporting": supporting or [], | |
| } | |
| # إعادة تجميع الأنماط | |
| self._patterns[category] = {} | |
| for level, keywords in self.categories[category].items(): | |
| patterns = [] | |
| for kw in keywords: | |
| try: | |
| patterns.append(re.compile(re.escape(kw), re.IGNORECASE)) | |
| except re.error: | |
| continue | |
| self._patterns[category][level] = patterns | |
| logger.info("تمت إضافة تصنيف جديد: %s", category) | |
| def organize_files( | |
| self, | |
| files: List[Dict[str, str]], | |
| base_folder: str, | |
| move_files: bool = False | |
| ) -> Dict[str, List[str]]: | |
| """ | |
| تنظيم الملفات في مجلدات حسب التصنيف. | |
| Args: | |
| files: قائمة قواميس {path: مسار, text: نص مستخرج} | |
| base_folder: المجلد الأساسي | |
| move_files: نقل الملفات (True) أو نسخها (False) | |
| Returns: | |
| قاموس {تصنيف: [مسارات الملفات]} | |
| """ | |
| import shutil | |
| organized: Dict[str, List[str]] = {} | |
| for file_info in files: | |
| filepath = file_info.get("path", "") | |
| text = file_info.get("text", "") | |
| if not filepath or not text: | |
| continue | |
| result = self.classify(text) | |
| category = result["category"] | |
| target_dir = os.path.join(base_folder, category) | |
| os.makedirs(target_dir, exist_ok=True) | |
| filename = os.path.basename(filepath) | |
| target_path = os.path.join(target_dir, filename) | |
| try: | |
| if move_files: | |
| shutil.move(filepath, target_path) | |
| else: | |
| shutil.copy2(filepath, target_path) | |
| if category not in organized: | |
| organized[category] = [] | |
| organized[category].append(target_path) | |
| except Exception as e: | |
| logger.error("خطأ في تنظيم %s: %s", filename, e) | |
| return organized | |