Spaces:
Sleeping
Sleeping
"""
Text Classifier (مصنف النصوص)
=============================

Classifies text into the categories: code, scientific, literary, technical,
religious and general.
Supports keyword-based classification (no model download required) or
classification with a HuggingFace model.
"""
| import logging | |
| import re | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
class TextClassifier:
    """Classify Arabic and English text into coarse content categories.

    Supported categories:
        - code: source code
        - scientific: scientific and academic text
        - literary: literature and poetry
        - technical: technical / technology text
        - religious: religious text
        - general: anything else

    Two strategies are available. Keyword matching always works and needs no
    download. When ``model_name`` is given and the ``transformers`` package
    can load it, a zero-shot classification pipeline is used instead, with
    keyword matching as the fallback if the model call fails.

    Attributes:
        model_name: HuggingFace model name, or ``None`` for keyword-only mode.
        device: Device handed to the pipeline ('cpu' or 'cuda').
    """

    # Keywords per category (English and Arabic). Matching is
    # case-insensitive; see ``_build_pattern`` for the boundary rule.
    _CATEGORY_KEYWORDS: dict[str, list[str]] = {
        "code": [
            "def ", "class ", "import ", "from ", "return ", "function",
            "print(", "self.", "if __name__", "async ", "await ",
            "var ", "let ", "const ", "console.log", "=> {", "module.exports",
            "#include", "int main(", "public class", "void ",
            "try:", "except ", "finally:", "raise ", "lambda ",
            "SELECT ", "FROM ", "WHERE ", "INSERT INTO", "CREATE TABLE",
            "<html", "<div", "<!DOCTYPE", "<script", "</body>",
            "git ", "npm ", "pip install", "docker", "kubectl",
            "TODO:", "FIXME:", "HACK:", "NOTE:", "XXX:",
        ],
        "scientific": [
            "hypothesis", "experiment", "analysis", "methodology",
            "correlation", "regression", "statistical", "significant",
            "peer-reviewed", "journal", "research", "empirical",
            "abstract:", "introduction:", "methodology:", "results:",
            "discussion:", "conclusion:", "references:", "citation",
            "p-value", "standard deviation", "confidence interval",
            "الفرضية", "التحليل", "المنهجية", "التجربة", "البحث",
            "النتائج", "الاستنتاج", "الإحصائي", "العينة",
        ],
        "literary": [
            "poem", "poetry", "verse", "stanza", "metaphor", "simile",
            "narrative", "prose", "novel", "chapter", "once upon",
            "allegory", "rhythm", "rhyme", "sonnet", "haiku",
            "قال الشاعر", "قصيدة", "شعر", "بيت", "مقطع",
            "كان يا ما كان", "في قديم الزمان", "رواية", "قصة",
            "وغنى", "وأمسى", "فجر", "غروب", "تبسمت",
        ],
        "technical": [
            "algorithm", "architecture", "framework", "protocol",
            "specification", "implementation", "optimization",
            "scalability", "performance", "throughput", "latency",
            "API", "REST", "HTTP", "JSON", "XML", "endpoint",
            "database", "server", "client", "deployment",
            "الخوارزمية", "البنية", "الإطار", "الأداء", "الخادم",
            "قاعدة البيانات", "التطبيق", "البروتوكول", "الشبكة",
        ],
        "religious": [
            "Quran", "Koran", "Bible", "Torah", "hadith", "sunnah",
            "prophet", "revelation", "prayer", "mosque", "church",
            "scripture", "verse", "chapter", "surah", "ayah",
            "القرآن", "الكريم", "الحديث", "النبوي", "الصلاة",
            "مسجد", "سورة", "آية", "الفقه", "التوحيد", "الشريعة",
            "رضي الله عنه", "صلى الله عليه وسلم", "بسم الله",
            "الحمد لله", "سبحان الله", "الله أكبر",
        ],
    }

    # Python keywords/builtins meant to be protected from misclassification.
    # NOTE(review): nothing inside this class reads this set — confirm whether
    # another module uses it before removing it.
    _PYTHON_KEYWORDS: set[str] = {
        "print", "float", "int", "str", "bool", "list", "dict",
        "def", "class", "import", "from", "return", "yield",
        "if", "else", "elif", "for", "while", "with", "as",
        "try", "except", "finally", "raise", "assert",
        "lambda", "pass", "break", "continue", "global",
        "nonlocal", "async", "await", "True", "False", "None",
        "and", "or", "not", "in", "is", "del",
        "self", "cls", "super", "property", "staticmethod",
        "range", "len", "type", "isinstance", "enumerate",
        "zip", "map", "filter", "sorted", "reversed",
    }

    # Zero-shot candidate labels mapped to the internal category names.
    _LABEL_TO_CATEGORY: dict[str, str] = {
        "programming code": "code",
        "scientific research": "scientific",
        "literature poetry": "literary",
        "technical documentation": "technical",
        "religious text": "religious",
        "general writing": "general",
    }

    # Confidence reported when no keyword matches at all.
    _NO_MATCH_CONFIDENCE = 0.3
    # A normalised top score below this is reported as "general".
    _LOW_CONFIDENCE_THRESHOLD = 0.15
    # Paragraphs must be strictly longer than this to count as a section.
    _MIN_SECTION_CHARS = 20

    def __init__(
        self,
        model_name: Optional[str] = None,
        device: str = "cpu",
    ) -> None:
        """Set up the classifier.

        Args:
            model_name: Optional HuggingFace model name, for example
                "facebook/bart-large-mnli". When given, loading is attempted
                immediately (not lazily); a failed load leaves the classifier
                in keyword-only mode.
            device: Device passed to the pipeline ('cpu' or 'cuda').
        """
        self.model_name = model_name
        self.device = device
        self._pipeline = None
        self._model_available = False

        # Compiled patterns per category, plus a reverse lookup so results
        # can report the original keyword rather than the regex source.
        self._category_patterns: dict[str, list[re.Pattern]] = {}
        self._pattern_keywords: dict[re.Pattern, str] = {}
        self._compile_patterns()

        if model_name:
            self._try_load_model()

    @staticmethod
    def _build_pattern(keyword: str) -> re.Pattern:
        """Compile one keyword into a case-insensitive pattern.

        Keywords that start with an ASCII letter or digit get a leading word
        boundary, so "API" no longer matches inside "capital" and "void " no
        longer matches inside "avoid ". The end of the keyword is left open so
        that plurals and derived forms ("experiments") still match. Keywords
        starting with any other character (Arabic words, "#include", "<div")
        keep plain substring matching; for Arabic this preserves matches when
        a prefix such as "ال" or "و" is attached to the word.
        """
        source = re.escape(keyword)
        first_char = keyword[:1]
        if first_char.isascii() and first_char.isalnum():
            source = r"(?<!\w)" + source
        return re.compile(source, re.IGNORECASE)

    def _compile_patterns(self) -> None:
        """Compile ``_CATEGORY_KEYWORDS`` into per-category regex patterns."""
        for category, keywords in self._CATEGORY_KEYWORDS.items():
            compiled: list[re.Pattern] = []
            seen: set[str] = set()
            for keyword in keywords:
                # Matching ignores case, so "from " and "FROM " would count
                # every occurrence twice; keep only the first spelling.
                folded = keyword.casefold()
                if folded in seen:
                    continue
                seen.add(folded)
                try:
                    pattern = self._build_pattern(keyword)
                except re.error:
                    logger.debug("نمط غير صالح: %s", keyword)
                    continue
                compiled.append(pattern)
                self._pattern_keywords.setdefault(pattern, keyword)
            self._category_patterns[category] = compiled

    def _try_load_model(self) -> None:
        """Try to load the HuggingFace zero-shot classification pipeline.

        Best effort: a missing ``transformers`` package or any load failure
        is logged as a warning and the classifier stays in keyword-only mode.
        """
        try:
            from transformers import pipeline  # type: ignore

            logger.info("جاري تحميل نموذج التصنيف: %s ...", self.model_name)
            self._pipeline = pipeline(
                "zero-shot-classification",
                model=self.model_name,
                device=self.device,
            )
            self._model_available = True
            logger.info("تم تحميل نموذج التصنيف بنجاح")
        except ImportError:
            logger.warning(
                "مكتبة transformers غير مثبتة. سيتم الاعتماد على التصنيف بالكلمات المفتاحية. "
                "pip install transformers torch"
            )
        except Exception as e:
            logger.warning("فشل تحميل النموذج '%s': %s", self.model_name, e)

    # ------------------------------------------------------------------
    # Keyword classification (always available)
    # ------------------------------------------------------------------
    def _keyword_classify(self, text: str) -> dict:
        """Classify ``text`` by counting keyword occurrences per category.

        Args:
            text: Text to classify.

        Returns:
            Dict with ``category``, ``confidence``, ``keywords_found``
            (category -> matched keywords), ``scores`` (category -> share of
            all keyword hits, rounded to 4 places) and ``method`` ("keyword").
        """
        hit_counts: dict[str, float] = {}
        found_keywords: dict[str, list[str]] = {}

        for category, patterns in self._category_patterns.items():
            total_hits = 0
            matched: list[str] = []
            for pattern in patterns:
                hits = len(pattern.findall(text))
                if hits:
                    total_hits += hits
                    # Report the keyword itself, not the escaped regex text.
                    matched.append(
                        self._pattern_keywords.get(pattern, pattern.pattern)
                    )
            if total_hits:
                hit_counts[category] = float(total_hits)
                found_keywords[category] = matched

        if not hit_counts:
            return {
                "category": "general",
                "confidence": self._NO_MATCH_CONFIDENCE,
                "keywords_found": {},
                "scores": {},
                "method": "keyword",
            }

        # Normalise so the scores of the matched categories sum to ~1.
        grand_total = sum(hit_counts.values())
        normalized = {
            name: round(count / grand_total, 4)
            for name, count in hit_counts.items()
        }

        # Ties resolve to the category listed first in _CATEGORY_KEYWORDS.
        top_category = max(normalized, key=normalized.get)  # type: ignore
        top_confidence = normalized[top_category]

        # A weak winner is reported as "general". With the five keyword
        # categories defined above the top share is always >= 0.2, so this
        # only takes effect if more categories are added.
        if top_confidence < self._LOW_CONFIDENCE_THRESHOLD:
            top_category = "general"

        return {
            "category": top_category,
            "confidence": round(top_confidence, 4),
            "keywords_found": found_keywords,
            "scores": normalized,
            "method": "keyword",
        }

    # ------------------------------------------------------------------
    # Model classification (when a model is loaded)
    # ------------------------------------------------------------------
    def _model_classify(self, text: str) -> dict:
        """Classify ``text`` with the loaded zero-shot pipeline.

        Any failure of the model call, or a result without labels, falls back
        to ``_keyword_classify``.

        Args:
            text: Text to classify.

        Returns:
            Classification dict shaped like the keyword result, with
            ``method`` set to "model" and an empty ``keywords_found``.
        """
        label_map = self._LABEL_TO_CATEGORY
        try:
            result = self._pipeline(text, candidate_labels=list(label_map))
            labels = result.get("labels", [])
            scores = result.get("scores", [])
            if not labels:
                return self._keyword_classify(text)

            # The first label/score pair is taken as the winner.
            top_label = labels[0]
            top_score = scores[0]

            # Translate pipeline labels into internal category names.
            mapped_scores: dict[str, float] = {}
            for label, score in zip(labels, scores):
                category = label_map.get(label, "general")
                mapped_scores[category] = max(
                    mapped_scores.get(category, 0.0), score
                )

            return {
                "category": label_map.get(top_label, "general"),
                "confidence": round(top_score, 4),
                "keywords_found": {},
                "scores": {k: round(v, 4) for k, v in mapped_scores.items()},
                "method": "model",
            }
        except Exception as e:
            logger.warning("فشل التصنيف بالنموذج: %s — يتم الرجوع للكلمات المفتاحية", e)
            return self._keyword_classify(text)

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------
    def classify(self, text: str) -> dict:
        """Classify a piece of text into one category.

        Args:
            text: Text to classify.

        Returns:
            Dict containing:
                - category (str): code/scientific/literary/technical/
                  religious/general
                - confidence (float): confidence in [0, 1]
                - keywords_found (dict): keywords matched per category
                - scores (dict): score per category
                - method (str): "keyword", "model", or "none" for empty input
        """
        if not text or not text.strip():
            return {
                "category": "general",
                "confidence": 0.0,
                "keywords_found": {},
                "scores": {},
                "method": "none",
            }

        cleaned = text.strip()

        # Prefer the model when it loaded successfully.
        if self._model_available and self._pipeline is not None:
            return self._model_classify(cleaned)
        return self._keyword_classify(cleaned)

    def classify_document(self, document_text: str) -> dict:
        """Classify a whole document section by section.

        The document is split on blank lines, every paragraph longer than
        ``_MIN_SECTION_CHARS`` characters is classified on its own, and the
        category with the largest summed confidence wins. When no paragraph
        is long enough, the whole document is treated as a single section so
        the result always has the same shape.

        Args:
            document_text: Full document text.

        Returns:
            Dict containing:
                - category: dominant category
                - confidence: the dominant category's share of the summed
                  confidence
                - section_results: per-section ``classify`` results, each
                  with an added ``section_index``
                - dominant_sections: summed confidence per category, largest
                  first
                - total_sections: number of sections classified
                - method: "document" ("none" for an empty document)
        """
        if not document_text or not document_text.strip():
            return {
                "category": "general",
                "confidence": 0.0,
                "section_results": [],
                "dominant_sections": {},
                "total_sections": 0,
                "method": "none",
            }

        stripped_document = document_text.strip()
        paragraphs = (
            part.strip() for part in re.split(r"\n\s*\n", stripped_document)
        )
        sections = [p for p in paragraphs if len(p) > self._MIN_SECTION_CHARS]
        if not sections:
            # Short document: classify it as one section rather than
            # returning a differently shaped result.
            sections = [stripped_document]

        section_results: list[dict] = []
        category_confidence: dict[str, float] = {}
        for index, section in enumerate(sections):
            result = self.classify(section)
            result["section_index"] = index
            section_results.append(result)

            category = result["category"]
            confidence = result.get("confidence", 0.0)
            category_confidence[category] = (
                category_confidence.get(category, 0.0) + confidence
            )

        # ``sections`` is never empty here, so there is always a winner.
        dominant = max(category_confidence, key=category_confidence.get)  # type: ignore
        total_confidence = sum(category_confidence.values())
        # The denominator is floored at 1 so a document whose sections carry
        # little total confidence is not inflated to a confidence of 1.0.
        overall_confidence = round(
            category_confidence[dominant] / max(total_confidence, 1), 4
        )

        ranked_categories = dict(
            sorted(
                category_confidence.items(),
                key=lambda item: item[1],
                reverse=True,
            )
        )

        return {
            "category": dominant,
            "confidence": overall_confidence,
            "section_results": section_results,
            "dominant_sections": ranked_categories,
            "total_sections": len(sections),
            "method": "document",
        }

    def get_categories(self) -> list[str]:
        """Return the supported category names, ending with "general"."""
        return list(self._CATEGORY_KEYWORDS.keys()) + ["general"]