""" حامي الأكواد البرمجية (Code Protector) ======================================== يحمي ملفات وأجزاء الأكواد البرمجية من التعديل العرضي أثناء المعالجة النصية (مثل فحص الإملاء أو تحويل النصوص). القدرات: - كشف ملفات الأكواد من خلال الامتداد والمحتوى - استخراج أجزاء الأكواد من المستندات المختلطة - لف أجزاء الأكواد بعلامات حماية لمنع تعديلها - كشف لغة البرمجة من الملف أو النص - إزالة علامات الحماية عند الحاجة """ import logging import re from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) class CodeProtector: """ حامي الأكواد البرمجية — يمنع تعديل الأكواد أثناء معالجة النصوص. الاستخدام: protector = CodeProtector() protected = protector.protect_text(mixed_text) # ... معالجة نصية ... restored = protector.strip_protection(protected) """ # ======== علامات الحماية ======== MARKER_START: str = "«CODE_PROTECT_START»" MARKER_END: str = "«CODE_PROTECT_END»" LANG_TAG_START: str = "«LANG:" LANG_TAG_END: str = "»" # ======== الامتدادات المدعومة ======== CODE_EXTENSIONS: set[str] = { ".py", ".pyw", ".pyi", ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx", ".java", ".kt", ".kts", ".scala", ".c", ".h", ".cpp", ".hpp", ".cc", ".cxx", ".cs", ".vb", ".go", ".rs", ".swift", ".rb", ".rake", ".php", ".pl", ".pm", ".r", ".lua", ".vim", ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".dart", ".clj", ".ex", ".exs", ".hs", ".ml", ".lisp", ".m", ".mm", ".proto", ".thrift", ".cmake", ".vue", ".svelte", ".html", ".htm", ".css", ".scss", ".sass", ".less", ".xml", ".xsl", ".xsd", ".json", ".yaml", ".yml", ".toml", ".graphql", ".gql", ".dockerfile", } # ======== كلمات محجوزة حسب اللغة ======== RESERVED_KEYWORDS: dict[str, set[str]] = { "python": { "def", "class", "import", "from", "return", "if", "elif", "else", "for", "while", "try", "except", "finally", "with", "as", "lambda", "yield", "async", "await", "pass", "break", "continue", "raise", "global", "nonlocal", "assert", "del", "in", "not", "and", "or", "is", "None", "True", "False", "self", "print", "range", "type", "super", "property", "staticmethod", "classmethod", "enumerate", "__init__", "__name__", "__main__", }, "javascript": { "function", "const", "let", "var", "return", "if", "else", "for", "while", "do", "switch", "case", "break", "continue", "try", "catch", "finally", "throw", "new", "this", "class", "extends", "super", "import", "export", "default", "from", "async", "await", "yield", "of", "in", "typeof", "instanceof", "null", "undefined", "true", "false", "console", "require", "module", "exports", "document", "window", "function", "arrow", "=>", }, "java": { "public", "private", "protected", "static", "final", "abstract", "class", "interface", "extends", "implements", "import", "package", "return", "if", "else", "for", "while", "do", "switch", "case", "break", "continue", "try", "catch", "finally", "throw", "throws", "new", "this", "super", "void", "int", "long", "double", "float", "boolean", "char", "byte", "short", "String", "null", "true", "false", "System", "Override", }, "cpp": { "include", "define", "ifdef", "ifndef", "endif", "pragma", "class", "struct", "enum", "union", "namespace", "using", "template", "typename", "virtual", "override", "const", "static", "extern", "inline", "explicit", "friend", "public", "private", "protected", "return", "if", "else", "for", "while", "do", "switch", "case", "break", "continue", "try", "catch", "throw", "new", "delete", "this", "auto", "int", "long", "double", "float", "bool", "char", "void", "std", "cout", "cin", "endl", "nullptr", "sizeof", }, "go": { "package", "import", "func", "var", "const", "type", "struct", "interface", "map", "chan", "go", "select", "range", "for", "if", "else", "switch", "case", "default", "break", "continue", "return", "defer", "fallthrough", "nil", "true", "false", "make", "append", "len", "cap", "fmt", "Println", "Printf", "Errorf", }, "rust": { "fn", "let", "mut", "const", "static", "struct", "enum", "impl", "trait", "pub", "use", "mod", "crate", "self", "super", "return", "if", "else", "for", "while", "loop", "match", "break", "continue", "where", "as", "in", "ref", "true", "false", "None", "Some", "Ok", "Err", "Vec", "String", "println", "eprintln", "format", "macro_rules", "derive", "async", "await", "move", "unsafe", }, "sql": { "SELECT", "FROM", "WHERE", "INSERT", "INTO", "UPDATE", "DELETE", "CREATE", "ALTER", "DROP", "TABLE", "INDEX", "JOIN", "LEFT", "RIGHT", "INNER", "OUTER", "ON", "AND", "OR", "NOT", "NULL", "IS", "IN", "LIKE", "BETWEEN", "ORDER", "BY", "GROUP", "HAVING", "LIMIT", "OFFSET", "AS", "DISTINCT", "UNION", "ALL", "EXISTS", "COUNT", "SUM", "AVG", "MIN", "MAX", "PRIMARY", "KEY", "FOREIGN", "REFERENCES", "CASCADE", "SET", "VALUES", "BEGIN", "COMMIT", "ROLLBACK", "TRANSACTION", }, } # ======== أنماط كشف أجزاء الأكواد في النصوص ======== # أنماط Markdown code blocks MD_CODE_BLOCK_PATTERN: re.Pattern = re.compile( r"(```[\w]*\n)(.*?)(```)", re.DOTALL, ) # أنماط HTML/PHP code blocks HTML_CODE_BLOCK_PATTERN: re.Pattern = re.compile( r"<(code|pre)[^>]*>(.*?)", re.DOTALL | re.IGNORECASE, ) def __init__(self) -> None: """تهيئة حامي الأكواد.""" self._protected_sections: list[dict] = [] # لتخزين الأجزاء المحمية مؤقتاً logger.info("تم تهيئة حامي الأكواد البرمجية") # =================================================================== # كشف ملفات الأكواد # =================================================================== def is_code_file(self, file_path: str | Path) -> bool: """ يتحقق مما إذا كان الملف ملف أكواد برمجية. المعاملات: file_path: مسار الملف المعاد: True إذا كان الملف ملف أكواد، وإلا False """ path = Path(file_path) suffix = path.suffix.lower() # فحص الامتداد if suffix in self.CODE_EXTENSIONS: logger.debug("كشف ملف أكواد بالامتداد: %s", path.name) return True # فحص الاسم الخاص name_lower = path.name.lower() special_code_names = { "makefile", "dockerfile", "rakefile", "gemfile", "cmakelists.txt", } if name_lower in special_code_names: return True # فحص المحتوى — إذا كان يحتوي على عدد كبير من الكلمات المحجوزة try: content = path.read_text(encoding="utf-8", errors="ignore")[:2000] if self._count_code_indicators(content) >= 5: logger.debug("كشف ملف أكواد بالمحتوى: %s", path.name) return True except PermissionError: logger.warning("لا صلاحية لقراءة: %s", path) except OSError as exc: logger.warning("خطأ أثناء قراءة %s: %s", path, exc) return False def _count_code_indicators(self, text: str) -> int: """ يعدّد مؤشرات الأكواد في النص (أقواس، عوامل، كلمات محجوزة). المعاملات: text: النص المراد فحصه المعاد: عدد المؤشرات المكتشفة """ count = 0 # علامات ترقيم برمجية شائعة code_symbols = ["{", "}", ";", "=>", "->", "::", "#!", "def ", "func ", "class "] for symbol in code_symbols: count += text.count(symbol) # كلمات محجوزة من جميع اللغات all_keywords: set[str] = set() for keywords in self.RESERVED_KEYWORDS.values(): all_keywords.update(keywords) words = re.findall(r"\b\w+\b", text) for word in words: if word in all_keywords: count += 1 return count # =================================================================== # كشف اللغة # =================================================================== def detect_language(self, source: str | Path) -> str: """ يكشف لغة البرمجة من مسار ملف أو نص. المعاملات: source: مسار الملف أو النص البرمجي المعاد: اسم اللغة (مثل 'python', 'javascript', ...) أو 'unknown' """ # إذا كان مصدراً هو مسار ملف path = Path(source) if path.is_file(): return self._detect_language_by_path(path) # إذا كان نصاً text = str(source) return self._detect_language_by_content(text) def _detect_language_by_path(self, path: Path) -> str: """يكتشف اللغة من مسار الملف.""" ext_map: dict[str, str] = { ".py": "python", ".pyw": "python", ".pyi": "python", ".js": "javascript", ".jsx": "javascript", ".mjs": "javascript", ".ts": "typescript", ".tsx": "typescript", ".java": "java", ".kt": "kotlin", ".kts": "kotlin", ".scala": "scala", ".c": "c", ".h": "c", ".cpp": "cpp", ".hpp": "cpp", ".cc": "cpp", ".cxx": "cpp", ".cs": "csharp", ".vb": "vb", ".go": "go", ".rs": "rust", ".swift": "swift", ".rb": "ruby", ".rake": "ruby", ".php": "php", ".pl": "perl", ".pm": "perl", ".r": "r", ".lua": "lua", ".sh": "shell", ".bash": "shell", ".zsh": "shell", ".ps1": "powershell", ".bat": "batch", ".sql": "sql", ".dart": "dart", ".clj": "clojure", ".ex": "elixir", ".exs": "elixir", ".hs": "haskell", ".ml": "ocaml", ".html": "html", ".htm": "html", ".css": "css", ".scss": "scss", ".sass": "scss", ".less": "less", ".json": "json", ".yaml": "yaml", ".yml": "yaml", ".toml": "toml", ".xml": "xml", ".xsl": "xsl", ".vue": "vue", ".svelte": "svelte", ".proto": "protobuf", ".ipynb": "python", ".dockerfile": "docker", } suffix = path.suffix.lower() lang = ext_map.get(suffix) if lang: logger.debug("كشف لغة %s بالامتداد: %s", path.name, lang) return lang # أسماء ملفات خاصة name_lower = path.name.lower() special: dict[str, str] = { "makefile": "makefile", "dockerfile": "docker", "rakefile": "ruby", "gemfile": "ruby", } lang = special.get(name_lower, "unknown") logger.debug("كشف لغة %s: %s", path.name, lang) return lang def _detect_language_by_content(self, text: str) -> str: """يكتشف اللغة من خلال تحليل محتوى النص.""" scores: dict[str, int] = {} for lang, keywords in self.RESERVED_KEYWORDS.items(): score = 0 for kw in keywords: # استخدام حدود الكلمات لضمان الدقة pattern = rf"\b{re.escape(kw)}\b" matches = re.findall(pattern, text, re.IGNORECASE) score += len(matches) if score > 0: scores[lang] = score if not scores: return "unknown" best_lang = max(scores, key=scores.get) logger.debug( "كشف لغة بالمحتوى: %s (النتيجة: %d)", best_lang, scores[best_lang], ) return best_lang # =================================================================== # استخراج أجزاء الأكواد # =================================================================== def extract_code_blocks(self, text: str) -> list[dict]: """ يستخرج أجزاء الأكواد من النصوص المختلطة. يدعم: - أجزاء Markdown (```code```) - أجزاء HTML (... أو
...
) - الأجزاء المُغلفة بعلامات الحماية المعاملات: text: النص المختلط المعاد: قائمة بقواميس: [{"start": int, "end": int, "language": str, "code": str}, ...] """ blocks: list[dict] = [] # 1) أجزاء Markdown for match in self.MD_CODE_BLOCK_PATTERN.finditer(text): lang_tag = match.group(1).strip().lstrip("`").strip() language = lang_tag if lang_tag else "unknown" blocks.append({ "start": match.start(), "end": match.end(), "language": language, "code": match.group(2).strip(), }) # 2) أجزاء HTML for match in self.HTML_CODE_BLOCK_PATTERN.finditer(text): # تجنب التكرار مع أجزاء Markdown if any( b["start"] <= match.start() <= b["end"] for b in blocks ): continue blocks.append({ "start": match.start(), "end": match.end(), "language": "html", "code": match.group(2).strip(), }) # 3) أجزاء محمية بعلاماتنا marker_pattern = re.compile( re.escape(self.MARKER_START) + re.escape(self.LANG_TAG_START) + r"(\w+)" + re.escape(self.LANG_TAG_END) + r"(.*?)" + re.escape(self.MARKER_END), re.DOTALL, ) for match in marker_pattern.finditer(text): if any( b["start"] <= match.start() <= b["end"] for b in blocks ): continue blocks.append({ "start": match.start(), "end": match.end(), "language": match.group(1), "code": match.group(2).strip(), }) # ترتيب حسب الموضع blocks.sort(key=lambda b: b["start"]) logger.debug("تم استخراج %d جزء أكواد من النص", len(blocks)) return blocks # =================================================================== # حماية النص # =================================================================== def protect_text(self, text: str) -> str: """ يلف أجزاء الأكواد في النص بعلامات حماية لمنع تعديلها أثناء المعالجة النصية. المعاملات: text: النص المختلط (يحتوي على أكواد ومحتوى عادي) المعاد: النص المحميّ — أجزاء الأكواد ملفوفة بعلامات الحماية """ self._protected_sections = [] # 1) حماية أجزاء Markdown def protect_md_block(match: re.Match) -> str: lang_tag = match.group(1).strip().lstrip("`").strip() language = lang_tag if lang_tag else "unknown" code = match.group(2).strip() return ( f"{self.MARKER_START}{self.LANG_TAG_START}" f"{language}{self.LANG_TAG_END}" f"{code}{self.MARKER_END}" ) protected = self.MD_CODE_BLOCK_PATTERN.sub(protect_md_block, text) # 2) حماية أجزاء HTML def protect_html_block(match: re.Match) -> str: code = match.group(2).strip() return ( f"{self.MARKER_START}{self.LANG_TAG_START}" f"html{self.LANG_TAG_END}" f"{code}{self.MARKER_END}" ) protected = self.HTML_CODE_BLOCK_PATTERN.sub(protect_html_block, protected) # 3) تخزين الأجزاء المحمية للرجوع إليها self._protected_sections = self.extract_code_blocks(protected) logger.info("تم حماية %d جزء أكواد في النص", len(self._protected_sections)) return protected def strip_protection(self, text: str, format_: str = "raw") -> str: """ يزيل علامات الحماية من النص ويعيد الأكواد بصيغتها الأصلية. المعاملات: text: النص المحميّ format_: صيغة الإخراج: - 'raw': نص عادي بدون علامات - 'markdown': أجزاء Markdown (```) - 'html': أجزاء HTML (...) المعاد: النص بعد إزالة الحماية بالصيغة المطلوبة """ marker_pattern = re.compile( re.escape(self.MARKER_START) + re.escape(self.LANG_TAG_START) + r"(\w+)" + re.escape(self.LANG_TAG_END) + r"(.*?)" + re.escape(self.MARKER_END), re.DOTALL, ) if format_ == "raw": # إزالة العلامات فقط — إبقاء الكود كما هو result = marker_pattern.sub(r"\2", text) elif format_ == "markdown": def to_md(match: re.Match) -> str: lang = match.group(1) code = match.group(2) return f"```{lang}\n{code}\n```" result = marker_pattern.sub(to_md, text) elif format_ == "html": def to_html(match: re.Match) -> str: code = match.group(2) return f"{code}" result = marker_pattern.sub(to_html, text) else: raise ValueError( f"صيغة غير مدعومة: {format_}. القيم المقبولة: raw, markdown, html" ) logger.debug("تم إزالة الحماية (الصيغة: %s)", format_) return result # =================================================================== # منع التدقيق الإملائي على الأكواد # =================================================================== def is_protected_section(self, text: str, position: int) -> bool: """ يتحقق مما إذا كان الموضع المحدد يقع داخل جزء محمي من الأكواد. المعاملات: text: النص position: الموضع المطلوب فحصه المعاد: True إذا كان الموضع داخل جزء محمي """ marker_pattern = re.compile( re.escape(self.MARKER_START) + re.escape(self.LANG_TAG_START) + r"\w+" + re.escape(self.LANG_TAG_END) + r".*?" + re.escape(self.MARKER_END), re.DOTALL, ) for match in marker_pattern.finditer(text): if match.start() <= position <= match.end(): return True return False def get_protected_ranges(self, text: str) -> list[tuple[int, int]]: """ يعرض قائمة بمجالات الأجزاء المحمية في النص. المعاملات: text: النص المراد فحصه المعاد: قائمة بالأزواج (بداية، نهاية) لكل جزء محمي """ marker_pattern = re.compile( re.escape(self.MARKER_START) + r".*?" + re.escape(self.MARKER_END), re.DOTALL, ) ranges = [(m.start(), m.end()) for m in marker_pattern.finditer(text)] logger.debug("عدد المجالات المحمية: %d", len(ranges)) return ranges # =================================================================== # أدوات مساعدة # =================================================================== def get_protected_keywords(self, language: Optional[str] = None) -> dict[str, set[str]]: """ يعرض الكلمات المحجوزة. المعاملات: language: إذا حددت، يعرض كلمات لغة واحدة فقط المعاد: قاموس الكلمات المحجوزة (أو مجموعة واحدة إذا حددت لغة) """ if language: lang_lower = language.lower() if lang_lower in self.RESERVED_KEYWORDS: return {lang_lower: self.RESERVED_KEYWORDS[lang_lower]} logger.warning("لغة غير مدعومة: %s", language) return {} return dict(self.RESERVED_KEYWORDS)