""" ================================================ Arabic Diacritization - mishkala نموذج التشكيل العربي التلقائي https://huggingface.co/flokymind/mishkala ================================================ المتطلبات: pip install torch pytorch-crf huggingface_hub ================================================ """ # ── المتطلبات ───────────────────────────────── import subprocess, sys def install(pkg): subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"]) try: import torchcrf except ImportError: install("pytorch-crf") try: from huggingface_hub import hf_hub_download except ImportError: install("huggingface_hub") # ── الاستيرادات ─────────────────────────────── import json, math, re from pathlib import Path from typing import Dict import torch import torch.nn as nn import torch.nn.functional as F from torchcrf import CRF from huggingface_hub import hf_hub_download # ════════════════════════════════════════════ # 1. الثوابت # ════════════════════════════════════════════ REPO_ID = "flokymind/mishkala" DIACRITICS_SET = { '\u064e', '\u064b', '\u064f', '\u064c', '\u0650', '\u064d', '\u0651', '\u0652', } SPECIAL_TOKENS = {'PAD': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3, 'MASK': 4, ' ': 5} DIACRITIC_CLASSES = [ 'NO_DIACRITIC', 'FATHA', 'FATHATAN', 'DAMMA', 'DAMMATAN', 'KASRA', 'KASRATAN', 'SUKUN', 'SHADDA', 'SHADDA_FATHA', 'SHADDA_FATHATAN', 'SHADDA_DAMMA', 'SHADDA_DAMMATAN', 'SHADDA_KASRA', 'SHADDA_KASRATAN', ] DIACRITIC_MAP = { 'NO_DIACRITIC': '', 'FATHA': '\u064e', 'FATHATAN': '\u064b', 'DAMMA': '\u064f', 'DAMMATAN': '\u064c', 'KASRA': '\u0650', 'KASRATAN': '\u064d', 'SUKUN': '\u0652', 'SHADDA': '\u0651', 'SHADDA_FATHA': '\u0651\u064e', 'SHADDA_FATHATAN': '\u0651\u064b', 'SHADDA_DAMMA': '\u0651\u064f', 'SHADDA_DAMMATAN': '\u0651\u064c', 'SHADDA_KASRA': '\u0651\u0650', 'SHADDA_KASRATAN': '\u0651\u064d', } # ════════════════════════════════════════════ # 2. 
# Tokenizer
# ════════════════════════════════════════════
class ArabicTokenizer:
    """Character-level tokenizer mapping Arabic characters to integer ids.

    Diacritic characters are skipped during encoding: the model only sees
    base characters and predicts one diacritic class per position.
    """

    def __init__(self):
        self.char_to_id: Dict[str, int] = {}  # character -> id
        self.id_to_char: Dict[int, str] = {}  # id -> character (inverse map)
        self.vocab_size: int = 0

    def encode(self, text, max_length=512, padding=True):
        """Encode *text* into ``(ids, attention_mask)``.

        The sequence is wrapped in BOS/EOS, unknown characters map to UNK,
        and diacritics are dropped.  If the result exceeds *max_length* it
        is truncated — keeping EOS as the final kept token so downstream
        code relying on an EOS terminator still works (the original code
        silently dropped EOS on truncation).  With *padding*, the output is
        right-padded with PAD to exactly *max_length* (mask 0 on padding).
        """
        ids = [SPECIAL_TOKENS['BOS']]
        for ch in text:
            if ch in DIACRITICS_SET:
                continue  # model input never contains diacritics
            ids.append(self.char_to_id.get(ch, SPECIAL_TOKENS['UNK']))
        ids.append(SPECIAL_TOKENS['EOS'])

        attention_mask = [1] * len(ids)
        if len(ids) > max_length:
            # Fix: preserve the EOS terminator in the last kept position
            # instead of cutting it off with the tail.
            ids = ids[:max_length]
            ids[-1] = SPECIAL_TOKENS['EOS']
            attention_mask = attention_mask[:max_length]
        elif padding:
            pad_len = max_length - len(ids)
            ids += [SPECIAL_TOKENS['PAD']] * pad_len
            attention_mask += [0] * pad_len
        return ids, attention_mask

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file with 'char_to_id' and 'vocab_size'."""
        data = json.loads(Path(path).read_text(encoding='utf-8'))
        tok = cls()
        tok.char_to_id = data['char_to_id']
        # Invert the mapping; JSON keys arrive as strings, hence int(v).
        tok.id_to_char = {int(v): k for k, v in data['char_to_id'].items()}
        tok.vocab_size = data['vocab_size']
        print(f"✅ التوكنايزر: {tok.vocab_size} رمز")
        return tok

# ════════════════════════════════════════════
# 3.
# Model components
# ════════════════════════════════════════════
class RMSNorm(nn.Module):
    # Root-mean-square layer norm: scales by 1/RMS(x) with a learned gain,
    # no mean subtraction and no bias.
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # added before sqrt for numerical stability
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = x.pow(2).mean(-1, keepdim=True).add(self.eps).sqrt()
        return self.scale * x / rms


class RotaryEmbedding(nn.Module):
    # Precomputes cos/sin tables for rotary position embeddings (RoPE)
    # up to max_seq_len positions.
    def __init__(self, dim, max_seq_len=4096):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        t = torch.arange(max_seq_len).float()
        freqs = torch.outer(t, inv_freq)
        # Duplicate the frequency halves so the tables cover the full head dim.
        emb = torch.cat([freqs, freqs], dim=-1)
        self.register_buffer('cos_cached', emb.cos())
        self.register_buffer('sin_cached', emb.sin())

    def forward(self, x, seq_len):
        # Returns (cos, sin), each shaped (1, seq_len, dim); `x` is unused
        # beyond matching the calling convention.
        return (
            self.cos_cached[:seq_len].unsqueeze(0),
            self.sin_cached[:seq_len].unsqueeze(0),
        )


def rotate_half(x):
    # Swap the two halves of the last dim, negating the second: (x1,x2) -> (-x2,x1).
    x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
    return torch.cat([-x2, x1], dim=-1)


def apply_rope(q, k, cos, sin):
    # Apply the rotary rotation to query and key tensors.
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)


class SwiGLU(nn.Module):
    # SwiGLU feed-forward: down( silu(gate(x)) * up(x) ).
    def __init__(self, dim, expansion=4):
        super().__init__()
        # 2/3 scaling keeps parameter count comparable to a plain 4x MLP;
        # rounded up to a multiple of 8.
        hidden = int(dim * expansion * 2 / 3)
        hidden = (hidden + 7) // 8 * 8
        self.gate_proj = nn.Linear(dim, hidden, bias=False)
        self.up_proj = nn.Linear(dim, hidden, bias=False)
        self.down_proj = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))


class MambaBlock(nn.Module):
    # Mamba-style block with a depthwise causal conv and a simplified scan.
    # NOTE(review): `ssm` below is a simplified stand-in — it only consumes
    # the first channel of x_proj's output as dt; A_log and the d_state
    # dimensions are parameters but are not used in this forward path.
    def __init__(self, dim, d_state=16, d_conv=4, expand=2):
        super().__init__()
        self.d_inner = int(dim * expand)
        self.in_proj = nn.Linear(dim, self.d_inner * 2, bias=False)
        # Depthwise conv; padding=d_conv-1 pads left+right, the forward
        # slices back to seq_len, which makes it effectively causal.
        self.conv1d = nn.Conv1d(self.d_inner, self.d_inner, d_conv,
                                padding=d_conv-1, groups=self.d_inner, bias=True)
        self.out_proj = nn.Linear(self.d_inner, dim, bias=False)
        self.x_proj = nn.Linear(self.d_inner, d_state * 2 + 1, bias=False)
        self.dt_proj = nn.Linear(1, self.d_inner, bias=True)
        A = torch.arange(1, d_state+1).float().unsqueeze(0).expand(self.d_inner, -1)
        self.A_log = nn.Parameter(torch.log(A))
        self.D = nn.Parameter(torch.ones(self.d_inner))
        self.norm = RMSNorm(dim)

    def ssm(self, x):
        # dt = softplus of a learned scalar gate per position/channel;
        # output mixes a skip path (x*D) with a scaled causal cumulative sum.
        dt = F.softplus(self.dt_proj(self.x_proj(x)[..., :1]))
        return x * self.D + torch.cumsum(x * dt, dim=1) * 0.1

    def forward(self, x):
        residual = x
        x = self.norm(x)
        xz = self.in_proj(x)
        x_ssm, z = xz.chunk(2, dim=-1)  # state path and gate path
        # Conv over time: (B,L,C) -> (B,C,L) -> conv -> trim right pad -> (B,L,C).
        x_conv = self.conv1d(x_ssm.transpose(1,2))[..., :x_ssm.shape[1]].transpose(1,2)
        y = self.ssm(F.silu(x_conv)) * F.silu(z)
        return self.out_proj(y) + residual


class TransformerBlock(nn.Module):
    # Pre-norm transformer block: RoPE multi-head self-attention + SwiGLU FFN.
    def __init__(self, dim, n_heads, max_len=4096, dropout=0.1):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.o_proj = nn.Linear(dim, dim, bias=False)
        self.rope = RotaryEmbedding(self.head_dim, max_len)
        self.ffn = SwiGLU(dim)
        self.norm1 = RMSNorm(dim)
        self.norm2 = RMSNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def attention(self, x, mask=None):
        # Full (bidirectional) scaled dot-product attention.
        # `mask` is a per-token keep mask (1 = real token, 0 = padding).
        B, L, D = x.shape
        q = self.q_proj(x).view(B,L,self.n_heads,self.head_dim).transpose(1,2)
        k = self.k_proj(x).view(B,L,self.n_heads,self.head_dim).transpose(1,2)
        v = self.v_proj(x).view(B,L,self.n_heads,self.head_dim).transpose(1,2)
        cos, sin = self.rope(x, L)
        cos = cos.unsqueeze(1).expand_as(q)
        sin = sin.unsqueeze(1).expand_as(q)
        q, k = apply_rope(q, k, cos, sin)
        scores = torch.matmul(q, k.transpose(-2,-1)) / math.sqrt(self.head_dim)
        if mask is not None:
            # Masks padded *keys*: broadcast over heads and query positions.
            scores = scores.masked_fill(
                ~mask.unsqueeze(1).unsqueeze(2).bool(), float('-inf')
            )
        attn = self.dropout(F.softmax(scores, dim=-1))
        out = torch.matmul(attn, v).transpose(1,2).contiguous().view(B,L,D)
        return self.o_proj(out)

    def forward(self, x, mask=None):
        x = x + self.dropout(self.attention(self.norm1(x), mask))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x


class ArabicDiacritizerModel(nn.Module):
    # Hybrid architecture: char embedding -> Mamba blocks -> transformer
    # blocks -> linear emissions -> CRF decoding over 15 diacritic labels.
    def __init__(self, vocab_size=50, dim=320, mamba_layers=4,
                 transformer_layers=8, n_heads=8, num_labels=15,
                 max_seq_len=4096, dropout=0.15, d_state=16):
        super().__init__()
        self.num_labels = num_labels
        self.embedding = nn.Embedding(vocab_size, dim, padding_idx=0)
        self.emb_norm = RMSNorm(dim)
        self.dropout = nn.Dropout(dropout)
        self.mamba_layers = nn.ModuleList([
            MambaBlock(dim, d_state) for _ in range(mamba_layers)
        ])
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(dim, n_heads, max_seq_len, dropout)
            for _ in range(transformer_layers)
        ])
        self.final_norm = RMSNorm(dim)
        self.classifier = nn.Linear(dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Returns {'predictions': CRF-decoded label lists (one per sequence,
        # length = number of unmasked positions), 'emissions': raw scores}.
        # `labels` is accepted for signature compatibility but unused here.
        x = self.dropout(self.emb_norm(self.embedding(input_ids)))
        for m in self.mamba_layers:
            x = m(x)
        for t in self.transformer_layers:
            x = t(x, attention_mask)
        emissions = self.classifier(self.final_norm(x))
        mask = (attention_mask.bool() if attention_mask is not None
                else torch.ones(emissions.shape[:2], dtype=torch.bool,
                                device=emissions.device))
        return {
            'predictions': self.crf.decode(emissions, mask=mask),
            'emissions': emissions,
        }

# ════════════════════════════════════════════
# 4.
# Load the model from HuggingFace
# ════════════════════════════════════════════
def load_mishkala(repo_id: str = REPO_ID, device: str = None):
    """Download the mishkala tokenizer and checkpoint from HuggingFace Hub
    and return a ready-to-use ``(model, tokenizer, device)`` triple.

    Example:
        model, tokenizer, device = load_mishkala()
    """
    # Resolve the target device: explicit argument wins, else prefer CUDA.
    device = torch.device(
        device if device is not None
        else ('cuda' if torch.cuda.is_available() else 'cpu')
    )

    print(f"📥 تحميل مِشكالة من {repo_id}...")

    # The tokenizer is a small JSON vocabulary file.
    vocab_file = hf_hub_download(repo_id=repo_id, filename="tokenizer.json")
    tokenizer = ArabicTokenizer.load(vocab_file)

    # The checkpoint bundles the architecture config and trained weights.
    weights_file = hf_hub_download(repo_id=repo_id, filename="mishkala.pt")
    ckpt = torch.load(weights_file, map_location=device)

    model = ArabicDiacritizerModel(**ckpt['config']).to(device)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()  # inference only — disable dropout

    params = sum(p.numel() for p in model.parameters())
    print(f"✅ النموذج جاهز | Step: {ckpt['step']:,} | DER: {ckpt['der']*100:.2f}%")
    print(f"   {device} | {params:,} معلمة")
    return model, tokenizer, device

# ════════════════════════════════════════════
# 5.
# Diacritization function
# ════════════════════════════════════════════
def tashkeel(
    text: str,
    model: ArabicDiacritizerModel = None,
    tokenizer: ArabicTokenizer = None,
    device: torch.device = None,
    max_chunk: int = 400,
) -> str:
    """Diacritize arbitrary Arabic text.

    Parameters:
        text      : Arabic text to diacritize (any existing diacritics are
                    stripped before inference)
        model     : the model (auto-loaded on first call when omitted)
        tokenizer : the tokenizer (auto-loaded together with the model)
        device    : cuda/cpu device
        max_chunk : maximum characters per chunk fed to the model in one pass

    Returns:
        The fully diacritized text.

    Example:
        model, tokenizer, device = load_mishkala()
        result = tashkeel("كان الفيلسوف يرى أن العقل مرآة", model, tokenizer, device)
        print(result)  # كَانَ الْفَيْلَسُوفُ يَرَى أَنَّ الْعَقْلَ مِرْآةٌ
    """
    # Lazy singleton: load the model once, reuse it on subsequent calls.
    global _default_model, _default_tokenizer, _default_device
    if model is None:
        if '_default_model' not in globals():
            _default_model, _default_tokenizer, _default_device = load_mishkala()
        model, tokenizer, device = _default_model, _default_tokenizer, _default_device

    # Strip existing diacritics so the model starts from clean input.
    clean = ''.join(c for c in text if c not in DIACRITICS_SET)

    # Split on sentence boundaries (keeping the separators) and pack the
    # pieces into chunks of at most ~max_chunk characters.
    # Bug fix: chunks are no longer .strip()ed before/after inference —
    # stripping silently dropped whitespace and newlines at chunk
    # boundaries whenever the input exceeded max_chunk.  Chunks now
    # reassemble to `clean` exactly (minus nothing), so the output's base
    # characters reproduce the input verbatim.
    sentences = re.split(r'([.،؟!\n])', clean)
    chunks, current = [], ""
    for part in sentences:
        if len(current) + len(part) > max_chunk and current:
            chunks.append(current)
            current = part
        else:
            current += part
    if current:
        chunks.append(current)

    results = []
    for chunk in chunks:
        # Whitespace-only chunks pass through unchanged — no model call.
        if not chunk.strip():
            results.append(chunk)
            continue

        input_ids, attention_mask = tokenizer.encode(chunk, max_length=512, padding=True)
        ids_t = torch.tensor([input_ids], dtype=torch.long).to(device)
        mask_t = torch.tensor([attention_mask], dtype=torch.long).to(device)
        with torch.no_grad():
            out = model(ids_t, mask_t)
        pred_labels = out['predictions'][0]

        # Re-interleave: character i of the chunk corresponds to label
        # index i + 1 (offset by the BOS token the tokenizer prepends).
        chars = [c for c in chunk if c not in DIACRITICS_SET]
        result_chars = []
        for i, char in enumerate(chars):
            result_chars.append(char)
            label_idx = i + 1
            if label_idx < len(pred_labels):
                diacritic = DIACRITIC_MAP.get(
                    DIACRITIC_CLASSES[pred_labels[label_idx]], ''
                )
                result_chars.append(diacritic)
        results.append(''.join(result_chars))

    return ''.join(results)


# ════════════════════════════════════════════
# 6. Direct execution
# ════════════════════════════════════════════
if __name__ == "__main__":
    model, tokenizer, device = load_mishkala()
    text = "الإنسان بين العقل والغريزة"
    print(f"\n✨ {tashkeel(text, model, tokenizer, device)}")