| """ |
| ================================================ |
| Arabic Diacritization - mishkala |
| ูู
ูุฐุฌ ุงูุชุดููู ุงูุนุฑุจู ุงูุชููุงุฆู |
| https://huggingface.co/flokymind/mishkala |
| ================================================ |
| ุงูู
ุชุทูุจุงุช: |
| pip install torch pytorch-crf huggingface_hub |
| ================================================ |
| """ |
|
|
| |
| import subprocess, sys |
|
|
def install(pkg):
    """Quietly install *pkg* with pip using the running interpreter."""
    cmd = [sys.executable, "-m", "pip", "install", pkg, "-q"]
    subprocess.check_call(cmd)
|
|
# Bootstrap third-party dependencies: if an import fails, install the
# distribution on the fly.  The actual working imports happen again below.
try:
    import torchcrf
except ImportError:
    # Distribution name differs from the module name (pytorch-crf -> torchcrf).
    install("pytorch-crf")


try:
    from huggingface_hub import hf_hub_download
except ImportError:
    install("huggingface_hub")
|
|
| |
| import json, math, re |
| from pathlib import Path |
| from typing import Dict |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torchcrf import CRF |
| from huggingface_hub import hf_hub_download |
|
|
|
|
| |
| |
| |
|
|
# HuggingFace repository that hosts the pretrained mishkala checkpoint.
REPO_ID = "flokymind/mishkala"


# Arabic diacritic code points, stripped from input before tokenization:
# fatha, fathatan, damma, dammatan, kasra, kasratan, shadda, sukun.
DIACRITICS_SET = {
    '\u064e', '\u064b', '\u064f', '\u064c',
    '\u0650', '\u064d', '\u0651', '\u0652',
}


# Reserved token ids; note the literal space character is also special (id 5).
SPECIAL_TOKENS = {'PAD': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3, 'MASK': 4, ' ': 5}


# The 15 output label names, in the index order emitted by the classifier/CRF.
DIACRITIC_CLASSES = [
    'NO_DIACRITIC', 'FATHA', 'FATHATAN', 'DAMMA', 'DAMMATAN',
    'KASRA', 'KASRATAN', 'SUKUN', 'SHADDA',
    'SHADDA_FATHA', 'SHADDA_FATHATAN', 'SHADDA_DAMMA',
    'SHADDA_DAMMATAN', 'SHADDA_KASRA', 'SHADDA_KASRATAN',
]


# Label name -> diacritic character(s) appended after the base letter.
# The SHADDA_* entries are shadda followed by the corresponding short vowel.
DIACRITIC_MAP = {
    'NO_DIACRITIC': '',
    'FATHA': '\u064e',
    'FATHATAN': '\u064b',
    'DAMMA': '\u064f',
    'DAMMATAN': '\u064c',
    'KASRA': '\u0650',
    'KASRATAN': '\u064d',
    'SUKUN': '\u0652',
    'SHADDA': '\u0651',
    'SHADDA_FATHA': '\u0651\u064e',
    'SHADDA_FATHATAN': '\u0651\u064b',
    'SHADDA_DAMMA': '\u0651\u064f',
    'SHADDA_DAMMATAN': '\u0651\u064c',
    'SHADDA_KASRA': '\u0651\u0650',
    'SHADDA_KASRATAN': '\u0651\u064d',
}
|
|
|
|
| |
| |
| |
|
|
class ArabicTokenizer:
    """Character-level tokenizer mapping Arabic text to integer ids.

    Diacritics are skipped during encoding so that model predictions can
    later be re-aligned with the bare (undiacritized) characters.
    """

    def __init__(self):
        self.char_to_id: Dict[str, int] = {}   # character -> integer id
        self.id_to_char: Dict[int, str] = {}   # inverse mapping
        self.vocab_size: int = 0

    def encode(self, text, max_length=512, padding=True):
        """Encode *text* as ``(ids, attention_mask)``.

        The sequence is BOS + characters + EOS; unknown characters map to
        UNK, diacritics are skipped.  Sequences longer than *max_length*
        are truncated, and the final kept position is forced back to EOS
        so the model always sees a terminated sequence (bug fix: the
        original truncation silently dropped the EOS token).
        """
        ids = [SPECIAL_TOKENS['BOS']]
        for ch in text:
            if ch in DIACRITICS_SET:
                continue
            ids.append(self.char_to_id.get(ch, SPECIAL_TOKENS['UNK']))
        ids.append(SPECIAL_TOKENS['EOS'])

        attention_mask = [1] * len(ids)

        if len(ids) > max_length:
            ids = ids[:max_length]
            ids[-1] = SPECIAL_TOKENS['EOS']  # keep the sequence terminated
            attention_mask = attention_mask[:max_length]
        elif padding:
            pad_len = max_length - len(ids)
            ids += [SPECIAL_TOKENS['PAD']] * pad_len
            attention_mask += [0] * pad_len

        return ids, attention_mask

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file with 'char_to_id'/'vocab_size' keys."""
        data = json.loads(Path(path).read_text(encoding='utf-8'))
        tok = cls()
        tok.char_to_id = data['char_to_id']
        tok.id_to_char = {int(v): k for k, v in data['char_to_id'].items()}
        tok.vocab_size = data['vocab_size']
        # Fixed mojibake in the original log string (Arabic was double-encoded).
        print(f"✅ التوكنايزر: {tok.vocab_size} رمز")
        return tok
|
|
|
|
| |
| |
| |
|
|
class RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learned per-dimension scale."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Normalize by the RMS of the last dimension (eps avoids div-by-zero).
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return self.scale * x / torch.sqrt(mean_square + self.eps)
|
|
|
|
class RotaryEmbedding(nn.Module):
    """Precomputed rotary position embedding (RoPE) cos/sin tables."""

    def __init__(self, dim, max_seq_len=4096):
        super().__init__()
        # Standard RoPE inverse-frequency schedule over even dimensions.
        half_freqs = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', half_freqs)
        positions = torch.arange(max_seq_len).float()
        angles = torch.outer(positions, half_freqs)
        table = torch.cat([angles, angles], dim=-1)
        self.register_buffer('cos_cached', table.cos())
        self.register_buffer('sin_cached', table.sin())

    def forward(self, x, seq_len):
        # x itself is unused; only the requested sequence length matters.
        cos = self.cos_cached[:seq_len].unsqueeze(0)
        sin = self.sin_cached[:seq_len].unsqueeze(0)
        return cos, sin
|
|
|
|
def rotate_half(x):
    """Rotate the last dimension: (x1, x2) -> (-x2, x1), as used by RoPE."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat([-second, first], dim=-1)
|
|
|
|
def apply_rope(q, k, cos, sin):
    """Apply rotary position embeddings to query and key tensors."""

    def _rotate(t):
        # Same transform as module-level rotate_half: (x1, x2) -> (-x2, x1).
        half = t.shape[-1] // 2
        return torch.cat([-t[..., half:], t[..., :half]], dim=-1)

    q_rot = q * cos + _rotate(q) * sin
    k_rot = k * cos + _rotate(k) * sin
    return q_rot, k_rot
|
|
|
|
class SwiGLU(nn.Module):
    """SwiGLU feed-forward block: down(silu(gate(x)) * up(x))."""

    def __init__(self, dim, expansion=4):
        super().__init__()
        # The 2/3 factor keeps parameter count comparable to a plain 4x MLP;
        # the hidden width is rounded up to the next multiple of 8.
        hidden = int(dim * expansion * 2 / 3)
        hidden = ((hidden + 7) // 8) * 8
        self.gate_proj = nn.Linear(dim, hidden, bias=False)
        self.up_proj = nn.Linear(dim, hidden, bias=False)
        self.down_proj = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        gated = F.silu(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
|
|
|
|
class MambaBlock(nn.Module):
    """Simplified Mamba-style mixing block with a depthwise conv front end.

    NOTE(review): ``ssm`` here is a lightweight approximation (a cumulative
    sum scaled by a learned step), not a full selective-scan SSM.
    """

    def __init__(self, dim, d_state=16, d_conv=4, expand=2):
        super().__init__()
        self.d_inner = int(dim * expand)
        self.in_proj = nn.Linear(dim, self.d_inner * 2, bias=False)
        # Depthwise causal conv; the right-side overhang is trimmed in forward().
        self.conv1d = nn.Conv1d(self.d_inner, self.d_inner, d_conv,
                                padding=d_conv - 1, groups=self.d_inner, bias=True)
        self.out_proj = nn.Linear(self.d_inner, dim, bias=False)
        self.x_proj = nn.Linear(self.d_inner, d_state * 2 + 1, bias=False)
        self.dt_proj = nn.Linear(1, self.d_inner, bias=True)
        A = torch.arange(1, d_state + 1).float().unsqueeze(0).expand(self.d_inner, -1)
        self.A_log = nn.Parameter(torch.log(A))
        self.D = nn.Parameter(torch.ones(self.d_inner))
        self.norm = RMSNorm(dim)

    def ssm(self, x):
        # Learned, always-positive step size from the first x_proj channel.
        step = F.softplus(self.dt_proj(self.x_proj(x)[..., :1]))
        return x * self.D + torch.cumsum(x * step, dim=1) * 0.1

    def forward(self, x):
        skip = x
        normed = self.norm(x)
        stream, gate = self.in_proj(normed).chunk(2, dim=-1)
        # Conv1d expects (B, C, L); trim the causal padding back to length L.
        conv_out = self.conv1d(stream.transpose(1, 2))
        conv_out = conv_out[..., :stream.shape[1]].transpose(1, 2)
        mixed = self.ssm(F.silu(conv_out)) * F.silu(gate)
        return self.out_proj(mixed) + skip
|
|
|
|
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: RoPE multi-head attention + SwiGLU FFN."""

    def __init__(self, dim, n_heads, max_len=4096, dropout=0.1):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.o_proj = nn.Linear(dim, dim, bias=False)
        self.rope = RotaryEmbedding(self.head_dim, max_len)
        self.ffn = SwiGLU(dim)
        self.norm1 = RMSNorm(dim)
        self.norm2 = RMSNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def attention(self, x, mask=None):
        """Bidirectional multi-head attention with rotary embeddings."""
        batch, seq, dim = x.shape

        def split_heads(proj):
            return proj(x).view(batch, seq, self.n_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj)
        k = split_heads(self.k_proj)
        v = split_heads(self.v_proj)

        cos, sin = self.rope(x, seq)
        cos = cos.unsqueeze(1).expand_as(q)
        sin = sin.unsqueeze(1).expand_as(q)
        q, k = apply_rope(q, k, cos, sin)

        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        if mask is not None:
            # mask marks valid *key* positions; padded keys become -inf.
            key_mask = mask.unsqueeze(1).unsqueeze(2).bool()
            scores = scores.masked_fill(~key_mask, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        context = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, dim)
        return self.o_proj(context)

    def forward(self, x, mask=None):
        x = x + self.dropout(self.attention(self.norm1(x), mask))
        return x + self.dropout(self.ffn(self.norm2(x)))
|
|
|
|
class ArabicDiacritizerModel(nn.Module):
    """Hybrid Mamba + Transformer character tagger with a CRF head.

    Character ids are embedded, passed through a stack of Mamba blocks
    followed by transformer blocks, and each position is decoded into one
    of ``num_labels`` diacritic classes via a linear layer plus a CRF.
    """

    def __init__(self, vocab_size=50, dim=320, mamba_layers=4,
                 transformer_layers=8, n_heads=8, num_labels=15,
                 max_seq_len=4096, dropout=0.15, d_state=16):
        super().__init__()
        self.num_labels = num_labels
        self.embedding = nn.Embedding(vocab_size, dim, padding_idx=0)
        self.emb_norm = RMSNorm(dim)
        self.dropout = nn.Dropout(dropout)
        self.mamba_layers = nn.ModuleList(
            MambaBlock(dim, d_state) for _ in range(mamba_layers)
        )
        self.transformer_layers = nn.ModuleList(
            TransformerBlock(dim, n_heads, max_seq_len, dropout)
            for _ in range(transformer_layers)
        )
        self.final_norm = RMSNorm(dim)
        self.classifier = nn.Linear(dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return CRF-decoded label sequences plus raw emission scores.

        ``labels`` is accepted for API compatibility; it is not used here
        (presumably the training loss lives elsewhere — NOTE(review)).
        """
        hidden = self.dropout(self.emb_norm(self.embedding(input_ids)))
        for block in self.mamba_layers:
            hidden = block(hidden)
        for block in self.transformer_layers:
            hidden = block(hidden, attention_mask)
        emissions = self.classifier(self.final_norm(hidden))

        if attention_mask is not None:
            mask = attention_mask.bool()
        else:
            mask = torch.ones(emissions.shape[:2],
                              dtype=torch.bool, device=emissions.device)
        return {
            'predictions': self.crf.decode(emissions, mask=mask),
            'emissions': emissions,
        }
|
|
|
|
| |
| |
| |
|
|
def load_mishkala(repo_id: str = REPO_ID, device: str = None):
    """Download and initialize the mishkala model from HuggingFace.

    Args:
        repo_id: HuggingFace repository id hosting the checkpoint.
        device:  'cuda' / 'cpu'; auto-detected when None.

    Returns:
        (model, tokenizer, device) ready for inference (model in eval mode).

    Example:
        model, tokenizer, device = load_mishkala()
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # Fixed mojibake in the original log strings (Arabic was double-encoded).
    print(f"📥 تحميل مشكالة من {repo_id}...")

    tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json")
    tokenizer = ArabicTokenizer.load(tokenizer_path)

    ckpt_path = hf_hub_download(repo_id=repo_id, filename="mishkala.pt")
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from a trusted repository.
    ckpt = torch.load(ckpt_path, map_location=device)
    model = ArabicDiacritizerModel(**ckpt['config']).to(device)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    params = sum(p.numel() for p in model.parameters())
    print(f"✅ النموذج جاهز | Step: {ckpt['step']:,} | DER: {ckpt['der']*100:.2f}%")
    print(f"   {device} | {params:,} معلمة")

    return model, tokenizer, device
|
|
|
|
| |
| |
| |
|
|
def tashkeel(
    text: str,
    model: ArabicDiacritizerModel = None,
    tokenizer: ArabicTokenizer = None,
    device: torch.device = None,
    max_chunk: int = 400,
) -> str:
    """Automatically diacritize (tashkeel) Arabic text.

    Args:
        text:      Arabic text to diacritize (existing diacritics are stripped).
        model:     model instance (auto-loaded once and cached when omitted).
        tokenizer: tokenizer instance (auto-loaded when omitted).
        device:    torch device (cuda/cpu).
        max_chunk: maximum character length of one chunk fed to the model.

    Returns:
        The input text with predicted diacritics inserted after each character.

    Example:
        model, tokenizer, device = load_mishkala()
        print(tashkeel(some_arabic_text, model, tokenizer, device))
    """
    global _default_model, _default_tokenizer, _default_device
    # Bug fix: the original only auto-loaded when *model* was None, so a
    # caller passing a model without a tokenizer/device crashed below.
    if model is None or tokenizer is None or device is None:
        if '_default_model' not in globals():
            _default_model, _default_tokenizer, _default_device = load_mishkala()
        model = model if model is not None else _default_model
        tokenizer = tokenizer if tokenizer is not None else _default_tokenizer
        device = device if device is not None else _default_device

    # Strip any pre-existing diacritics so predictions start from clean text.
    clean = ''.join(c for c in text if c not in DIACRITICS_SET)

    # Split on sentence delimiters (kept via the capture group) and pack the
    # parts into chunks of at most ~max_chunk characters.
    sentences = re.split(r'([.،؟!\n])', clean)
    chunks, current = [], ""
    for part in sentences:
        if len(current) + len(part) > max_chunk and current:
            # Bug fix: keep boundary whitespace — the original stripped each
            # chunk and rejoined with '', fusing words across chunk borders.
            chunks.append(current)
            current = part
        else:
            current += part
    if current:
        chunks.append(current)

    results = []
    for chunk in chunks:
        if not chunk.strip():
            # Whitespace-only chunk: pass through unchanged.
            results.append(chunk)
            continue

        input_ids, attention_mask = tokenizer.encode(chunk, max_length=512, padding=True)
        ids_t = torch.tensor([input_ids], dtype=torch.long).to(device)
        mask_t = torch.tensor([attention_mask], dtype=torch.long).to(device)

        with torch.no_grad():
            out = model(ids_t, mask_t)

        # CRF predictions cover BOS + characters + EOS, hence the +1 offset;
        # the length guard handles chunks truncated by the tokenizer.
        pred_labels = out['predictions'][0]
        chars = [c for c in chunk if c not in DIACRITICS_SET]
        result_chars = []
        for i, char in enumerate(chars):
            result_chars.append(char)
            label_idx = i + 1
            if label_idx < len(pred_labels):
                diacritic = DIACRITIC_MAP.get(
                    DIACRITIC_CLASSES[pred_labels[label_idx]], ''
                )
                result_chars.append(diacritic)

        results.append(''.join(result_chars))

    return ''.join(results)
|
|
|
|
| |
| |
| |
|
|
| if __name__ == "__main__": |
| model, tokenizer, device = load_mishkala() |
|
|
| text = "ุงูุฅูุณุงู ุจูู ุงูุนูู ูุงูุบุฑูุฒุฉ" |
| print(f"\nโจ {tashkeel(text, model, tokenizer, device)}") |
|
|