| import re |
| from typing import List, Dict |
|
|
# Matches a single character from the Arabic-script Unicode blocks:
#   U+0600-U+06FF                 Arabic
#   U+0750-U+077F                 Arabic Supplement
#   U+0870-U+08FF                 Arabic Extended-B and Arabic Extended-A
#   U+FB50-U+FDCF, U+FDF0-U+FDFF  Arabic Presentation Forms-A
#                                 (noncharacters U+FDD0-U+FDEF are skipped)
#   U+FE70-U+FEFC                 Arabic Presentation Forms-B
#                                 (stops before U+FEFF, the byte-order mark)
# The presentation-form ranges matter for text extracted from PDFs or legacy
# encodings, where letters are often stored as pre-shaped ligatures.
ARABIC_CHAR_RE = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF"
    r"\uFB50-\uFDCF\uFDF0-\uFDFF\uFE70-\uFEFC]"
)


def is_arabic_text(text: str) -> bool:
    """Return True if *text* contains at least one Arabic-script character.

    Empty or ``None`` input yields False. A single matching character
    anywhere in the string is enough; the rest of the text may be in any
    other script.
    """
    if not text:
        return False
    return ARABIC_CHAR_RE.search(text) is not None
|
|
|
|
# Loose phone-number shape used by the heuristic fallback: an optional "+",
# then digits that may be separated by whitespace or hyphens (8+ characters
# between the first and last digit, inclusive).
_PHONE_RE = re.compile(r"\+?\d[\d\s\-]{6,}\d")

# Holds the CAMeL Tools recognizer under the key 'model' once it has loaded,
# so the pretrained model is not re-created on every call. Failures are not
# cached: a later call retries the import and load.
_NER_CACHE = {}


def _get_ner_recognizer():
    """Return the shared CAMeL Tools recognizer, loading it on first use."""
    recognizer = _NER_CACHE.get('model')
    if recognizer is None:
        from camel_tools.ner import NERecognizer
        recognizer = NERecognizer.pretrained()
        _NER_CACHE['model'] = recognizer
    return recognizer


def _group_bio_entities(tokens, tags):
    """Merge per-token BIO tags into entity dicts with prefix-free labels."""
    entities = []
    current = None
    for token, tag in zip(tokens, tags):
        if tag == 'O':
            current = None
            continue
        prefix, sep, label = tag.partition('-')
        if not sep or prefix not in ('B', 'I'):
            # No BIO prefix: the whole tag is the label, and it may continue
            # a running entity that carries the same label.
            prefix, label = 'I', tag
        if current is not None and prefix == 'I' and current['label'] == label:
            current['text'] += ' ' + token
        else:
            # A "B-" tag, a label change, or an "I-" with nothing to continue
            # all open a new entity instead of being glued to the previous one.
            current = {'text': token, 'label': label}
            entities.append(current)
    return entities


def _heuristic_entities(text):
    """Find phone-like numbers and a Cairo mention without any NER model."""
    entities = [{'text': match, 'label': 'PHONE'}
                for match in _PHONE_RE.findall(text)]
    # The entity text is always the English form 'Cairo', whichever spelling
    # appeared in the input.
    if 'القاهرة' in text or 'Cairo' in text:
        entities.append({'text': 'Cairo', 'label': 'GPE'})
    return entities


def _summarize_labels(entities):
    """Count entities per label, keeping first-seen label order."""
    summary = {}
    for entity in entities:
        summary[entity['label']] = summary.get(entity['label'], 0) + 1
    return summary


def extract_entities_arabic(text: str) -> Dict:
    """Extract named entities from *text*, preferring CAMeL Tools NER.

    The text is split on whitespace and tagged with the pretrained CAMeL
    Tools recognizer. Consecutive tokens are merged into one entity following
    the BIO scheme: a ``B-`` tag or a change of label starts a new entity,
    and the ``B-``/``I-`` prefix is stripped from the reported label. If
    CAMeL Tools cannot be imported, loaded, or run, a small heuristic is used
    instead (phone-like numbers and a mention of Cairo).

    Args:
        text: Input text. Empty or ``None`` input yields an empty result.

    Returns:
        A dict shaped like
        ``{'entities': [{'text': ..., 'label': ...}, ...], 'summary': {...}}``
        where ``summary`` maps each label to the number of entities with it.
    """
    if not text:
        return {'entities': [], 'summary': {}}

    try:
        tokens = text.split()
        tags = _get_ner_recognizer().predict_sentence(tokens)
    except Exception:
        # Deliberate best-effort: a missing package, missing model data, or a
        # runtime failure in the recognizer all degrade to the heuristics.
        entities = _heuristic_entities(text)
    else:
        # Grouping runs outside the try so a bug in it is not silently masked
        # by the fallback path.
        entities = _group_bio_entities(tokens, tags)

    return {'entities': entities, 'summary': _summarize_labels(entities)}
|
|
|
|
# Hugging Face model identifier of the AraBERT checkpoint used for embeddings.
_ARABERT_MODEL_NAME = 'aubmindlab/bert-base-arabertv02'

# Holds the (tokenizer, model) tuple under the key 'pair' once both have
# loaded, so later calls skip the expensive from_pretrained step. Failed
# loads are not cached; the next call tries again.
_ARABERT_CACHE = {}


def arabert_embedding_stub(text: str):
    """Return a mean-pooled AraBERT embedding for *text*, or None.

    The tokenizer and model are loaded on the first successful call and
    reused afterwards. The input is truncated to 512 tokens, and the
    embedding is the mean of the last hidden state over the token axis.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a plain Python list, or ``None`` when
        ``transformers``/``torch`` are missing, the model cannot be loaded,
        or inference fails. This function never raises.
    """
    try:
        from transformers import AutoTokenizer, AutoModel
        import torch

        pair = _ARABERT_CACHE.get('pair')
        if pair is None:
            pair = (
                AutoTokenizer.from_pretrained(_ARABERT_MODEL_NAME),
                AutoModel.from_pretrained(_ARABERT_MODEL_NAME),
            )
            # Stored as one tuple so the cache never holds a tokenizer
            # without its matching model.
            _ARABERT_CACHE['pair'] = pair
        tokenizer, model = pair

        inputs = tokenizer(text, return_tensors='pt', truncation=True,
                           max_length=512)
        with torch.no_grad():
            output = model(**inputs)

        # Average over the token axis, then drop the batch axis of size 1.
        return output.last_hidden_state.mean(dim=1).squeeze().tolist()
    except Exception:
        # Deliberate best-effort contract: any failure yields None.
        return None
|
|