last_edit / src /arabic_tools.py
Moharek
Deploy Moharek GEO Platform
a74b879
import re
from typing import List, Dict
# Matches a single character from any of the Unicode blocks listed below.
# The pieces are adjacent raw-string literals, so they concatenate into one
# character-class pattern at compile time.
ARABIC_CHAR_RE = re.compile(
    r"["
    r"\u0600-\u06FF"  # Arabic
    r"\u0750-\u077F"  # Arabic Supplement
    r"\u08A0-\u08FF"  # Arabic Extended-A
    r"]"
)
def is_arabic_text(text: str) -> bool:
    """Tell whether ``text`` has at least one character matched by ARABIC_CHAR_RE.

    Falsy input (empty string, ``None``) yields ``False`` without running
    the regex.
    """
    if text:
        # A single matching character anywhere in the string is enough.
        return ARABIC_CHAR_RE.search(text) is not None
    return False
def _load_camel_ner():
    """Return the CAMeL Tools NER model, loading it only on the first call.

    The loaded recognizer is cached on this function object so repeated
    extractions do not reload the pretrained model. Raises whatever the
    import or ``NERecognizer.pretrained()`` raises when CAMeL Tools or its
    data is unavailable; nothing is cached in that case.
    """
    ner = getattr(_load_camel_ner, '_ner', None)
    if ner is None:
        from camel_tools.ner import NERecognizer
        ner = NERecognizer.pretrained()
        _load_camel_ner._ner = ner
    return ner


def _group_bio_entities(tokens, tags) -> List[Dict]:
    """Merge per-token BIO tags into entity spans with prefix-free labels.

    A new entity starts on a ``B-`` tag, on a change of entity type, or
    after an ``O`` tag; an ``I-`` tag of the same type extends the current
    entity. The ``B-``/``I-`` prefix is stripped from the stored label.
    """
    entities = []
    current = None
    for token, tag in zip(tokens, tags):
        if tag == 'O':
            current = None
            continue
        prefix, sep, label = tag.partition('-')
        if not sep or prefix not in ('B', 'I'):
            # Tag carries no BIO prefix: use it verbatim as the label and
            # let it extend a preceding run with the same label.
            prefix, label = 'I', tag
        if current is not None and prefix == 'I' and current['label'] == label:
            current['text'] += ' ' + token
        else:
            current = {'text': token, 'label': label}
            entities.append(current)
    return entities


def _heuristic_entities(text: str) -> List[Dict]:
    """Fallback extraction: phone-like digit runs and a Cairo mention."""
    entities = [
        {'text': phone, 'label': 'PHONE'}
        for phone in re.findall(r"\+?\d[\d\s\-]{6,}\d", text)
    ]
    # Crude location capture: only Cairo (Arabic or English spelling) is
    # recognised, and it is always reported under the English name.
    if 'القاهرة' in text or 'Cairo' in text:
        entities.append({'text': 'Cairo', 'label': 'GPE'})
    return entities


def _summarize_entities(entities: List[Dict]) -> Dict:
    """Count entities per label."""
    summary = {}
    for entity in entities:
        summary[entity['label']] = summary.get(entity['label'], 0) + 1
    return summary


def extract_entities_arabic(text: str) -> Dict:
    """Extract named entities from ``text``.

    Uses the CAMeL Tools NER model when it can be imported and loaded;
    otherwise falls back to simple heuristics (phone-like numbers and a
    Cairo mention).

    Args:
        text: Input text. It is split on whitespace before tagging.

    Returns:
        A dict shaped like ``{'entities': [{'text': ..., 'label': ...}, ...],
        'summary': {label: count, ...}}``. On the CAMeL path, labels are
        entity types without the BIO prefix (e.g. ``'LOC'`` rather than
        ``'B-LOC'``), and adjacent entities of different types are kept
        separate. Empty or ``None`` input yields empty ``entities`` and
        ``summary``.
    """
    if not text:
        return {'entities': [], 'summary': {}}
    try:
        tokens = text.split()
        tags = _load_camel_ner().predict_sentence(tokens)
    except Exception:
        # Deliberate best effort: a missing library, missing model data or
        # a prediction failure all degrade to the heuristic extractor.
        entities = _heuristic_entities(text)
    else:
        entities = _group_bio_entities(tokens, tags)
    return {'entities': entities, 'summary': _summarize_entities(entities)}
def arabert_embedding_stub(text: str):
    """Return a mean-pooled AraBERT embedding for ``text``, or ``None``.

    The tokenizer and model for ``aubmindlab/bert-base-arabertv02`` are
    loaded via ``from_pretrained`` on every call. Input is truncated to 512
    tokens. Any failure (missing ``transformers``/``torch``, model loading
    error, inference error) is swallowed and reported as ``None``, so the
    function never raises.
    """
    checkpoint = 'aubmindlab/bert-base-arabertv02'
    try:
        from transformers import AutoTokenizer, AutoModel
        import torch

        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        encoder = AutoModel.from_pretrained(checkpoint)
        encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = encoder(**encoded).last_hidden_state
        # Mean pooling over dim=1 (presumably the token axis), then drop
        # size-1 dims and convert to plain Python lists.
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().tolist()
    except Exception:
        return None