Spaces:

Alinabil1
/

last_edit

Sleeping

last_edit / src /arabic_tools.py

Moharek

Deploy Moharek GEO Platform

a74b879 about 1 month ago

2.69 kB

	import re
	from typing import List, Dict

	# Matches one character from any of these Unicode blocks:
	#   U+0600-U+06FF  Arabic
	#   U+0750-U+077F  Arabic Supplement
	#   U+08A0-U+08FF  Arabic Extended-A
	# The Arabic Presentation Forms blocks (U+FB50-U+FDFF, U+FE70-U+FEFF) are
	# not part of the pattern, so text made only of those code points does not
	# match.
	ARABIC_CHAR_RE = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")


	def is_arabic_text(text: str) -> bool:
	    """Tell whether ``text`` holds at least one Arabic-script character.

	    Falsy input (``None`` or the empty string) yields ``False``; otherwise the
	    result says whether ``ARABIC_CHAR_RE`` matches anywhere in ``text``.
	    """
	    return bool(text) and ARABIC_CHAR_RE.search(text) is not None


	# Phone-like runs: optional '+', then digits separated by spaces or dashes.
	_PHONE_RE = re.compile(r"\+?\d[\d\s\-]{6,}\d")

	# CAMeL Tools recognizer, created on first use and then shared so the
	# pretrained model is not reloaded for every call.
	_CAMEL_NER = None


	def _get_camel_ner():
	    """Return the shared CAMeL Tools ``NERecognizer``, loading it on first use."""
	    global _CAMEL_NER
	    if _CAMEL_NER is None:
	        # Imported lazily: camel_tools is an optional dependency.
	        from camel_tools.ner import NERecognizer
	        _CAMEL_NER = NERecognizer.pretrained()
	    return _CAMEL_NER


	def _merge_bio_tags(tokens: List[str], tags: List[str]) -> List[Dict[str, str]]:
	    """Group per-token BIO tags (``B-LOC``, ``I-LOC``, ``O``) into entities."""
	    entities = []
	    current = None
	    for token, tag in zip(tokens, tags):
	        if tag == 'O':
	            current = None
	            continue
	        prefix, sep, label = tag.partition('-')
	        if not sep:
	            # Tag without a BIO prefix: the whole tag is the label.
	            prefix, label = '', tag
	        # A 'B-' tag or a change of label always opens a new entity, so two
	        # adjacent entities are never fused into one.
	        if current is not None and prefix != 'B' and current['label'] == label:
	            current['text'] += ' ' + token
	        else:
	            current = {'text': token, 'label': label}
	            entities.append(current)
	    return entities


	def _heuristic_entities(text: str) -> List[Dict[str, str]]:
	    """Find phone-like numbers and mentions of Cairo without any NER model."""
	    entities = [{'text': phone, 'label': 'PHONE'} for phone in _PHONE_RE.findall(text)]
	    # Crude location capture; the Arabic spelling is reported as 'Cairo' too.
	    if 'القاهرة' in text or 'Cairo' in text:
	        entities.append({'text': 'Cairo', 'label': 'GPE'})
	    return entities


	def _summarize_labels(entities: List[Dict[str, str]]) -> Dict[str, int]:
	    """Count how many entities carry each label, in first-seen order."""
	    summary = {}
	    for entity in entities:
	        summary[entity['label']] = summary.get(entity['label'], 0) + 1
	    return summary


	def extract_entities_arabic(text: str) -> Dict:
	    """Extract named entities with CAMeL Tools if available, else basic heuristics.

	    The text is split on whitespace and tagged by the CAMeL Tools NER model.
	    Consecutive tokens of one entity are joined with a single space, and the
	    BIO prefix is stripped from the label (``B-LOC`` / ``I-LOC`` become
	    ``LOC``). If CAMeL Tools cannot be imported, loaded or run, the function
	    falls back to heuristics that report phone-like numbers (``PHONE``) and
	    mentions of Cairo (``GPE``).

	    Args:
	        text: Input text. ``None``, empty and whitespace-only input produce an
	            empty result.

	    Returns:
	        A dict similar to spaCy output:
	        ``{'entities': [{'text': ..., 'label': ...}, ...], 'summary': {label: count}}``.
	    """
	    tokens = text.split() if text else []
	    if not tokens:
	        return {'entities': [], 'summary': {}}
	    try:
	        tags = _get_camel_ner().predict_sentence(tokens)
	        entities = _merge_bio_tags(tokens, tags)
	    except Exception:
	        # Deliberately broad: a missing package, missing model data or a
	        # runtime failure in the tagger must all degrade to the heuristics.
	        entities = _heuristic_entities(text)
	    return {'entities': entities, 'summary': _summarize_labels(entities)}


	_ARABERT_MODEL_NAME = 'aubmindlab/bert-base-arabertv02'

	# (tokenizer, model) pair, filled in on the first successful load and then
	# reused so the weights are not reloaded for every call.
	_ARABERT_COMPONENTS = None


	def _load_arabert():
	    """Return the shared ``(tokenizer, model)`` pair, loading it on first use."""
	    global _ARABERT_COMPONENTS
	    if _ARABERT_COMPONENTS is None:
	        # Imported lazily: transformers is an optional dependency.
	        from transformers import AutoModel, AutoTokenizer
	        _ARABERT_COMPONENTS = (
	            AutoTokenizer.from_pretrained(_ARABERT_MODEL_NAME),
	            AutoModel.from_pretrained(_ARABERT_MODEL_NAME),
	        )
	    return _ARABERT_COMPONENTS


	def arabert_embedding_stub(text: str):
	    """Return a mean-pooled AraBERT embedding of ``text``, or ``None``.

	    This is a safe stub: if ``transformers`` or ``torch`` is missing, the
	    model cannot be loaded, or inference fails, the function returns ``None``
	    instead of raising.

	    Args:
	        text: Text to embed; the tokenizer truncates it to 512 tokens.

	    Returns:
	        The mean of the model's last hidden state over ``dim=1``, squeezed and
	        converted to a plain Python list, or ``None`` on any failure.
	    """
	    try:
	        # Resolve transformers first so a missing package fails before torch
	        # is imported.
	        tokenizer, model = _load_arabert()
	        import torch
	        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
	        with torch.no_grad():
	            output = model(**inputs)
	        # Mean pooling: one text is encoded without padding, so no attention
	        # mask is applied to the average.
	        return output.last_hidden_state.mean(dim=1).squeeze().tolist()
	    except Exception:
	        return None