Spaces:
Sleeping
Sleeping
| """ | |
| NER Engine — Named Entity Recognition using HuggingFace Transformers. | |
| Wraps the Nomio4640/ner-mongolian model by default; a local copy under adapters/ner_mongolian is preferred when present. | |
| """ | |
| from typing import List | |
| from .models import EntityResult | |
# HuggingFace Hub model ID used when no explicit model name is given and no
# local copy of the model is found.
HF_MODEL_ID = "Nomio4640/ner-mongolian"


class NEREngine:
    """Named Entity Recognition service using a HuggingFace pipeline.

    The underlying pipeline is created lazily on first use, so constructing
    an engine is cheap and does not import ``torch`` or ``transformers``.
    """

    def __init__(self, model_name: str = None):
        """Resolve which model to load; the model itself is not loaded here.

        Resolution order:
          1. ``model_name`` when it is truthy,
          2. the local ``adapters/ner_mongolian`` directory (one level above
             this file's directory) when it contains ``model.safetensors``,
          3. the Hub model ``HF_MODEL_ID``.

        Args:
            model_name: Optional model identifier or path overriding the
                default resolution.
        """
        import os

        # Use local model if it exists, otherwise fall back to HuggingFace Hub
        local_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "adapters",
            "ner_mongolian",
        )
        if model_name:
            self.model_name = model_name
        elif os.path.exists(os.path.join(local_path, "model.safetensors")):
            self.model_name = local_path
        else:
            self.model_name = HF_MODEL_ID
        self._pipeline = None

    def _load_pipeline(self):
        """Lazy-load the NER pipeline (heavy model, load only when needed).

        Returns:
            The cached pipeline object, created on the first call.
        """
        if self._pipeline is None:
            # Imported here so the heavy dependencies are only paid for when
            # inference is actually requested.
            import torch
            from transformers import pipeline

            # Device index 0 when CUDA is available, -1 (CPU) otherwise.
            device = 0 if torch.cuda.is_available() else -1
            self._pipeline = pipeline(
                "ner",
                model=self.model_name,
                aggregation_strategy="simple",
                device=device,
            )
            print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
        return self._pipeline

    def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
        """Merge subword tokens (## prefixed) back into the preceding entity.

        A piece whose ``word`` starts with ``##`` is appended to the previous
        entity's ``word`` (only the leading ``##`` is removed) and the
        previous entity's ``end`` is extended to the piece's ``end`` so the
        span keeps covering the merged text. All other fields of the previous
        entity are left as they were. A ``##`` piece with no predecessor is
        kept unchanged.

        Args:
            raw_entities: Entity dicts as produced by the pipeline.

        Returns:
            A new list of shallow-copied dicts; the input is not mutated.
        """
        cleaned: List[dict] = []
        for ent in raw_entities:
            word = ent.get("word", "")
            if word.startswith("##") and cleaned:
                previous = cleaned[-1]
                # Strip only the prefix marker; a "##" later in the piece is
                # literal text and must be preserved.
                previous["word"] = previous.get("word", "") + word[2:]
                if ent.get("end") is not None:
                    previous["end"] = ent["end"]
            else:
                cleaned.append(dict(ent))
        return cleaned

    def _to_results(self, raw_entities: List[dict]) -> "List[EntityResult]":
        """Clean raw pipeline entities and convert them to EntityResult objects."""
        return [
            EntityResult(
                word=ent.get("word", ""),
                entity_group=ent.get("entity_group", "MISC"),
                score=float(ent.get("score", 0.0)),
                start=int(ent.get("start", 0)),
                end=int(ent.get("end", 0)),
            )
            for ent in self._clean_entities(raw_entities)
        ]

    def recognize(self, text: str) -> "List[EntityResult]":
        """Run NER on a single text and return cleaned entities.

        Args:
            text: Input text. Empty or whitespace-only text yields ``[]``
                without loading the model.

        Returns:
            The recognized entities, or ``[]`` when inference raises (the
            error is printed rather than propagated).
        """
        if not text or not text.strip():
            return []
        pipe = self._load_pipeline()
        try:
            raw = pipe(text)
        except Exception as exc:
            # Best-effort: report the failure instead of hiding it, but keep
            # returning an empty result so callers are not broken.
            print(f"[NEREngine] Inference error: {exc}")
            return []
        return self._to_results(raw)

    def recognize_batch(
        self, texts: List[str], batch_size: int = 16
    ) -> "List[List[EntityResult]]":
        """Run NER on a batch of texts using HuggingFace pipeline batching.

        Args:
            texts: Input texts. Empty or whitespace-only entries are not sent
                to the pipeline and yield ``[]`` at their position.
            batch_size: Batch size forwarded to the pipeline call.

        Returns:
            One list of entities per input text, in input order. If the
            batched call or the conversion of its output raises, the error is
            printed and every non-empty text is processed individually via
            ``recognize``.
        """
        if not texts:
            return []

        # Preallocate empty results for all texts
        out: List[list] = [[] for _ in texts]

        # Filter empty texts to avoid pipeline errors
        valid = [(i, text) for i, text in enumerate(texts) if text and text.strip()]
        if not valid:
            return out

        valid_texts = [text for _, text in valid]
        pipe = self._load_pipeline()
        try:
            # Send batch directly to pipeline
            raw_results = pipe(valid_texts, batch_size=batch_size)
            for (idx, _), raw in zip(valid, raw_results):
                out[idx] = self._to_results(raw)
        except Exception as e:
            print(f"[NEREngine] Batch processing error: {e}")
            # Fallback to single text processing if pipeline batch fails
            for idx, text in valid:
                out[idx] = self.recognize(text)
        return out