"""
NER Engine — Named Entity Recognition using HuggingFace Transformers.
Wraps the Nomio4640/ner-mongolian model by default, preferring a local copy when one exists.
"""

from typing import List, Optional

from .models import EntityResult


# Hugging Face Hub model id used as the fallback when no model name is passed
# and no local copy of the model is found (see NEREngine.__init__).
HF_MODEL_ID = "Nomio4640/ner-mongolian"


class NEREngine:
    """Named Entity Recognition service using a HuggingFace pipeline.

    The model to use is resolved once at construction time; the pipeline
    itself is only built on first use because loading the model is expensive.

    Attributes:
        model_name: Model identifier or local directory passed to
            ``transformers.pipeline`` as ``model``.
    """

    def __init__(self, model_name: Optional[str] = None):
        """Resolve which model the engine will load.

        Resolution order:
            1. ``model_name`` when it is given (and non-empty).
            2. The ``adapters/ner_mongolian`` directory located one level
               above this file's directory, when it contains
               ``model.safetensors``.
            3. ``HF_MODEL_ID``.

        Args:
            model_name: Optional explicit model id or path.
        """
        import os

        if model_name:
            self.model_name = model_name
        else:
            # Use local model if it exists, otherwise fall back to HuggingFace Hub
            local_path = os.path.join(
                os.path.dirname(os.path.dirname(__file__)),
                "adapters",
                "ner_mongolian",
            )
            has_local = os.path.exists(os.path.join(local_path, "model.safetensors"))
            self.model_name = local_path if has_local else HF_MODEL_ID
        # Built lazily by _load_pipeline().
        self._pipeline = None

    def _load_pipeline(self):
        """Lazy-load the NER pipeline (heavy model, load only when needed).

        Returns:
            The cached pipeline object, created on the first call.
        """
        if self._pipeline is None:
            # Imported here so that importing this module stays cheap.
            import torch
            from transformers import pipeline

            # Device 0 is the first CUDA device; -1 selects the CPU.
            device = 0 if torch.cuda.is_available() else -1
            self._pipeline = pipeline(
                "ner",
                model=self.model_name,
                aggregation_strategy="simple",
                device=device,
            )
            print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
        return self._pipeline

    def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
        """Merge subword tokens (## prefixed) back into the preceding entity.

        A piece whose ``word`` starts with ``##`` is appended (without that
        prefix) to the previous entity, and the previous entity's ``end``
        offset is extended so the span still covers the merged text. The
        previous entity keeps its own ``entity_group``, ``score`` and
        ``start``. A ``##`` piece with no preceding entity is kept as-is.

        Args:
            raw_entities: Entity dicts as returned by the pipeline.

        Returns:
            A new list of shallow-copied dicts; the input dicts are not
            modified.
        """
        cleaned: List[dict] = []
        for ent in raw_entities:
            word = ent.get("word") or ""
            if word.startswith("##") and cleaned:
                previous = cleaned[-1]
                # Strip only the leading marker; a literal "##" later in the
                # piece is part of the text.
                previous["word"] += word[2:]
                if ent.get("end") is not None:
                    previous["end"] = ent["end"]
            else:
                cleaned.append(dict(ent))
        return cleaned

    def _to_results(self, raw_entities: List[dict]) -> List[EntityResult]:
        """Clean raw pipeline entities and convert them to EntityResult objects.

        Missing or ``None`` fields fall back to defaults (``""``, ``"MISC"``,
        ``0.0``, ``0``, ``0``) instead of raising, since ``start``/``end`` may
        be ``None`` when the tokenizer provides no offsets.
        """
        return [
            EntityResult(
                word=ent.get("word") or "",
                entity_group=ent.get("entity_group") or "MISC",
                score=float(ent.get("score") or 0.0),
                start=int(ent.get("start") or 0),
                end=int(ent.get("end") or 0),
            )
            for ent in self._clean_entities(raw_entities)
        ]

    def recognize(self, text: str) -> List[EntityResult]:
        """Run NER on a single text and return cleaned entities.

        Args:
            text: Input text. Empty or whitespace-only text yields ``[]``
                without loading the model.

        Returns:
            The recognized entities, or ``[]`` if the pipeline call fails
            (the error is printed, matching ``recognize_batch``).
        """
        if not text or not text.strip():
            return []
        pipe = self._load_pipeline()
        try:
            raw = pipe(text)
        except Exception as e:
            # Best-effort: report the failure instead of hiding it, but keep
            # returning an empty result so callers are not broken.
            print(f"[NEREngine] Processing error: {e}")
            return []
        return self._to_results(raw)

    def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
        """Run NER on a batch of texts utilizing Hugging Face pipeline batching.

        Args:
            texts: Input texts. Empty or whitespace-only entries are skipped
                and get an empty result.
            batch_size: Batch size forwarded to the pipeline call.

        Returns:
            One list of entities per input text, in the same order as
            ``texts``. If batched processing fails, each non-empty text is
            processed individually via ``recognize``.
        """
        if not texts:
            return []

        # Preallocate empty results for all texts
        out: List[List[EntityResult]] = [[] for _ in texts]

        # Filter empty texts to avoid pipeline errors, remembering positions
        valid = [(i, text) for i, text in enumerate(texts) if text and text.strip()]
        if not valid:
            return out

        valid_indices = [i for i, _ in valid]
        valid_texts = [text for _, text in valid]

        pipe = self._load_pipeline()
        try:
            # Send batch directly to pipeline
            raw_results = pipe(valid_texts, batch_size=batch_size)
            for idx, raw in zip(valid_indices, raw_results):
                out[idx] = self._to_results(raw)
        except Exception as e:
            print(f"[NEREngine] Batch processing error: {e}")
            # Fallback to single text processing if pipeline batch fails
            for idx, text in valid:
                out[idx] = self.recognize(text)

        return out