import os

import joblib
import numpy as np
import tensorflow as tf
from huggingface_hub import snapshot_download
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForTokenClassification,
    pipeline,
)

# ---------------------------------------------------------------------------
# Module-level setup (side effects: network download + model initialization).
# Runs once at import time, matching the original script's behavior.
# ---------------------------------------------------------------------------
print("Downloading NER model from Hugging Face...")
repo_path = snapshot_download(repo_id="samithcs/nlp_ner", repo_type="model")
print(f"NER model downloaded to: {repo_path}")

NER_MODEL_PATH = os.path.join(repo_path, "nlp_ner", "ner_model")
NER_TOKENIZER_PATH = os.path.join(repo_path, "nlp_ner", "ner_tokenizer")
LABEL2ID_PATH = os.path.join(repo_path, "nlp_ner", "label2id.joblib")

ner_model = TFDistilBertForTokenClassification.from_pretrained(NER_MODEL_PATH)
ner_tokenizer = DistilBertTokenizerFast.from_pretrained(NER_TOKENIZER_PATH)
# label2id maps tag string -> class index; invert it for decoding predictions.
label2id = joblib.load(LABEL2ID_PATH)
id2label = {i: t for t, i in label2id.items()}

print("Loading Hugging Face NER pipeline...")
# Generic pretrained CoNLL-03 pipeline, used as a second source of LOC spans.
hf_ner = pipeline(
    "ner",
    grouped_entities=True,
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
)
print("NER models loaded successfully!")


def _collect_bio_spans(words: list, labels: list, entity_type: str) -> list:
    """Group BIO-tagged words into entity strings for one entity type.

    Args:
        words: whitespace-split words of the input text.
        labels: per-word tag strings (e.g. "B-LOC", "I-LOC", "O"),
            same length as ``words`` (or shorter; zip stops at the min).
        entity_type: tag suffix to collect, e.g. "LOC" or "EVENT".

    Returns:
        List of space-joined entity strings, in order of appearance.
    """
    b_tag, i_tag = f"B-{entity_type}", f"I-{entity_type}"
    spans, current = [], []
    for word, label in zip(words, labels):
        if label == b_tag:
            # A new entity starts; flush any span in progress.
            if current:
                spans.append(" ".join(current))
            current = [word]
        elif label == i_tag and current:
            # Continuation is only valid if a span is already open.
            current.append(word)
        else:
            if current:
                spans.append(" ".join(current))
            current = []
    if current:
        spans.append(" ".join(current))
    return spans


def extract_entities_pipeline(text: str) -> dict:
    """Extract location and event entities from ``text``.

    Runs the fine-tuned DistilBERT token classifier, decodes its BIO tags
    per word, then unions the locations with those found by the generic
    pretrained CoNLL-03 pipeline.

    Args:
        text: raw input sentence; split on whitespace into words.

    Returns:
        ``{"location": [...], "event": [...]}`` — locations are deduplicated
        via a set union (order not guaranteed); events keep textual order.
    """
    tokens = text.split()
    encoding = ner_tokenizer(
        [tokens],
        is_split_into_words=True,
        return_tensors="tf",
        padding="max_length",
        truncation=True,
        max_length=32,
    )
    # Drop "labels" if present so only model inputs are forwarded.
    outputs = ner_model({k: v for k, v in encoding.items() if k != "labels"})
    pred_ids = np.argmax(outputs.logits.numpy()[0], axis=-1)

    # BUG FIX: the original used pred_ids[:len(tokens)], pairing word i with
    # logit position i. That ignores the leading [CLS] token and any subword
    # splits, so every label was shifted relative to its word. Align via
    # word_ids(): take the prediction at each word's FIRST subtoken.
    word_labels = []
    seen_words = set()
    for position, word_idx in enumerate(encoding.word_ids(batch_index=0)):
        if word_idx is not None and word_idx not in seen_words:
            seen_words.add(word_idx)
            word_labels.append(id2label[int(pred_ids[position])])
    # Truncation (max_length=32) may drop trailing words; only the words that
    # received a label are decoded.
    words = tokens[: len(word_labels)]

    entities = {
        "location": _collect_bio_spans(words, word_labels, "LOC"),
        "event": _collect_bio_spans(words, word_labels, "EVENT"),
    }

    # Merge in locations found by the generic pretrained pipeline.
    hf_results = hf_ner(text)
    hf_locations = [
        ent["word"] for ent in hf_results if ent["entity_group"] == "LOC"
    ]
    entities["location"] = list(set(entities["location"]) | set(hf_locations))
    return entities