import os

import joblib
import numpy as np
import tensorflow as tf
from huggingface_hub import snapshot_download
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForTokenClassification,
    pipeline,
)

# ---------------------------------------------------------------------------
# Module-level setup (side effects: network download + model initialization).
# Runs once at import time, matching the original script's behavior.
# ---------------------------------------------------------------------------
print("Downloading NER model from Hugging Face...")
repo_path = snapshot_download(repo_id="samithcs/nlp_ner", repo_type="model")
print(f"NER model downloaded to: {repo_path}")

NER_MODEL_PATH = os.path.join(repo_path, "nlp_ner", "ner_model")
NER_TOKENIZER_PATH = os.path.join(repo_path, "nlp_ner", "ner_tokenizer")
LABEL2ID_PATH = os.path.join(repo_path, "nlp_ner", "label2id.joblib")

ner_model = TFDistilBertForTokenClassification.from_pretrained(NER_MODEL_PATH)
ner_tokenizer = DistilBertTokenizerFast.from_pretrained(NER_TOKENIZER_PATH)
# label2id maps tag string -> class index; invert it for decoding predictions.
label2id = joblib.load(LABEL2ID_PATH)
id2label = {i: t for t, i in label2id.items()}

print("Loading Hugging Face NER pipeline...")
# Generic pretrained CoNLL-03 pipeline, used as a second source of LOC spans.
hf_ner = pipeline(
    "ner",
    grouped_entities=True,
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
)
print("NER models loaded successfully!")


def _collect_bio_spans(words: list, labels: list, entity_type: str) -> list:
    """Group BIO-tagged words into entity strings for one entity type.

    Args:
        words: whitespace-split words of the input text.
        labels: per-word tag strings (e.g. "B-LOC", "I-LOC", "O"),
            same length as ``words`` (or shorter; zip stops at the min).
        entity_type: tag suffix to collect, e.g. "LOC" or "EVENT".

    Returns:
        List of space-joined entity strings, in order of appearance.
    """
    b_tag, i_tag = f"B-{entity_type}", f"I-{entity_type}"
    spans, current = [], []
    for word, label in zip(words, labels):
        if label == b_tag:
            # A new entity starts; flush any span in progress.
            if current:
                spans.append(" ".join(current))
            current = [word]
        elif label == i_tag and current:
            # Continuation is only valid if a span is already open.
            current.append(word)
        else:
            if current:
                spans.append(" ".join(current))
            current = []
    if current:
        spans.append(" ".join(current))
    return spans


def extract_entities_pipeline(text: str) -> dict:
    """Extract location and event entities from ``text``.

    Runs the fine-tuned DistilBERT token classifier, decodes its BIO tags
    per word, then unions the locations with those found by the generic
    pretrained CoNLL-03 pipeline.

    Args:
        text: raw input sentence; split on whitespace into words.

    Returns:
        ``{"location": [...], "event": [...]}`` — locations are deduplicated
        via a set union (order not guaranteed); events keep textual order.
    """
    tokens = text.split()
    encoding = ner_tokenizer(
        [tokens],
        is_split_into_words=True,
        return_tensors="tf",
        padding="max_length",
        truncation=True,
        max_length=32,
    )
    # Drop "labels" if present so only model inputs are forwarded.
    outputs = ner_model({k: v for k, v in encoding.items() if k != "labels"})
    pred_ids = np.argmax(outputs.logits.numpy()[0], axis=-1)

    # BUG FIX: the original used pred_ids[:len(tokens)], pairing word i with
    # logit position i. That ignores the leading [CLS] token and any subword
    # splits, so every label was shifted relative to its word. Align via
    # word_ids(): take the prediction at each word's FIRST subtoken.
    word_labels = []
    seen_words = set()
    for position, word_idx in enumerate(encoding.word_ids(batch_index=0)):
        if word_idx is not None and word_idx not in seen_words:
            seen_words.add(word_idx)
            word_labels.append(id2label[int(pred_ids[position])])
    # Truncation (max_length=32) may drop trailing words; only the words that
    # received a label are decoded.
    words = tokens[: len(word_labels)]

    entities = {
        "location": _collect_bio_spans(words, word_labels, "LOC"),
        "event": _collect_bio_spans(words, word_labels, "EVENT"),
    }

    # Merge in locations found by the generic pretrained pipeline.
    hf_results = hf_ner(text)
    hf_locations = [
        ent["word"] for ent in hf_results if ent["entity_group"] == "LOC"
    ]
    entities["location"] = list(set(entities["location"]) | set(hf_locations))
    return entities