File size: 2,814 Bytes
1eaee2c
 
1339523
a647fb1
1eaee2c
1339523
1eaee2c
 
1339523
 
 
 
 
 
1eaee2c
 
1339523
 
 
 
 
 
 
1eaee2c
 
1339523
83d35cd
1eaee2c
 
1339523
83d35cd
 
 
 
 
a647fb1
1339523
 
a647fb1
1eaee2c
1339523
1eaee2c
83d35cd
a647fb1
 
 
 
 
83d35cd
a647fb1
1339523
83d35cd
 
1339523
83d35cd
1eaee2c
1339523
1eaee2c
 
1339523
1eaee2c
 
83d35cd
1eaee2c
 
 
 
 
83d35cd
1339523
 
1eaee2c
 
83d35cd
1eaee2c
 
 
 
 
83d35cd
1339523
 
a647fb1
1eaee2c
83d35cd
1eaee2c
83d35cd
1339523
 
1eaee2c
 
1339523
83d35cd
1339523
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification, pipeline
from huggingface_hub import snapshot_download
import numpy as np
import joblib
import os


# --- One-time setup: fetch model artifacts and build the two NER backends. ---
print("Downloading NER model from Hugging Face...")
repo_path = snapshot_download(
    repo_id="samithcs/nlp_ner",
    repo_type="model"
)
print(f"NER model downloaded to: {repo_path}")


# All artifacts live under a single "nlp_ner" folder inside the snapshot.
_artifact_root = os.path.join(repo_path, "nlp_ner")
NER_MODEL_PATH = os.path.join(_artifact_root, "ner_model")
NER_TOKENIZER_PATH = os.path.join(_artifact_root, "ner_tokenizer")
LABEL2ID_PATH = os.path.join(_artifact_root, "label2id.joblib")


# Fine-tuned DistilBERT token classifier and its matching tokenizer.
ner_model = TFDistilBertForTokenClassification.from_pretrained(NER_MODEL_PATH)
ner_tokenizer = DistilBertTokenizerFast.from_pretrained(NER_TOKENIZER_PATH)


# label2id maps tag name -> class index; invert it for decoding predictions.
label2id = joblib.load(LABEL2ID_PATH)
id2label = {index: tag for tag, index in label2id.items()}


# Generic pretrained CoNLL-03 NER pipeline, used to supplement locations.
print("Loading Hugging Face NER pipeline...")
hf_ner = pipeline(
    "ner",
    grouped_entities=True,
    model="dbmdz/bert-large-cased-finetuned-conll03-english"
)

print("NER models loaded successfully!")


def extract_entities_pipeline(text: str, max_length: int = 32) -> dict:
    """Extract location and event entities from *text*.

    Combines two sources: the fine-tuned DistilBERT token classifier
    (BIO tags for LOC and EVENT) and the generic Hugging Face CoNLL-03
    pipeline (locations only). Location results from both sources are
    merged via set union, so their order is not guaranteed.

    Args:
        text: Input text; split on whitespace into words.
        max_length: Tokenizer truncation length in sub-tokens
            (words beyond this limit receive no prediction).

    Returns:
        dict with keys "location" and "event", each a list of entity
        phrase strings.
    """
    words = text.split()
    encoding = ner_tokenizer(
        [words],
        is_split_into_words=True,
        return_tensors='tf',
        padding='max_length',
        truncation=True,
        max_length=max_length
    )

    outputs = ner_model({k: v for k, v in encoding.items() if k != "labels"})
    pred_ids = np.argmax(outputs.logits.numpy()[0], axis=-1)

    # BUGFIX: predictions are per sub-token, with [CLS] at index 0, so
    # zipping words against pred_ids[:len(words)] (the old behavior)
    # mis-aligned every label. Align properly via word_ids(): each word
    # takes the prediction of its FIRST sub-token.
    word_ids = encoding.word_ids(batch_index=0)
    word_label = {}
    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx not in word_label:
            word_label[word_idx] = id2label[int(pred_ids[token_idx])]

    def _collect_spans(tag: str) -> list:
        # Group B-<tag>/I-<tag> BIO labels into multi-word entity phrases.
        spans, current = [], []
        begin, inside = f"B-{tag}", f"I-{tag}"
        for i, word in enumerate(words):
            label = word_label.get(i, "O")  # truncated words default to "O"
            if label == begin:
                if current:
                    spans.append(" ".join(current))
                current = [word]
            elif label == inside and current:
                current.append(word)
            else:
                if current:
                    spans.append(" ".join(current))
                current = []
        if current:
            spans.append(" ".join(current))
        return spans

    entities = {
        "location": _collect_spans("LOC"),
        "event": _collect_spans("EVENT"),
    }

    # Supplement locations with the generic pretrained pipeline's findings.
    hf_results = hf_ner(text)
    hf_locations = [ent['word'] for ent in hf_results if ent['entity_group'] == "LOC"]

    entities["location"] = list(set(entities["location"]) | set(hf_locations))

    return entities