relation-api-v1

Runtime error

App Files Files Community

alaajabari committed 19 days ago

Commit

56abf08

verified ·

1 Parent(s): c8d7c34

Create ner.py

Browse files

Files changed (1) hide show

ner.py +106 -0

ner.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# ner.py
+import json
+import pickle
+from collections import namedtuple
+from huggingface_hub import hf_hub_download, snapshot_download
+from Nested.utils.helpers import load_checkpoint
+from Nested.utils.data import get_dataloaders, text2segments
+from NER_Distiller import distill_entities
+# =============================
+# Load model ONCE (important)
+# =============================
+# Everything in this section is module-level code, so it runs when the module
+# is first imported: the downloads and the checkpoint load happen once per
+# process, not once per call to extract().
+# Fetch the files matching "checkpoints/" from the SinaLab/Nested Hub repo;
+# the returned path is what load_checkpoint() receives below.
+checkpoint_path = snapshot_download(
+    repo_id="SinaLab/Nested",
+    allow_patterns="checkpoints/"
+)
+# Fetch args.json from the same repo and parse it; extract() forwards the
+# parsed object to get_dataloaders().
+args_path = hf_hub_download(
+    repo_id="SinaLab/Nested",
+    filename="args.json"
+)
+with open(args_path, "r") as f:
+    args_data = json.load(f)
+# load vocab
+# NOTE(review): the path is relative to the current working directory, and
+# pickle.load can execute code embedded in the file -- presumably fine because
+# tag_vocab.pkl ships with the project; confirm it never comes from outside.
+with open("Nested/utils/tag_vocab.pkl", "rb") as f:
+    label_vocab = pickle.load(f)
+# Keep only the first element of the unpickled object. label_vocab is not
+# referenced again in the visible part of this file.
+label_vocab = label_vocab[0]
+# =============================
+# Load tagger ONCE
+# =============================
+# tagger and tag_vocab are consumed by extract(); train_config is not used in
+# the visible part of this file.
+tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
+# =============================
+# Core NER extraction (your logic preserved)
+# =============================
+def extract(sentence: str):
+    dataset, token_vocab = text2segments(sentence)
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
+    dataloader = get_dataloaders(
+        (dataset,),
+        vocab,
+        args_data,
+        batch_size=32,
+        shuffle=(False,),
+    )[0]
+    segments = tagger.infer(dataloader)
+    lists = []
+    for segment in segments:
+        for token in segment:
+            tags = [t["tag"] for t in token.pred_tag]
+            tags = [t for t in tags if t not in ("O", " ", "")]
+            lists.append({
+                "token": token.text,
+                "tags": " ".join(tags) if tags else "O"
+            })
+    return lists
+# =============================
+# convert format for distiller
+# =============================
+def _to_list_of_lists(json_list):
+    return [[d["token"], d["tags"]] for d in json_list]
+# =============================
+# FINAL FUNCTION USED BY RE
+# =============================
+def entities_and_types(sentence: str):
+    """
+    Returns:
+        dict: {entity_text: entity_type}
+    """
+    ner_output = extract(sentence)
+    converted = _to_list_of_lists(ner_output)
+    entities = distill_entities(converted)
+    entity_dict = {}
+    for item in entities:
+        # item format: [text, type, start, end]
+        if len(item) >= 2:
+            entity_text = item[0].strip()
+            entity_type = item[1]
+            entity_dict[entity_text] = entity_type
+    return entity_dict