relation-api

Running

App Files Files Community

alaajabari committed 27 days ago

Commit

505f3f4

verified ·

1 Parent(s): 46171ae

Update app.py

Browse files

Files changed (1) hide show

app.py +286 -128

app.py CHANGED Viewed

@@ -1,121 +1,128 @@
-from fastapi import FastAPI
-import torch
 import pickle
 from huggingface_hub import hf_hub_download, snapshot_download
 from Nested.nn.BertSeqTagger import BertSeqTagger
-from transformers import AutoTokenizer, AutoModel
-import inspect
-from collections import namedtuple
 from Nested.utils.helpers import load_checkpoint
 from Nested.utils.data import get_dataloaders, text2segments
-import json
-from pydantic import BaseModel
-from fastapi.responses import JSONResponse
 from IBO_to_XML import IBO_to_XML
 from XML_to_HTML import NER_XML_to_HTML
 from NER_Distiller import distill_entities
 app = FastAPI()
-pretrained_path = "aubmindlab/bert-base-arabertv2"  # must match training
 tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
 encoder = AutoModel.from_pretrained(pretrained_path).eval()
-checkpoint_path = snapshot_download(repo_id="SinaLab/Nested", allow_patterns="checkpoints/")
 args_path = hf_hub_download(
     repo_id="SinaLab/Nested",
     filename="args.json"
 )
-with open(args_path, 'r') as f:
     args_data = json.load(f)
-# Load model
 with open("Nested/utils/tag_vocab.pkl", "rb") as f:
     label_vocab = pickle.load(f)
-label_vocab = label_vocab[0]  # the list loaded from pickle
 id2label = {i: s for i, s in enumerate(label_vocab.itos)}
 def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
-    # Split the text into words
     words = sentence.split()
-    # Initialize variables
     groups = []
     current_group = ""
     group_size = 0
-    # Iterate through the words
     for word in words:
         if group_size < max_words_per_sentence - 1:
-            if len(current_group) == 0:
-                current_group = word
-            else:
-                current_group += " " + word
             group_size += 1
         else:
             current_group += " " + word
             groups.append(current_group)
             current_group = ""
             group_size = 0
-    # Add the last group if it contains less than n words
     if current_group:
         groups.append(current_group)
-    return groups
 def remove_empty_values(sentences):
-    return [value for value in sentences if value != '']
-def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
-    separators = []
-    split_text = [text]
-    if new_line==True:
-        separators.append('\n')
-    if dot==True:
-        separators.append('.')
-    if question_mark==True:
-        separators.append('?')
-        separators.append('؟')
-    if exclamation_mark==True:
-        separators.append('!')
-    for sep in separators:
-        new_split_text = []
-        for part in split_text:
-            tokens = part.split(sep)
-            tokens_with_separator = [token + sep for token in tokens[:-1]]
-            tokens_with_separator.append(tokens[-1].strip())
-            new_split_text.extend(tokens_with_separator)
-        split_text = new_split_text
-    split_text = remove_empty_values(split_text)
     return split_text
 def jsons_to_list_of_lists(json_list):
-    return [[d['token'], d['tags']] for d in json_list]
-tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
 def extract(sentence):
     dataset, token_vocab = text2segments(sentence)
-    vocabs = namedtuple("Vocab", ["tags", "tokens"])
-    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
     dataloader = get_dataloaders(
         (dataset,),
         vocab,
         args_data,
         batch_size=32,
-        shuffle=(False,),
     )[0]
     segments = tagger.infer(dataloader)
@@ -124,95 +131,246 @@ def extract(sentence):
     for segment in segments:
         for token in segment:
-            item = {}
-            item["token"] = token.text
-            list_of_tags = [t["tag"] for t in token.pred_tag]
-            list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
-            if not list_of_tags:
-                item["tags"] = "O"
-            else:
-                item["tags"] = " ".join(list_of_tags)
             lists.append(item)
     return lists
-def NER(sentence, mode):
-    output_list = []
-    xml = ""
-    if mode.strip() == "1":
-        output_list = jsons_to_list_of_lists(extract(sentence))
-        return output_list
-    elif mode.strip() == "2":
-        if output_list != []:
-            xml = IBO_to_XML(output_list)
-            return xml
-        else:
-            output_list = jsons_to_list_of_lists(extract(sentence))
-            xml = IBO_to_XML(output_list)
-            return xml
-    elif mode.strip() == "3":
-        if xml != "":
-            html = NER_XML_to_HTML(xml)
-            return html
-        else:
-            output_list = jsons_to_list_of_lists(extract(sentence))
-            xml = IBO_to_XML(output_list)
-            html = NER_XML_to_HTML(xml)
-            return html
-    elif mode.strip() == "4": # json short
-        if output_list != []:
-            json_short = distill_entities(output_list)
-            return json_short
-        else:
-            output_list = jsons_to_list_of_lists(extract(sentence))
-            json_short = distill_entities(output_list)
-            return json_short
 class NERRequest(BaseModel):
     text: str
-    mode: str
 @app.post("/predict")
 def predict(request: NERRequest):
-    # Load tagger
-    text = request.text
-    mode = request.mode
-    sentences = sentence_tokenizer(
-        text, dot=False, new_line=True, question_mark=False, exclamation_mark=False
-    )
-    lists = []
     for sentence in sentences:
-        se = split_text_into_groups_of_Ns(sentence, max_words_per_sentence=300)
-        for s in se:
-            output_list = NER(s, mode)
-            lists.append(output_list)
-    content = {
-        "resp": lists,
         "statusText": "OK",
-        "statusCode": 0,
-    }
-    return JSONResponse(
-        content=content,
-        media_type="application/json",
-        status_code=200,
-    )
-from fastapi.staticfiles import StaticFiles
-from fastapi.responses import FileResponse
-# mount frontend
-app.mount("/static", StaticFiles(directory="static"), name="static")
-@app.get("/")
-def home():
-    return FileResponse("static/index.html")

+import os
+import json
 import pickle
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from itertools import permutations
+from collections import defaultdict
+from pydantic import BaseModel
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse, FileResponse
+from fastapi.staticfiles import StaticFiles
 from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import (
+    AutoTokenizer,
+    AutoModel,
+    BertModel,
+    PreTrainedTokenizerFast
+)
 from Nested.nn.BertSeqTagger import BertSeqTagger
 from Nested.utils.helpers import load_checkpoint
 from Nested.utils.data import get_dataloaders, text2segments
 from IBO_to_XML import IBO_to_XML
 from XML_to_HTML import NER_XML_to_HTML
 from NER_Distiller import distill_entities
+# =========================
+# App
+# =========================
 app = FastAPI()
+# mount frontend
+app.mount("/static", StaticFiles(directory="static"), name="static")
@app.get("/")
def home():
    # Root route: answer with the front-end page kept under static/.
    index_page = "static/index.html"
    return FileResponse(index_page)
+# =========================
+# NER MODEL (your working one)
+# =========================
+pretrained_path = "aubmindlab/bert-base-arabertv2"
 tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
 encoder = AutoModel.from_pretrained(pretrained_path).eval()
+checkpoint_path = snapshot_download(
+    repo_id="SinaLab/Nested",
+    allow_patterns="checkpoints/"
+)
 args_path = hf_hub_download(
     repo_id="SinaLab/Nested",
     filename="args.json"
 )
+with open(args_path, "r") as f:
     args_data = json.load(f)
 with open("Nested/utils/tag_vocab.pkl", "rb") as f:
     label_vocab = pickle.load(f)
+label_vocab = label_vocab[0]
 id2label = {i: s for i, s in enumerate(label_vocab.itos)}
+tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
+# =========================
+# Helpers (NER)
+# =========================
 def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
     words = sentence.split()
     groups = []
     current_group = ""
     group_size = 0
     for word in words:
         if group_size < max_words_per_sentence - 1:
+            current_group = word if current_group == "" else current_group + " " + word
             group_size += 1
         else:
             current_group += " " + word
             groups.append(current_group)
             current_group = ""
             group_size = 0
     if current_group:
         groups.append(current_group)
+    return groups
 def remove_empty_values(sentences):
+    return [v for v in sentences if v != ""]
def sentence_tokenizer(text):
    """Split ``text`` on "." and discard the empty pieces.

    The periods themselves are dropped and the remaining pieces are returned
    as-is (they are not stripped).
    """
    pieces = text.split(".")
    return remove_empty_values(pieces)
 def jsons_to_list_of_lists(json_list):
+    return [[d["token"], d["tags"]] for d in json_list]
 def extract(sentence):
     dataset, token_vocab = text2segments(sentence)
+    vocab = type("Vocab", (), {})(
+        tokens=token_vocab,
+        tags=tag_vocab
+    )
     dataloader = get_dataloaders(
         (dataset,),
         vocab,
         args_data,
         batch_size=32,
+        shuffle=(False,)
     )[0]
     segments = tagger.infer(dataloader)
     for segment in segments:
         for token in segment:
+            item = {"token": token.text}
+            tags = [t["tag"] for t in token.pred_tag]
+            tags = [i for i in tags if i not in ("O", " ", "")]
+            item["tags"] = "O" if not tags else " ".join(tags)
             lists.append(item)
     return lists
+# =========================
+# NER distillation (your logic)
+# =========================
def distill_entities(entities):
    """Collapse per-token B-/I-/O tags into entity spans.

    Args:
        entities: Sequence of dicts with the keys ``"token"`` (token text)
            and ``"tags"`` (space-separated tags such as ``"B-ORG B-GPE"``,
            ``"I-ORG"`` or ``"O"``). The position of a tag inside the tag
            string is used as its nesting level.

    Returns:
        A list of ``[text, entity_type, start, end]`` lists, where ``text``
        is the entity's tokens each followed by one space (so it ends with a
        trailing space), and ``start`` / ``end`` are the 0-based positions of
        the first and last token. Spans appear in the order they are closed:
        by a new ``B-`` tag at the same level, by an ``O`` tag, or at the end
        of the input.
    """
    open_spans = [["", "", 0, 0]]
    finished = []

    def _close(level):
        # Move the span at `level` to the output (if one is open) and reset
        # the slot. Without this, a later B- tag would overwrite the span and
        # the earlier entity would be lost.
        span = open_spans[level]
        if span[1] != "":
            finished.append(span)
        open_spans[level] = ["", "", 0, 0]

    for word_position, entity in enumerate(sortTags(entities)):
        token = entity["token"]
        for level, tag in enumerate(entity["tags"].split()):
            if level >= len(open_spans):
                open_spans.append(["", "", 0, 0])

            if tag == "O":
                # A token outside every entity ends all open spans.
                for open_level in range(len(open_spans)):
                    _close(open_level)
            elif tag.startswith("B-"):
                _close(level)
                open_spans[level] = [
                    token + " ", tag[2:], word_position, word_position
                ]
            elif tag.startswith("I-"):
                # Continue the first open span, at this level or deeper, that
                # has the same entity type.
                for span in open_spans[level:]:
                    if span[1] == tag[2:]:
                        span[0] += token + " "
                        span[3] = word_position
                        break

    for level in range(len(open_spans)):
        _close(level)
    return finished
def sortTags(entities):
    """Return ``entities`` unchanged; no reordering is currently applied."""
    return entities
def entities_and_types(sentence):
    """Map each entity found in ``sentence`` to its entity type.

    Runs ``extract`` and ``distill_entities`` and returns a dict keyed by the
    stripped entity text; when the same text occurs more than once, the type
    of the last occurrence is kept.
    """
    spans = distill_entities(extract(sentence))
    return {name.strip(): entity_type for name, entity_type, _, _ in spans}
+# =========================
+# Relation Model
+# =========================
+repo_id_rel = "aaljabari/arabic-relation-extraction-v1"
+relation_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_file=hf_hub_download(repo_id_rel, "tokenizer.json")
+)
+weights_path = hf_hub_download(repo_id_rel, "pytorch_model.bin")
+with open(hf_hub_download(repo_id_rel, "tag_vocab.pkl"), "rb") as f:
+    vocab = pickle.load(f)
+rel2id = vocab["rel2id"]
+id2rel = vocab["id2rel"]
class BertRE(nn.Module):
    """BERT encoder with a linear relation classifier on two marker states.

    The hidden states found at the subject and object positions are
    concatenated, passed through dropout, and projected to ``num_labels``
    logits.
    """

    def __init__(self, num_labels):
        """Load the encoder from ``repo_id_rel`` and build the classifier head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(repo_id_rel)
        config = self.bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Input width is doubled because two hidden states are concatenated.
        self.classifier = nn.Linear(config.hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask, sub_pos, obj_pos):
        """Return relation logits for each row of the batch.

        ``sub_pos`` and ``obj_pos`` are used as per-row token indices into
        the encoder's last hidden state.
        """
        token_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        rows = torch.arange(token_states.shape[0])
        subject_state = token_states[rows, sub_pos]
        object_state = token_states[rows, obj_pos]
        features = torch.cat([subject_state, object_state], dim=1)
        return self.classifier(self.dropout(features))
+model_re = BertRE(num_labels=len(rel2id))
+model_re.load_state_dict(torch.load(weights_path, map_location="cpu"))
+model_re.eval()
+# =========================
+# Relation utilities
+# =========================
+relation_lookup = defaultdict(lambda: defaultdict(list))
def insert_markers(sentence, ent1, ent2):
    """Wrap the subject and object mentions of ``sentence`` in marker tags.

    The first occurrence of ``ent1`` becomes ``[Sub] ent1 [/Sub]`` and the
    first occurrence of ``ent2`` that does not overlap it becomes
    ``[Obj] ent2 [/Obj]``. Both spans are located in the original sentence
    and spliced by index, so markers never land inside the other entity or
    inside marker text that was just inserted.

    Args:
        sentence: Original sentence text.
        ent1: Subject entity text.
        ent2: Object entity text.

    Returns:
        The marked sentence, or ``None`` when either entity is empty, is not
        present, or the object only occurs overlapping the subject span.
    """
    if not ent1 or not ent2:
        return None

    sub_start = sentence.find(ent1)
    if sub_start == -1:
        return None
    sub_end = sub_start + len(ent1)

    # Skip object occurrences that overlap the subject span (e.g. a nested
    # entity such as "New York" inside "New York City").
    obj_start = sentence.find(ent2)
    while (
        obj_start != -1
        and obj_start < sub_end
        and obj_start + len(ent2) > sub_start
    ):
        obj_start = sentence.find(ent2, obj_start + 1)
    if obj_start == -1:
        return None
    obj_end = obj_start + len(ent2)

    # Emit the two spans in left-to-right order.
    spans = sorted([(sub_start, sub_end, "Sub"), (obj_start, obj_end, "Obj")])
    pieces = []
    cursor = 0
    for start, end, label in spans:
        pieces.append(sentence[cursor:start])
        pieces.append(f"[{label}] {sentence[start:end]} [/{label}]")
        cursor = end
    pieces.append(sentence[cursor:])
    return "".join(pieces)
def encode(sentence):
    """Tokenize a marked sentence and locate its ``[Sub]`` / ``[Obj]`` tokens.

    The sentence is padded or truncated to 128 tokens and returned as
    PyTorch tensors.

    Returns:
        ``(input_ids, attention_mask, sub_pos, obj_pos)`` where the last two
        are the column indices at which the ``[Sub]`` and ``[Obj]`` token ids
        occur in ``input_ids`` (empty when the token is absent).
    """
    encoded = relation_tokenizer(
        sentence,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    def _columns_of(marker):
        # Column indices of every position holding this marker's token id.
        marker_id = relation_tokenizer.convert_tokens_to_ids(marker)
        return (input_ids == marker_id).nonzero(as_tuple=True)[1]

    sub_pos = _columns_of("[Sub]")
    obj_pos = _columns_of("[Obj]")
    return input_ids, attention_mask, sub_pos, obj_pos
def predict_relation(sentence):
    """Classify the relation between the marked subject and object.

    Args:
        sentence: Sentence already containing ``[Sub]`` / ``[Obj]`` markers.

    Returns:
        ``(relation, confidence)``: the label looked up in ``id2rel`` for the
        highest-probability class, and that class's softmax probability.
        When either marker token is missing from the encoded input (for
        example because it fell past the 128-token truncation limit),
        ``("no_relation", 0.0)`` is returned instead of running the model.
    """
    input_ids, mask, sub_pos, obj_pos = encode(sentence)

    # The model indexes exactly one position per batch row. With no position
    # the indexing fails, so report "nothing found" instead.
    # NOTE(review): "no_relation" is the null label relation_extractor checks
    # for; confirm it matches the label set stored in id2rel.
    if sub_pos.numel() == 0 or obj_pos.numel() == 0:
        return "no_relation", 0.0

    with torch.no_grad():
        # Keep only the first occurrence of each marker so the position
        # tensors match the batch size of one.
        logits = model_re(input_ids, mask, sub_pos[:1], obj_pos[:1])
        probs = F.softmax(logits, dim=-1)
    pred = torch.argmax(probs, dim=-1).item()
    conf = probs[0, pred].item()
    return id2rel[pred], conf
def relation_extractor(sentence):
    """Predict relations between every ordered pair of entities.

    Entities come from ``entities_and_types``; each ordered pair is marked in
    the sentence and classified with ``predict_relation``.

    Returns:
        A list of ``[subject, relation, object, confidence]`` entries for
        predictions other than ``"no_relation"`` whose confidence is above
        0.80. Pairs that cannot be marked are skipped.
    """
    entity_types = entities_and_types(sentence)
    triples = []
    for (ent1, _type1), (ent2, _type2) in permutations(entity_types.items(), 2):
        marked = insert_markers(sentence, ent1, ent2)
        if not marked:
            continue
        rel, conf = predict_relation(marked)
        if rel != "no_relation" and conf > 0.80:
            triples.append([ent1, rel, ent2, conf])
    return triples
+# =========================
+# API Models
+# =========================
 class NERRequest(BaseModel):
     text: str
+    mode: str = "1"
class RERequest(BaseModel):
    # Request body for POST /predict_re.
    # Raw text to extract entity relations from.
    text: str
+# =========================
+# NER endpoint
+# =========================
 @app.post("/predict")
 def predict(request: NERRequest):
+    text = request.text
+    mode = request.mode
+    sentences = sentence_tokenizer(text)
+    results = []
     for sentence in sentences:
+        chunks = split_text_into_groups_of_Ns(sentence, 300)
+        for c in chunks:
+            output_list = jsons_to_list_of_lists(extract(c))
+            results.append(output_list)
+    return JSONResponse({
+        "resp": results,
         "statusText": "OK",
+        "statusCode": 0
+    })
+# =========================
+# Relation endpoint
+# =========================
@app.post("/predict_re")
def predict_re(request: RERequest):
    # Relation endpoint: run relation extraction on the whole request text
    # and return the result list under "resp".
    payload = {
        "resp": relation_extractor(request.text),
        "statusText": "OK",
        "statusCode": 0,
    }
    return JSONResponse(payload)