Commit · a19fbdf
Parent(s): 340b3eb

Remove print statements

app.py CHANGED
@@ -16,28 +16,13 @@ from XML_to_HTML import NER_XML_to_HTML
 from NER_Distiller import distill_entities
 
 app = FastAPI()
-print("Version 2...")
-
-# Download files from model repo
-# tag_vocab_path = hf_hub_download(
-#     repo_id="SinaLab/Nested",
-#     filename="tag_vocab.pkl"
-# )
-
 
 pretrained_path = "aubmindlab/bert-base-arabertv2" # must match training
 tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
 encoder = AutoModel.from_pretrained(pretrained_path).eval()
 
-# checkpoint_path = hf_hub_download(
-#     repo_id="SinaLab/Nested",
-#     filename="checkpoints/checkpoint_2.pt"
-# )
-
-
 
 checkpoint_path = snapshot_download(repo_id="SinaLab/Nested", allow_patterns="checkpoints/")
-print("checkpoint_path : ", checkpoint_path)
 
 args_path = hf_hub_download(
     repo_id="SinaLab/Nested",
@@ -47,8 +32,6 @@ args_path = hf_hub_download(
 with open(args_path, 'r') as f:
     args_data = json.load(f)
 
-print("data : ", args_data)
-
 # Load model
 with open("Nested/utils/tag_vocab.pkl", "rb") as f:
     label_vocab = pickle.load(f)
@@ -156,36 +139,26 @@ def extract(sentence):
 
 
 def NER(sentence, mode):
-    print("within NER, and mode is: ", mode)
     output_list = []
     xml = ""
     if mode.strip() == "1":
         output_list = jsons_to_list_of_lists(extract(sentence))
         return output_list
     elif mode.strip() == "2":
-        print("outputlist : ", output_list)
         if output_list != []:
             xml = IBO_to_XML(output_list)
-            print("xml is: ", xml)
             return xml
         else:
             output_list = jsons_to_list_of_lists(extract(sentence))
-            print("output_list .....: ", output_list)
             xml = IBO_to_XML(output_list)
-            print("else xml is: ", xml)
             return xml
 
     elif mode.strip() == "3":
-        print("mode is 3")
         if xml != "":
-            #print("in if")
             html = NER_XML_to_HTML(xml)
             return html
         else:
-            print("in else : ")
-            print("extract : ", extract(sentence))
             output_list = jsons_to_list_of_lists(extract(sentence))
-            #print("output list : ", output_list)
             xml = IBO_to_XML(output_list)
             html = NER_XML_to_HTML(xml)
             return html
@@ -228,57 +201,8 @@ def predict(request: NERRequest):
         "statusCode": 0,
     }
 
-    print("content: ", content)
     return JSONResponse(
         content=content,
         media_type="application/json",
         status_code=200,
     )
-
-
-# sentence = "ذهب احمد إلى جامعة"
-# Load tagger
-# tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
-
-# Convert text to a tagger dataset and index the tokens in args.text
-# dataset, token_vocab = text2segments(sentence)
-
-# vocabs = namedtuple("Vocab", ["tags", "tokens"])
-# vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
-
-# From the datasets generate the dataloaders
-# dataloader = get_dataloaders(
-#     (dataset,),
-#     vocab,
-#     args_data,
-#     batch_size=32,
-#     shuffle=(False,),
-# )[0]
-
-# Perform inference on the text and get back the tagged segments
-# segments = tagger.infer(dataloader)
-# segments_lists = []
-
-
-## Print results
-## for segment in segments:
-##     s = [
-##         f"{token.text} ({'|'.join([t['tag'] for t in token.pred_tag])})"
-##         for token in segment
-##     ]
-##     print(" ".join(s))
-
-
-# for segment in segments:
-#     for token in segment:
-#         segments_list = {}
-#         segments_list["token"] = token.text
-#         list_of_tags = [t['tag'] for t in token.pred_tag]
-#         list_of_tags = [i for i in list_of_tags if i not in('O',' ','')]
-#         if list_of_tags == []:
-#             segments_list["tags"] = ' '.join(['O'])
-#         else:
-#             segments_list["tags"] = ' '.join(list_of_tags)
-#     segments_lists.append(segments_list)
-
-# print(segments_lists)
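
Note: the prints removed here are debugging aids. If the same diagnostics are still wanted, the standard-library logging module keeps them switchable at runtime instead of deleting them. A minimal sketch (the logger name "ner_app" and the example messages are illustrative, not part of this repo):

import logging

# Configure once at startup; with uvicorn/FastAPI this typically lives
# near the app = FastAPI() line.
logging.basicConfig(level=logging.DEBUG,
                    format="%(levelname)s %(name)s: %(message)s")
logger = logging.getLogger("ner_app")

# Switchable equivalents of the removed print() calls: raise the level
# to INFO or WARNING in production and the debug lines go quiet
# without another commit like this one.
logger.debug("within NER, mode is: %s", "2")
logger.info("checkpoint_path: %s", "/local/snapshot/path")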
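
In the predict hunk, content is a plain dict, so the explicit JSONResponse mainly pins the media type and status code; FastAPI would serialize a returned dict to JSON on its own. A self-contained sketch of the pattern (the route path and request schema are placeholders, not taken from this repo):

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

app = FastAPI()

class NERRequest(BaseModel):
    # Placeholder schema; the real NERRequest is defined elsewhere in app.py.
    sentence: str
    mode: str

@app.post("/predict")  # hypothetical path; the actual route is not shown in the diff
def predict(request: NERRequest):
    content = {"statusCode": 0}
    # Equivalent to `return content`, but with status code and media type explicit.
    return JSONResponse(content=content, media_type="application/json", status_code=200)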
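One detail worth noting in the surviving download code: snapshot_download returns the local root directory of the cached snapshot, not a file path, and allow_patterns restricts which repo files are fetched; the removed print("checkpoint_path : ", ...) was presumably verifying that. A hedged sketch of how the checkpoint file would then be resolved (the filename comes from the commented-out hf_hub_download call this commit also deletes; whether it is the file actually loaded elsewhere in app.py is an assumption):

import os
from huggingface_hub import snapshot_download

# Returns the LOCAL ROOT of the downloaded snapshot; files keep their
# repo-relative layout underneath it, e.g.
# ~/.cache/huggingface/hub/models--SinaLab--Nested/snapshots/<rev>/...
snapshot_root = snapshot_download(repo_id="SinaLab/Nested",
                                  allow_patterns="checkpoints/")

# checkpoint_2.pt is the name used by the deleted hf_hub_download call.
ckpt_file = os.path.join(snapshot_root, "checkpoints", "checkpoint_2.pt")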