# Repository page header (scrape residue): piotrkan - "add app" - commit 8447d06
'''module for NLP related utilities'''
import os
import spacy
def extract_entity_context(doc, entity_text:str, span_length:int) -> tuple:
    """
    Function for extracting the context window around a specific entity
    Args:
        doc - spacy doc object containing entities
        entity_text - text of the entity for which context is sought
        span_length - the context window, specifying no. tokens kept before
                      & after the entity
    Outs:
        tuple (context, start_token, end_token):
            context - text of the context window in str format
            start_token - index of the first token of the window
            end_token - index one past the last token of the window
        When several entities share entity_text, the window of the last
        occurrence in doc.ents is returned. When no entity matches,
        (None, None, None) is returned.
    """
    matched_ent = None
    # loop through provided entities; keep overwriting so the last match wins
    for ent in doc.ents:
        if ent.text == entity_text:
            matched_ent = ent

    # nothing to slice when the entity is absent from the doc
    if matched_ent is None:
        return None, None, None

    # clamp the window to the document boundaries
    start_token = max(matched_ent.start - span_length, 0)
    end_token = min(matched_ent.end + span_length, len(doc))
    return doc[start_token:end_token].text, start_token, end_token
def install_ners(model:str):
    """
    Function for downloading and pip-installing ner models for scispacy.
    It's a hacky attempt to get these models installed when having the app
    deployed on huggingface spaces (can't install these models from
    requirements.txt as the docker image can't be built from urls)
    Args:
        model - name of the scispacy ner model to install; one of
                'en_ner_craft_md', 'en_ner_jnlpba_md', 'en_ner_bc5cdr_md',
                'en_ner_bionlp13cg_md'
    Outs:
        None; a RuntimeWarning is emitted when pip exits with a non-zero code
    Raises:
        ValueError - when model is not one of the supported model names
    """
    # function-scope imports: only this helper needs them
    import subprocess
    import sys
    import warnings

    supported_models = (
        'en_ner_craft_md',
        'en_ner_jnlpba_md',
        'en_ner_bc5cdr_md',
        'en_ner_bionlp13cg_md',
    )
    # validate before touching the network so unknown names never reach pip
    if model not in supported_models:
        raise ValueError('no such model')

    url = (
        'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/'
        f'v0.5.4/{model}-0.5.4.tar.gz'
    )
    # run pip through the current interpreter (no shell) so the package lands
    # in the same environment that will later import it
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', url],
        check=False,
    )
    # stay best-effort like the original, but do not hide a failed install
    if result.returncode != 0:
        warnings.warn(
            f'pip install of {model} failed with exit code {result.returncode}',
            RuntimeWarning,
        )
def extract_entities_with_context(model, text:str, span_length:int=10) -> list:
    """
    Function for extracting entities, with their context, from provided strings
    Args:
        model - name of the ner model from spacy used for entity extraction
        text - corpus of the pdf
        span_length - optional, the context window, specifying no. tokens before
                      and after entity keyword
    Outs:
        list of dictionaries, one per entity occurrence, with keys:
            'entity' - entity text
            'start_context' - index of the first token of the context window
            'end_context' - index one past the last token of the context window
            'context' - text of the context window
    """
    # load the ner model; if it is missing, install it and try once more
    try:
        nlp = spacy.load(model)
    except OSError:
        install_ners(model)
        nlp = spacy.load(model)
    doc = nlp(text)
    doc_length = len(doc)

    # extract entities and context
    entities_with_context = []
    for ent in doc.ents:
        # build the window from this occurrence's own position, so repeated
        # mentions of the same text each get their own context
        start = max(ent.start - span_length, 0)
        end = min(ent.end + span_length, doc_length)
        entities_with_context.append({
            'entity': ent.text, #entity
            'start_context': start, #start token
            'end_context': end, #end token
            # ent.label_ is left out on purpose: not needed in the requirements
            'context': doc[start:end].text, # specified context
        })
    print(entities_with_context)
    return entities_with_context