'''module for NLP related utilities'''
import os
import subprocess
import sys

import spacy

# pip-installable archives for the scispacy NER models (release v0.5.4).
# Kept in a table so install_ners() is data-driven instead of an if/elif chain.
_SCISPACY_MODEL_URLS = {
    'en_ner_craft_md': 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_craft_md-0.5.4.tar.gz',
    'en_ner_jnlpba_md': 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz',
    'en_ner_bc5cdr_md': 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz',
    'en_ner_bionlp13cg_md': 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz',
}


def extract_entity_context(doc, entity_text: str, span_length: int):
    """
    Extract a token-window context around a named entity.

    Scans ``doc.ents`` for entities whose surface text equals
    ``entity_text``; if the same text occurs more than once, the
    *last* occurrence wins (this preserves the original scan order).

    Args:
        doc - spacy Doc object containing entities
        entity_text - entity surface string for which context is sought
        span_length - the context window, specifying no. tokens before
                      & after the entity keyword

    Outs:
        tuple of (context string, start token index, end token index)

    Raises:
        ValueError - if ``entity_text`` does not match any entity in
                     ``doc`` (previously this crashed with an
                     AttributeError on ``None``).
    """
    context_tokens = None
    start_token = None
    end_token = None
    # loop through provided entities; last match overwrites earlier ones
    for ent in doc.ents:
        if ent.text == entity_text:
            start_token = max(ent.start - span_length, 0)      # index starting context, clamped to doc start
            end_token = min(ent.end + span_length, len(doc))   # index ending context, clamped to doc end
            context_tokens = doc[start_token:end_token]
    if context_tokens is None:
        # explicit failure instead of the original AttributeError on None
        raise ValueError(f'entity {entity_text!r} not found in doc')
    return context_tokens.text, start_token, end_token


def install_ners(model: str):
    """
    Install (via pip) a scispacy NER model at runtime.

    Hacky workaround for deploying on Hugging Face Spaces: these models
    cannot be listed in requirements.txt because the docker image cannot
    be built from URLs, so they are pip-installed on first use.

    Args:
        model - name of the scispacy NER model to install; must be one
                of the keys of ``_SCISPACY_MODEL_URLS``

    Raises:
        ValueError - if ``model`` is not a known scispacy model name
        subprocess.CalledProcessError - if the pip install fails
    """
    try:
        url = _SCISPACY_MODEL_URLS[model]
    except KeyError:
        raise ValueError('no such model') from None
    # sys.executable -m pip guarantees we install into the interpreter
    # actually running this code (os.system('pip ...') may target another
    # environment); check=True surfaces install failures instead of
    # silently ignoring the exit code.
    subprocess.run([sys.executable, '-m', 'pip', 'install', url], check=True)


def extract_entities_with_context(model, text: str, span_length: int = 10) -> list:
    """
    Extract entities and their surrounding context from a text.

    Args:
        model - name of the spacy/scispacy NER model to load
        text - corpus of the pdf
        span_length - optional, the context window, specifying no. tokens
                      before and after the entity keyword

    Outs:
        list of dictionaries containing entity, location and context
    """
    # load the ner model, installing it on demand if it is missing
    try:
        nlp = spacy.load(model)
    except OSError:
        install_ners(model)
        nlp = spacy.load(model)
    doc = nlp(text)

    # extract entities and context directly from each entity's own span.
    # (The previous implementation re-searched the doc by ent.text, so
    # repeated entity strings all reported the LAST occurrence's context
    # and the rescan made this O(n^2) in the number of entities.)
    entities_with_context = []
    for ent in doc.ents:
        start = max(ent.start - span_length, 0)      # start token of context window
        end = min(ent.end + span_length, len(doc))   # end token of context window
        entities_with_context.append({
            'entity': ent.text,        # entity surface text
            'start_context': start,    # start token index
            'end_context': end,        # end token index
            # 'label': ent.label_,     # label, not needed in the requirements to the challenge
            'context': doc[start:end].text,  # specified context window text
        })
    return entities_with_context