'''module for NLP related utilities'''
import os
import spacy

def extract_entity_context(doc, entity_text: str, span_length: int) -> tuple:
    """
    Extract the surrounding token context for a specific entity in a doc.

    Args:
        doc: spaCy Doc object whose ``.ents`` holds the recognized entities.
        entity_text: entity string for which context is sought.
        span_length: context window — number of tokens kept before and
            after the entity span.

    Returns:
        A 3-tuple ``(context_text, start_token, end_token)`` where
        ``context_text`` is the context as a str and the token indices
        delimit it within ``doc``. If ``entity_text`` does not match any
        entity, returns ``(None, None, None)`` instead of raising
        AttributeError.
    """
    context_tokens = None
    start_token = None
    end_token = None
    # Scan every entity; if entity_text occurs more than once, the context
    # of the LAST occurrence wins (original behaviour, kept intentionally).
    for ent in doc.ents:
        if ent.text == entity_text:
            start_token = max(ent.start - span_length, 0)     # clamp at doc start
            end_token = min(ent.end + span_length, len(doc))  # clamp at doc end
            context_tokens = doc[start_token:end_token]
    # Guard: entity not present -> no crash on `.text` of None.
    if context_tokens is None:
        return None, None, None
    return context_tokens.text, start_token, end_token

def install_ners(model: str):
    """
    Download and pip-install a scispaCy NER model at runtime.

    Hacky workaround for Hugging Face Spaces deployments: these models are
    distributed as tarball URLs, which cannot be built into the Docker image
    from requirements.txt, so they are installed on demand.

    Args:
        model: name of the scispaCy NER model to install (one of
            'en_ner_craft_md', 'en_ner_jnlpba_md', 'en_ner_bc5cdr_md',
            'en_ner_bionlp13cg_md').

    Raises:
        ValueError: if ``model`` is not one of the known scispaCy models.
    """
    known_models = {
        'en_ner_craft_md',
        'en_ner_jnlpba_md',
        'en_ner_bc5cdr_md',
        'en_ner_bionlp13cg_md',
    }
    if model not in known_models:
        raise ValueError('no such model')
    # All release tarballs live under the same versioned S3 prefix, so one
    # formatted URL replaces the four copy-pasted pip commands.
    base_url = 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4'
    # NOTE(review): os.system + runtime pip is fragile, but kept because the
    # HF Spaces deployment relies on it.
    os.system(f'pip install {base_url}/{model}-0.5.4.tar.gz')


def extract_entities_with_context(model, text: str, span_length: int = 10) -> list:
    """
    Extract named entities from ``text`` together with surrounding context.

    Args:
        model: name of the spaCy/scispaCy NER model to use for extraction.
        text: corpus to analyse (e.g. text extracted from a PDF).
        span_length: optional context window — number of tokens kept before
            and after each entity.

    Returns:
        list of dicts with keys 'entity' (entity text), 'start_context' /
        'end_context' (token indices delimiting the context) and 'context'
        (the context text itself).
    """
    # Load the NER model; if it is not installed yet, fetch it on the fly.
    try:
        nlp = spacy.load(model)
    except OSError:
        install_ners(model)
        nlp = spacy.load(model)
    doc = nlp(text)

    # Compute each entity's context directly from its own span. This avoids
    # rescanning doc.ents per entity (previously O(n^2)) and fixes the bug
    # where repeated entity strings all received the context of the LAST
    # occurrence instead of their own.
    entities_with_context = []
    for ent in doc.ents:
        start = max(ent.start - span_length, 0)     # clamp at doc start
        end = min(ent.end + span_length, len(doc))  # clamp at doc end
        entities_with_context.append({
            'entity': ent.text,
            'start_context': start,
            'end_context': end,
            # 'label': ent.label_,  # not needed per the challenge requirements
            'context': doc[start:end].text,
        })
    return entities_with_context