'''module for NLP related utilities'''
import os
import spacy

def extract_entity_context(doc, entity_text: str, span_length: int) -> tuple:
    """
    Extract the surrounding token context for a specific entity in a doc.

    Args:
        doc: spaCy Doc object whose ``.ents`` holds the recognized entities.
        entity_text: entity string for which context is sought.
        span_length: context window — number of tokens kept before and
            after the entity span.

    Returns:
        A 3-tuple ``(context_text, start_token, end_token)`` where
        ``context_text`` is the context as a str and the token indices
        delimit it within ``doc``. If ``entity_text`` does not match any
        entity, returns ``(None, None, None)`` instead of raising
        AttributeError.
    """
    context_tokens = None
    start_token = None
    end_token = None
    # Scan every entity; if entity_text occurs more than once, the context
    # of the LAST occurrence wins (original behaviour, kept intentionally).
    for ent in doc.ents:
        if ent.text == entity_text:
            start_token = max(ent.start - span_length, 0)     # clamp at doc start
            end_token = min(ent.end + span_length, len(doc))  # clamp at doc end
            context_tokens = doc[start_token:end_token]
    # Guard: entity not present -> no crash on `.text` of None.
    if context_tokens is None:
        return None, None, None
    return context_tokens.text, start_token, end_token

def install_ners(model: str):
    """
    Download and pip-install a scispaCy NER model at runtime.

    Hacky workaround for Hugging Face Spaces deployments: these models are
    distributed as tarball URLs, which cannot be built into the Docker image
    from requirements.txt, so they are installed on demand.

    Args:
        model: name of the scispaCy NER model to install (one of
            'en_ner_craft_md', 'en_ner_jnlpba_md', 'en_ner_bc5cdr_md',
            'en_ner_bionlp13cg_md').

    Raises:
        ValueError: if ``model`` is not one of the known scispaCy models.
    """
    known_models = {
        'en_ner_craft_md',
        'en_ner_jnlpba_md',
        'en_ner_bc5cdr_md',
        'en_ner_bionlp13cg_md',
    }
    if model not in known_models:
        raise ValueError('no such model')
    # All release tarballs live under the same versioned S3 prefix, so one
    # formatted URL replaces the four copy-pasted pip commands.
    base_url = 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4'
    # NOTE(review): os.system + runtime pip is fragile, but kept because the
    # HF Spaces deployment relies on it.
    os.system(f'pip install {base_url}/{model}-0.5.4.tar.gz')


def extract_entities_with_context(model, text: str, span_length: int = 10) -> list:
    """
    Extract named entities from ``text`` together with surrounding context.

    Args:
        model: name of the spaCy/scispaCy NER model to use for extraction.
        text: corpus to analyse (e.g. text extracted from a PDF).
        span_length: optional context window — number of tokens kept before
            and after each entity.

    Returns:
        list of dicts with keys 'entity' (entity text), 'start_context' /
        'end_context' (token indices delimiting the context) and 'context'
        (the context text itself).
    """
    # Load the NER model; if it is not installed yet, fetch it on the fly.
    try:
        nlp = spacy.load(model)
    except OSError:
        install_ners(model)
        nlp = spacy.load(model)
    doc = nlp(text)

    # Compute each entity's context directly from its own span. This avoids
    # rescanning doc.ents per entity (previously O(n^2)) and fixes the bug
    # where repeated entity strings all received the context of the LAST
    # occurrence instead of their own.
    entities_with_context = []
    for ent in doc.ents:
        start = max(ent.start - span_length, 0)     # clamp at doc start
        end = min(ent.end + span_length, len(doc))  # clamp at doc end
        entities_with_context.append({
            'entity': ent.text,
            'start_context': start,
            'end_context': end,
            # 'label': ent.label_,  # not needed per the challenge requirements
            'context': doc[start:end].text,
        })
    return entities_with_context