Spaces:

piotrkan
/

scipdf_extract_app

Runtime error

App Files Files Community

scipdf_extract_app / src /model.py

piotrkan

add app

8447d06 over 1 year ago

raw

history blame contribute delete

3.47 kB

	'''module for NLP related utilities'''
	import os
	import spacy

	def extract_entity_context(doc, entity_text:str, span_length:int) -> tuple:
	    """
	    Function for extracting the context window around a specific entity
	    Args:
	        doc - spacy doc object containing entities
	        entity_text - text of the entity for which context is sought
	        span_length - the context window, specifying no. tokens before & after
	            the entity
	    Outs:
	        tuple of (context text, index of the first context token,
	        index one past the last context token)
	    Raises:
	        ValueError - if no entity in doc.ents has text equal to entity_text
	    """
	    # when the same text is recognised more than once, the last occurrence wins
	    match = None
	    for ent in doc.ents:
	        if ent.text == entity_text:
	            match = ent
	    if match is None:
	        # fail with a clear message instead of an AttributeError on None
	        raise ValueError(f'entity {entity_text!r} not found in doc')
	    # clamp the window so it never runs past either end of the document
	    start_token = max(match.start - span_length, 0)
	    end_token = min(match.end + span_length, len(doc))
	    return doc[start_token:end_token].text, start_token, end_token

	def install_ners(model:str):
	    """
	    Function for downloading and installing a scispacy ner model with pip.
	    It's a hacky attempt to get these models installed when having the app
	    deployed on huggingface spaces (can't install these models from
	    requirements.txt as the docker image can't be built from urls)
	    Args:
	        model - name of the scispacy ner model to install, one of
	            en_ner_craft_md, en_ner_jnlpba_md, en_ner_bc5cdr_md,
	            en_ner_bionlp13cg_md
	    Outs:
	        None, the model package is installed as a side effect
	    Raises:
	        ValueError - if model is not one of the supported names
	        subprocess.CalledProcessError - if pip exits with a non-zero status
	    """
	    import subprocess
	    import sys

	    supported_models = (
	        'en_ner_craft_md',
	        'en_ner_jnlpba_md',
	        'en_ner_bc5cdr_md',
	        'en_ner_bionlp13cg_md',
	    )
	    if model not in supported_models:
	        raise ValueError('no such model')

	    url = (
	        'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/'
	        f'v0.5.4/{model}-0.5.4.tar.gz'
	    )
	    # run pip through the current interpreter so the model lands in the same
	    # environment that will import it; check=True surfaces a failed install
	    subprocess.run([sys.executable, '-m', 'pip', 'install', url], check=True)


	def extract_entities_with_context(model, text:str, span_length:int=10) -> list:
	    """
	    Function for extracting entities and their surrounding context from text
	    Args:
	        model - ner model from spacy for entity extraction
	        text - corpus of the pdf
	        span_length - optional, the context window, specifying no. tokens
	            before and after the entity
	    Outs:
	        list of dictionaries, one per entity occurrence, with the keys
	        'entity', 'start_context', 'end_context' and 'context'
	    """
	    # load the ner model; if loading raises OSError, install it and retry once
	    try:
	        nlp = spacy.load(model)
	    except OSError:
	        install_ners(model)
	        nlp = spacy.load(model)
	    doc = nlp(text)

	    # build each window from the entity's own token offsets, so repeated
	    # mentions of the same text each get their own context
	    n_tokens = len(doc)
	    entities_with_context = []
	    for ent in doc.ents:
	        start = max(ent.start - span_length, 0) #index starting context
	        end = min(ent.end + span_length, n_tokens) #index ending context
	        entities_with_context.append({
	            'entity': ent.text, #entity
	            'start_context': start, #start token
	            'end_context': end, #end token
	            # ent.label_ is left out on purpose: not needed in the
	            # requirements to the challenge
	            'context': doc[start:end].text, # specified context
	        })
	    return entities_with_context