|
|
from transformers import pipeline |
|
|
import spacy |
|
|
|
|
|
class ClinicalNERProcessor:
    """Named Entity Recognition and POS tagging for clinical text.

    Wraps up to three components:

    * a clinical NER transformer pipeline (always loaded),
    * an optional anatomy NER transformer pipeline (``use_anatomy``),
    * an optional spaCy model for POS tagging (``use_pos``).

    Each analysis is available both as plain Python structures and as
    Prolog facts (one fact per entity/token, newline-separated).
    """

    def __init__(self, use_pos=True, use_anatomy=True):
        """Load the underlying models.

        Args:
            use_pos: If True, attempt to load the spaCy ``en_core_web_sm``
                model for POS tagging. On failure, ``self.nlp`` stays None
                and a warning is printed (POS methods then raise at call time).
            use_anatomy: If True, attempt to load the anatomy NER model.
                On failure, ``self.anatomy_pipeline`` stays None and a
                warning is printed (anatomy methods then raise at call time).
        """
        # Clinical NER is mandatory; let load errors propagate to the caller.
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        # Anatomy NER is best-effort: keep the processor usable without it.
        self.anatomy_pipeline = None
        if use_anatomy:
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                print(f"Warning: Could not load anatomy model: {e}")

        # POS tagging is best-effort as well.
        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    @staticmethod
    def _escape_atom(text):
        """Escape a string for embedding inside a single-quoted Prolog atom.

        Backslashes must be escaped BEFORE quotes; otherwise the backslash
        introduced for a quote would itself get doubled.
        """
        return text.replace("\\", "\\\\").replace("'", "\\'")

    def _merge_subwords(self, entities):
        """Merge WordPiece ``##`` continuation tokens into their head entity.

        Consecutive entities whose word starts with ``##`` and whose
        ``entity_group`` matches the head entity are folded into it: the
        ``##`` prefix is stripped, the text concatenated, and ``end``
        extended. ``start`` and ``score`` of the head entity are kept.

        Args:
            entities: List of dicts with keys ``word``, ``entity_group``,
                ``start``, ``end``, ``score`` (pipeline output format).

        Returns:
            New list of merged entity dicts; input dicts are not mutated.
        """
        if not entities:
            return []

        merged = []
        i = 0
        while i < len(entities):
            head = entities[i].copy()
            word = head['word']
            end = head['end']

            # Absorb every immediately-following subword of the same group.
            j = i + 1
            while j < len(entities):
                candidate = entities[j]
                if (candidate['word'].startswith('##') and
                        candidate['entity_group'] == head['entity_group']):
                    word += candidate['word'][2:]
                    end = candidate['end']
                    j += 1
                else:
                    break

            head['word'] = word
            head['end'] = end
            merged.append(head)
            i = j

        return merged

    def _entity_prolog_facts(self, entities, functor):
        """Render merged entities as Prolog facts with the given functor.

        Produces one fact per entity:
        ``functor(Index, 'Group', 'Word', Start, End, Score).``
        """
        facts = []
        for i, entity in enumerate(entities):
            word = self._escape_atom(entity['word'])
            facts.append(
                f"{functor}({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
        return "\n".join(facts)

    def basic_ner(self, text):
        """Clinical NER only: return merged entity dicts for *text*."""
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """Clinical NER as Prolog ``entity/6`` facts (newline-separated)."""
        merged = self._merge_subwords(self.ner_pipeline(text))
        return self._entity_prolog_facts(merged, "entity")

    def anatomy_ner(self, text):
        """Anatomy NER only: return merged entity dicts for *text*.

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")

        return self._merge_subwords(self.anatomy_pipeline(text))

    def prolog_anatomy(self, text):
        """Anatomy NER as Prolog ``anatomy/6`` facts (newline-separated).

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")

        merged = self._merge_subwords(self.anatomy_pipeline(text))
        return self._entity_prolog_facts(merged, "anatomy")

    def pos_tagging(self, text):
        """POS tagging only: return per-token dicts for *text*.

        Each dict carries ``token``, ``lemma``, ``pos``, ``tag``, ``dep``,
        and character offsets ``start``/``end``.

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        doc = self.nlp(text)
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_,
                'start': token.idx,
                'end': token.idx + len(token.text)
            }
            for token in doc
        ]

    def prolog_pos(self, text):
        """POS tagging as Prolog ``pos/8`` facts (newline-separated).

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        facts = []
        for i, info in enumerate(self.pos_tagging(text)):
            token = self._escape_atom(info['token'])
            lemma = self._escape_atom(info['lemma'])
            facts.append(
                f"pos({i}, '{token}', '{lemma}', '{info['pos']}', "
                f"'{info['tag']}', '{info['dep']}', "
                f"{info['start']}, {info['end']})."
            )
        return "\n".join(facts)

    def combined_analysis(self, text):
        """Combined analysis: Clinical NER + Anatomy NER + POS tagging.

        Components that failed to load contribute an empty list.
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': []
        }

        if self.anatomy_pipeline:
            result['anatomy_entities'] = self.anatomy_ner(text)

        if self.nlp:
            result['pos_tags'] = self.pos_tagging(text)

        return result

    def prolog_combined(self, text):
        """Combined Prolog output: Clinical NER + Anatomy NER + POS tagging.

        Sections are headed by a ``%`` comment line and separated by a
        blank line; empty sections and unloaded components are skipped.
        """
        sections = []

        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        if self.anatomy_pipeline:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        if self.nlp:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)