# app.py — Sentence Splitter + NER demo (Hugging Face Space "NLP" by MNGames, commit 3d6f597)
from transformers import pipeline
import gradio as gr
import re

# Load the token-classification (NER) pipeline once at module import.
# aggregation_strategy="simple" merges sub-word tokens into whole-entity
# spans (e.g. "Washing" + "##ton" -> one "Washington" entity).
# NOTE(review): model weights are downloaded on first run — requires network.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
# --- Safe Sentence Splitter ---

# Abbreviations whose trailing periods must NOT be treated as sentence ends.
_PROTECTED_TERMS = [
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc.",
]
# Each abbreviation mapped to a placeholder form with '.' swapped for '<DOT>'.
# Built once at import time instead of on every call.
_PROTECTED_MAP = {term: term.replace(".", "<DOT>") for term in _PROTECTED_TERMS}
# Sentence boundary: '.', '!' or '?' followed by whitespace and a letter or
# opening quote. Fixed-width lookbehind avoids variable-length lookbehind
# errors; compiled once and hoisted out of the function.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


def split_sentences(text):
    """Split *text* into sentences, protecting common abbreviations.

    Periods inside abbreviations like 'U.S.' or 'Dr.' are temporarily
    replaced with a '<DOT>' placeholder so the boundary regex does not
    split on them, then restored afterwards.

    Args:
        text: Raw input text (str).

    Returns:
        List of non-empty, stripped sentence strings; [] for empty input.
    """
    # Shield abbreviations from the splitter.
    for original, safe in _PROTECTED_MAP.items():
        text = text.replace(original, safe)

    parts = _SENTENCE_BOUNDARY.split(text.strip())

    # Undo the placeholder substitution in each sentence.
    restored = []
    for part in parts:
        for original, safe in _PROTECTED_MAP.items():
            part = part.replace(safe, original)
        restored.append(part.strip())

    return [s for s in restored if s]
# --- API Function ---
def analyze_text(text):
    """Split *text* into sentences and run NER on each one.

    Args:
        text: Raw input text (str).

    Returns:
        Dict with "sentences" (list of {sentence_number, sentence, entities}
        dicts, 1-indexed) and "total_sentences" (int).
    """
    sentences = split_sentences(text)
    results = [
        {
            "sentence_number": idx,
            "sentence": sent,
            "entities": ner(sent),
        }
        for idx, sent in enumerate(sentences, start=1)
    ]
    return {"sentences": results, "total_sentences": len(sentences)}
# --- Gradio Interface (API Style) ---
# JSON output so the Space can be consumed as an API as well as a UI.
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
    outputs=gr.JSON(label="NER + Sentence Output"),
    title="Sentence Splitter + NER API",
    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER).",
)

# Launch only when executed as a script (the guard body was unindented in
# the original paste, which is a SyntaxError — restored here).
if __name__ == "__main__":
    demo.launch()