from transformers import pipeline
import gradio as gr
import re

# Load NER pipeline (aggregation_strategy="simple" merges word-piece tokens
# into whole-entity spans).
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


# --- Safe Sentence Splitter ---

# Abbreviations whose trailing periods must not be treated as sentence ends.
_PROTECTED_TERMS = [
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.",
    "Inc.", "Ltd.", "etc.",
]

# Map each abbreviation to a unique sentinel ("\x00<i>\x00" — a control
# character that cannot occur in normal text). The previous scheme stripped
# the periods ("St." -> "St") and restored with a bare substring replace,
# which corrupted ordinary words on restore ("Stone" -> "St.one",
# "Drive" -> "Dr.ive") and rewrote a standalone "US" as "U.S.".
# Unique sentinels make the restore step exact and collision-free.
_PLACEHOLDER_MAP = {
    term: f"\x00{i}\x00" for i, term in enumerate(_PROTECTED_TERMS)
}

# Split on ., ?, ! followed by whitespace and a letter or opening quote.
# Fixed-width lookbehind avoids the variable-length-lookbehind error that
# motivated the placeholder approach. Compiled once at module load since it
# runs on every request.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


def split_sentences(text):
    """
    Split ``text`` into sentences while protecting abbreviations like
    'U.S.' or 'Dr.' from being treated as sentence boundaries.

    Abbreviations are swapped for unique sentinel placeholders before the
    regex split and restored afterwards, so neither the split nor the
    restore step can alter ordinary words.

    Returns a list of non-empty, stripped sentence strings.
    """
    # Hide protected abbreviations so the boundary regex never fires on
    # their internal periods.
    for original, sentinel in _PLACEHOLDER_MAP.items():
        text = text.replace(original, sentinel)

    parts = _SENTENCE_BOUNDARY.split(text.strip())

    # Restore abbreviations; sentinels are unique, so this cannot touch
    # any other text.
    restored = []
    for sentence in parts:
        for original, sentinel in _PLACEHOLDER_MAP.items():
            sentence = sentence.replace(sentinel, original)
        sentence = sentence.strip()
        if sentence:
            restored.append(sentence)
    return restored


# --- API Function ---
def analyze_text(text):
    """
    Split ``text`` into sentences and run NER on each one.

    Returns a dict with a ``sentences`` list (each entry carrying its
    1-based number, the sentence text, and the pipeline's entity spans)
    and ``total_sentences``.
    """
    sentences = split_sentences(text)
    results = [
        {
            "sentence_number": i,
            "sentence": sentence,
            "entities": ner(sentence),
        }
        for i, sentence in enumerate(sentences, start=1)
    ]
    return {"sentences": results, "total_sentences": len(sentences)}


# --- Gradio Interface (API Style) ---
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
    outputs=gr.JSON(label="NER + Sentence Output"),
    title="Sentence Splitter + NER API",
    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER)."
)

if __name__ == "__main__":
    demo.launch()