Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline | |
| # --------------------------------------------------- | |
| # Models | |
| # --------------------------------------------------- | |
# Model identifiers offered in the dropdown. Each one is used both as the
# `model=` argument of the pipeline and as the path of its huggingface.co
# page; the first entry is the dropdown's initial value.
MODEL_NAMES = [
    "c-ho/2026-04-24-crf-classweights-clean",
    "c-ho/2026-04-23-crf-classweights-clean",
]

# Sample sentence that pre-fills the input textbox.
EXAMPLE_TEXT = (
    "As a result, Indo-European developed a minimal vowel system "
    "combined with a very large consonant inventory including "
    "glottalized stops, also grammatical gender and adjectival agreement."
)
| # --------------------------------------------------- | |
| # Lazy model cache | |
| # --------------------------------------------------- | |
# Pipelines that have already been built, keyed by model name.
model_cache = {}


def get_model(model_name):
    """Return the NER pipeline for ``model_name``, building it on first use.

    Args:
        model_name: Model identifier handed to ``pipeline(model=...)``; also
            the key under which the built pipeline is stored.

    Returns:
        The pipeline object stored in ``model_cache`` for ``model_name``.
    """
    # Fast path: this model was requested before.
    if model_name in model_cache:
        return model_cache[model_name]

    loaded = pipeline(
        "ner",
        model=model_name,
        aggregation_strategy="simple",
    )
    model_cache[model_name] = loaded
    return loaded
| # --------------------------------------------------- | |
| # Model info | |
| # --------------------------------------------------- | |
# Per-model display data: a link to the model page and a copy-paste usage
# snippet with the model name filled in.
# NOTE(review): the snippet's continuation lines carry no indentation in this
# copy of the file; confirm that matches the text intended for display.
model_info = {
    name: {
        "link": f"https://huggingface.co/{name}",
        "usage": f'''from transformers import pipeline
ner = pipeline(
"ner",
model="{name}",
aggregation_strategy="simple"
)
result = ner("Hello world")
print(result)
''',
    }
    for name in MODEL_NAMES
}
| # --------------------------------------------------- | |
| # UI helper | |
| # --------------------------------------------------- | |
def display_model_info(model_name):
    """Look up the usage snippet and model-page link for ``model_name``.

    Args:
        model_name: A key of ``model_info``.

    Returns:
        A ``(usage, link_markdown)`` tuple: the stored usage snippet and a
        Markdown link labelled "Open model page" pointing at the stored URL.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_info``.
    """
    entry = model_info[model_name]
    usage_snippet = entry["usage"]
    link_markdown = f"[Open model page]({entry['link']})"
    return usage_snippet, link_markdown
| # --------------------------------------------------- | |
| # Merge subwords into full spans | |
| # --------------------------------------------------- | |
def merge_subwords(results):
    """Fold ``##``-prefixed continuation pieces into the span before them.

    Args:
        results: Iterable of token/entity dicts. Keys read are ``word``,
            ``entity_group`` (falling back to ``entity``, then ``"UNK"``),
            ``score``, ``start`` and ``end``; missing keys default to
            ``""``, ``0.0``, ``0`` and ``0``.

    Returns:
        A new list of dicts with keys ``word``, ``start``, ``end``,
        ``entity_group`` and ``score``. The input dicts are not modified.
    """
    spans = []
    for piece in results:
        text = piece.get("word", "")

        # A "##" piece extends the most recent span. When there is no span
        # yet, it falls through and is stored as-is, prefix included.
        if text.startswith("##") and spans:
            open_span = spans[-1]
            open_span["word"] += text[2:]
            open_span["end"] = piece.get("end", 0)
            # The span keeps the label and start of its first piece and the
            # highest score seen among its pieces.
            open_span["score"] = max(
                open_span["score"], piece.get("score", 0.0)
            )
            continue

        if "entity_group" in piece:
            tag = piece["entity_group"]
        else:
            tag = piece.get("entity", "UNK")

        spans.append(
            {
                "word": text,
                "start": piece.get("start", 0),
                "end": piece.get("end", 0),
                "entity_group": tag,
                "score": piece.get("score", 0.0),
            }
        )
    return spans
| # --------------------------------------------------- | |
| # Main inference function | |
| # --------------------------------------------------- | |
def analyze_text(text, model_name):
    """Tag ``text`` with the selected model and shape the result for the UI.

    Args:
        text: Input string to annotate. An empty string or ``None`` yields
            empty outputs without loading or running a model.
        model_name: Name passed to ``get_model`` to obtain the pipeline.

    Returns:
        A ``(highlighted_text, table_rows)`` tuple.

        ``highlighted_text`` is a list of ``(substring, label)`` pairs in
        entity order; stretches of ``text`` between entities (and after the
        last one) are included with ``None`` as their label.

        ``table_rows`` holds one ``[word, label, score]`` row per entity,
        with the score converted to a builtin ``float`` and rounded to three
        decimals.
    """
    # Nothing to annotate: skip model loading and inference entirely.
    if not text:
        return [], []

    ner = get_model(model_name)
    # Merge continuation pieces first so each span is reported once.
    entities = merge_subwords(ner(text))

    highlighted_text = []
    table_rows = []
    last_idx = 0

    for ent in entities:
        start = ent["start"]
        end = ent["end"]
        label = ent["entity_group"]

        # Unlabelled text between the previous entity and this one.
        if start > last_idx:
            highlighted_text.append((text[last_idx:start], None))

        # The entity itself, sliced from the original text so casing and
        # spacing match the input rather than the tokenizer's rendering.
        highlighted_text.append((text[start:end], label))
        last_idx = end

        # Scores may arrive as NumPy scalars; convert before rounding so the
        # table shows a plain three-decimal Python float.
        table_rows.append([ent["word"], label, round(float(ent["score"]), 3)])

    # Unlabelled tail after the last entity.
    if last_idx < len(text):
        highlighted_text.append((text[last_idx:], None))

    return highlighted_text, table_rows
| # --------------------------------------------------- | |
| # Entity colors | |
| # --------------------------------------------------- | |
# Highlight colour per entity label, passed to the HighlightedText component
# as its `color_map`. Entries are grouped by linguistic area.
COLOR_MAP = {
    # Academic / theoretical
    "AcademicDiscipline": "#5339a8",  # intense purple
    "AmbiguouslyDefinedConcept": "#ab8fbd",  # muted purple
    "UnclassifiedLinguisticConcept": "#d4a1c7",  # soft gray-pink

    # Language / general linguistic
    "LanguageRelatedTerm": "#E9C46A",  # warm sand yellow
    "OtherLinguisticTerm": "#A8DADC",  # pale cyan
    "LanguageResourceInformation": "#457B9D",  # medium blue

    # Phonology / graphemics
    "PhonologicalPhenomenon": "#E76F51",  # coral red
    "GraphemicPhenomenon": "#F4A261",  # orange

    # Morphology / syntax
    "MorphologicalPhenomenon": "#37bdac",  # turquoise green
    "MorphosyntacticPhenomenon": "#43916d",  # medium green
    "SyntacticPhenomenon": "#53703a",  # darker moss

    # Lexicon / semantics / discourse
    "LexicalPhenomenon": "#577590",  # slate blue
    "SemanticPhenomenon": "#4361EE",  # vivid blue
    "DiscoursePhenomenon": "#B5179E",  # magenta-purple

    # Special / miscellaneous tags
    "NEW_TAG": "#FF006E",  # neon pink
    "TOPNODE_DUMMY": "#BDBDBD",  # neutral gray

    # Outside tag
    "O": "#FFFFFF",
}
| # --------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------- | |
# Page definition: intro text, the input controls, the model-info panel, the
# two result components, and the event wiring between them.
# NOTE(review): the nesting below is reconstructed. In particular, confirm
# whether the HighlightedText and Dataframe components belong below the row
# (as written here) or inside the second column.
with gr.Blocks(title="Linguistic Annotation Demo") as demo:
    # Intro text shown at the top of the page.
    gr.Markdown(
        """
# Linguistic Annotation Demo
This Space demonstrates custom linguistic sequence tagging models
for detecting linguistic terminology and phenomena with concepts from an ontology based on the Bibliography of Linguistic Literature (BLL).
"""
    )
    with gr.Row():
        # First column: model choice, text to annotate, and the run button.
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                choices=MODEL_NAMES,
                value=MODEL_NAMES[0],
                label="Select Model"
            )
            text_input = gr.Textbox(
                label="Input Text",
                lines=8,
                value=EXAMPLE_TEXT
            )
            run_button = gr.Button("Run Annotation")
        # Second column: usage snippet and model-page link for the
        # currently selected model.
        with gr.Column(scale=1):
            code_output = gr.Code(
                label="Transformers Usage"
            )
            link_output = gr.Markdown()
    # Annotated rendering of the input, coloured per label via COLOR_MAP.
    highlighted_output = gr.HighlightedText(
        label="Annotated Text",
        combine_adjacent=True,
        color_map=COLOR_MAP,
        show_legend=True,
        elem_id="ner-highlight"
    )
    # Read-only table with one row per detected entity.
    entity_table = gr.Dataframe(
        headers=["Text", "Label", "Confidence"],
        datatype=["str", "str", "number"],
        interactive=False,
        label="Detected Entities"
    )
    # -------------------------
    # Events
    # -------------------------
    # Clicking the button runs the tagger and fills both result components.
    run_button.click(
        analyze_text,
        inputs=[text_input, model_selector],
        outputs=[highlighted_output, entity_table]
    )
    # Changing the model refreshes the usage snippet and the link.
    model_selector.change(
        display_model_info,
        inputs=model_selector,
        outputs=[code_output, link_output]
    )
    # The same handler runs on page load so the panel is filled for the
    # initially selected model.
    demo.load(
        display_model_info,
        inputs=model_selector,
        outputs=[code_output, link_output]
    )
| # --------------------------------------------------- | |
| # Launch | |
| # --------------------------------------------------- | |
# Start serving the app on 0.0.0.0 (all network interfaces), port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)