Spaces:

c-ho
/

test_ner_2

Sleeping

App Files Community

c-ho committed 27 days ago

Commit

71f491e

verified ·

1 Parent(s): 216163d

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -180

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import html
 import gradio as gr
 from transformers import pipeline
@@ -40,13 +39,11 @@ model_info = {
     m: {
         "link": f"https://huggingface.co/{m}",
         "usage": f'''from transformers import pipeline
 ner = pipeline(
     "ner",
     model="{m}",
     aggregation_strategy="simple"
 )
 result = ner("Hello world")
 print(result)
 '''
@@ -116,26 +113,38 @@ def merge_subwords(results):
 # ---------------------------------------------------
 def analyze_text(text, model_name):
     ner = get_model(model_name)
     results = ner(text)
     results = merge_subwords(results)
-    entities = []
     table_rows = []
     for ent in results:
         label = ent["entity_group"]
-        entities.append({
-            "start": ent["start"],
-            "end": ent["end"],
-            "label": label,
-        })
         table_rows.append([
             ent["word"],
@@ -143,13 +152,13 @@ def analyze_text(text, model_name):
             round(ent["score"], 3)
         ])
-    rendered_html = render_highlighted_html(
-        text,
-        entities,
-        COLOR_MAP
-    )
-    return rendered_html, table_rows
 # ---------------------------------------------------
 # Entity colors
@@ -159,36 +168,36 @@ COLOR_MAP = {
     # -----------------------------------
     # Academic / theoretical
     # -----------------------------------
-    "AcademicDiscipline": "#5339a8",              # intense purple
-    "AmbiguouslyDefinedConcept": "#ab8fbd",       # muted purple
-    "UnclassifiedLinguisticConcept": "#d4a1c7",   # soft gray-pink
     # -----------------------------------
     # Language / general linguistic
     # -----------------------------------
     "LanguageRelatedTerm": "#E9C46A",             # warm sand yellow
-    "OtherLinguisticTerm": "#b2d1d1",             # pale cyan
-    "LanguageResourceInformation": "#5397c2",     # medium blue
     # -----------------------------------
     # Phonology / graphemics
     # -----------------------------------
-    "PhonologicalPhenomenon": "#eb8167",          # coral red
-    "GraphemicPhenomenon": "#bd9779",             # latte
     # -----------------------------------
     # Morphology / syntax
     # -----------------------------------
-    "MorphologicalPhenomenon": "#37bdac",         # turquoise green
-    "MorphosyntacticPhenomenon": "#43916d",       # medium green
-    "SyntacticPhenomenon": "#53703a",             # darker moss
     # -----------------------------------
     # Lexicon / semantics / discourse
     # -----------------------------------
     "LexicalPhenomenon": "#577590",               # slate blue
     "SemanticPhenomenon": "#4361EE",              # vivid blue
-    "DiscoursePhenomenon": "#3a488c",             # deep blue
     # -----------------------------------
     # Special / misc
@@ -200,154 +209,6 @@ COLOR_MAP = {
     "O": "#FFFFFF"
 }
-def render_highlighted_html(text, entities, color_map):
-    """
-    Creates:
-    - clickable category legend
-    - inline highlighted entities
-    - stable spacing/layout during filtering
-    """
-    escaped_text = html.escape(text)
-    # Sort entities by start position
-    entities = sorted(entities, key=lambda x: x["start"])
-    html_parts = []
-    last_idx = 0
-    for ent in entities:
-        start = ent["start"]
-        end = ent["end"]
-        label = ent["label"]
-        color = color_map.get(label, "#cccccc")
-        # normal text
-        if start > last_idx:
-            html_parts.append(
-                html.escape(text[last_idx:start])
-            )
-        entity_text = html.escape(text[start:end])
-        html_parts.append(f'''
-<span
-    class="entity entity-{label}"
-    data-label="{label}"
-    style="
-        background:{color};
-        padding:2px 4px;
-        margin:1px;
-        border-radius:4px;
-        display:inline-block;
-        white-space:pre-wrap;
-    "
->
-    {entity_text}
-    <span style="
-        font-size:0.7em;
-        opacity:0.75;
-        margin-left:4px;
-    ">
-        {label}
-    </span>
-</span>
-''')
-        last_idx = end
-    # remaining text
-    if last_idx < len(text):
-        html_parts.append(
-            html.escape(text[last_idx:])
-        )
-    categories = sorted(set(ent["label"] for ent in entities))
-    legend_html = ""
-    for cat in categories:
-        color = color_map.get(cat, "#cccccc")
-        legend_html += f'''
-<button
-    class="legend-btn"
-    data-label="{cat}"
-    onclick="toggleCategory('{cat}')"
-    style="
-        background:{color};
-        border:none;
-        padding:6px 10px;
-        margin:4px;
-        border-radius:6px;
-        cursor:pointer;
-        font-weight:600;
-    "
->
-    {cat}
-</button>
-'''
-    final_html = f'''
-<div>
-<div style="margin-bottom:12px;">
-{legend_html}
-</div>
-<div
-    id="annotated-text"
-    style="
-        line-height:2.1;
-        white-space:pre-wrap;
-        font-size:1rem;
-    "
->
-    {''.join(html_parts)}
-</div>
-</div>
-<script>
-let activeCategory = null;
-function toggleCategory(category) {{
-    const entities = document.querySelectorAll('.entity');
-    // second click = restore all
-    if (activeCategory === category) {{
-        activeCategory = null;
-        entities.forEach(el => {{
-            el.style.opacity = '1';
-            el.style.visibility = 'visible';
-        }});
-        return;
-    }}
-    activeCategory = category;
-    entities.forEach(el => {{
-        if (el.dataset.label === category) {{
-            el.style.opacity = '1';
-            el.style.visibility = 'visible';
-        }} else {{
-            // IMPORTANT:
-            // preserve spacing/layout
-            el.style.opacity = '0.15';
-            el.style.visibility = 'visible';
-        }}
-    }});
-}}
-</script>
-'''
-    return final_html
 # ---------------------------------------------------
 # UI
 # ---------------------------------------------------
@@ -357,7 +218,6 @@ with gr.Blocks(title="Linguistic Annotation Demo") as demo:
     gr.Markdown(
         """
 # Linguistic Annotation Demo
 This Space demonstrates custom linguistic sequence tagging models
 for detecting linguistic terminology and phenomena.
 """
@@ -389,16 +249,12 @@ for detecting linguistic terminology and phenomena.
             link_output = gr.Markdown()
-    '''
     highlighted_output = gr.HighlightedText(
         label="Annotated Text",
         combine_adjacent=True,
         color_map=COLOR_MAP,
         show_legend=True
     )
-    '''
-    highlighted_output = gr.HTML(label="Annotated Text")
     entity_table = gr.Dataframe(
         headers=["Text", "Label", "Confidence"],

 import gradio as gr
 from transformers import pipeline
     m: {
         "link": f"https://huggingface.co/{m}",
         "usage": f'''from transformers import pipeline
 ner = pipeline(
     "ner",
     model="{m}",
     aggregation_strategy="simple"
 )
 result = ner("Hello world")
 print(result)
 '''
 # ---------------------------------------------------
 def analyze_text(text, model_name):
     ner = get_model(model_name)
     results = ner(text)
+    # merge subwords first
     results = merge_subwords(results)
+    highlighted_text = []
+    last_idx = 0
     table_rows = []
     for ent in results:
+        start = ent["start"]
+        end = ent["end"]
         label = ent["entity_group"]
+        # Add normal text before entity
+        if start > last_idx:
+            highlighted_text.append(
+                (text[last_idx:start], None)
+            )
+        # Add highlighted entity
+        highlighted_text.append(
+            (text[start:end], label)
+        )
+        last_idx = end
         table_rows.append([
             ent["word"],
             round(ent["score"], 3)
         ])
+    # Add remaining text
+    if last_idx < len(text):
+        highlighted_text.append(
+            (text[last_idx:], None)
+        )
+    return highlighted_text, table_rows
 # ---------------------------------------------------
 # Entity colors
     # -----------------------------------
     # Academic / theoretical
     # -----------------------------------
+    "AcademicDiscipline": "#264653",              # deep teal
+    "AmbiguouslyDefinedConcept": "#6D597A",       # muted purple
+    "UnclassifiedLinguisticConcept": "#9A8C98",   # soft gray-purple
     # -----------------------------------
     # Language / general linguistic
     # -----------------------------------
     "LanguageRelatedTerm": "#E9C46A",             # warm sand yellow
+    "OtherLinguisticTerm": "#A8DADC",             # pale cyan
+    "LanguageResourceInformation": "#457B9D",     # medium blue
     # -----------------------------------
     # Phonology / graphemics
     # -----------------------------------
+    "PhonologicalPhenomenon": "#E76F51",          # coral red
+    "GraphemicPhenomenon": "#F4A261",             # orange
     # -----------------------------------
     # Morphology / syntax
     # -----------------------------------
+    "MorphologicalPhenomenon": "#2A9D8F",         # turquoise green
+    "MorphosyntacticPhenomenon": "#52B788",       # medium green
+    "SyntacticPhenomenon": "#40916C",             # darker green
     # -----------------------------------
     # Lexicon / semantics / discourse
     # -----------------------------------
     "LexicalPhenomenon": "#577590",               # slate blue
     "SemanticPhenomenon": "#4361EE",              # vivid blue
+    "DiscoursePhenomenon": "#B5179E",             # magenta-purple
     # -----------------------------------
     # Special / misc
     "O": "#FFFFFF"
 }
 # ---------------------------------------------------
 # UI
 # ---------------------------------------------------
     gr.Markdown(
         """
 # Linguistic Annotation Demo
 This Space demonstrates custom linguistic sequence tagging models
 for detecting linguistic terminology and phenomena.
 """
             link_output = gr.Markdown()
     highlighted_output = gr.HighlightedText(
         label="Annotated Text",
         combine_adjacent=True,
         color_map=COLOR_MAP,
         show_legend=True
     )
     entity_table = gr.Dataframe(
         headers=["Text", "Label", "Confidence"],