| import csv |
| import os |
| import sys |
|
|
| import gradio as gr |
| import pandas as pd |
| import plotly.graph_objects as go |
| from lemmatizer import Lemmatizer |
|
|
| |
# Allow very large CSV/TSV fields (lexicon entries can exceed the 128 KiB
# default). NOTE: csv.field_size_limit(new_limit) SETS the limit and returns
# the PREVIOUS value, so it must be called exactly once with the new value —
# nesting two calls would set the limit and then immediately reset it back.
csv.field_size_limit(2**31 - 1)
|
|
|
|
def load_readme():
    """Return the text of README.md (next to this file), minus YAML frontmatter.

    If the file starts with a ``---`` frontmatter fence, everything up to and
    including the closing ``---`` line is dropped, along with any blank lines
    that immediately follow it.
    """
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    if text.startswith("---"):
        rows = text.split("\n")
        # Find the first closing fence after the opening one.
        closing = next(
            (i for i, row in enumerate(rows[1:], start=1) if row.strip() == "---"),
            None,
        )
        if closing is not None:
            text = "\n".join(rows[closing + 1:]).lstrip("\n")

    return text
|
|
# Illustration hosted on the HF Space; spliced into the README at the
# position marked by IMAGE_PLACEHOLDER.
IMAGE_URL = "https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
IMAGE_PLACEHOLDER = "IMAGE_PLACEHOLDER"


readme_content = load_readme()
# Split once around the placeholder so the illustration can be rendered as a
# gr.Image between the two markdown halves. The 2-tuple unpack raises
# ValueError at startup if README.md is missing the placeholder.
readme_before_image, readme_after_image = readme_content.split(IMAGE_PLACEHOLDER, 1)


# gr.NO_RELOAD guards heavy initialization so the model is loaded only once
# (and not re-executed by Gradio's auto-reload watcher during development).
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(learned_et=False)
|
|
# Display names for the supported Romansh idioms, keyed by language code.
_IDIOM_MAP = {
    "rm-rumgr": "Rumantsch Grischun",
    "rm-sursilv": "Sursilvan",
    "rm-sutsilv": "Sutsilvan",
    "rm-surmiran": "Surmiran",
    "rm-puter": "Puter",
    "rm-vallader": "Vallader",
}

# Bottom-to-top display order of the horizontal bar chart.
_ORDERED_IDIOMS = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"]


def _collect_token_analyses(doc):
    """Group each token's lemma candidates with their analyses and translations.

    Returns a list of dicts, one per token:
        {"token": str,
         "lemmas": {lemma_text: {"analyses": [str, ...],
                                 "translations": [str, ...]}}}
    """
    token_analyses = []
    for token in doc.tokens:
        token_info = {
            "token": token.text,
            "lemmas": {}
        }
        for lemma, analyses in token.lemmas.items():
            # Distinct lemma objects can share the same surface text; merge
            # their analyses/translations under one entry.
            entry = token_info["lemmas"].setdefault(
                lemma.text, {"analyses": [], "translations": []}
            )
            for analysis in analyses:
                # str() on an analysis object can raise AttributeError when
                # the underlying analysis is incomplete; fall back to "-".
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                entry["analyses"].append(analysis_str)
            # "null" is the lexicon's marker for a missing German translation.
            if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
                entry["translations"].append(lemma.translation_de)
        token_analyses.append(token_info)
    return token_analyses


def _build_token_table(token_analyses):
    """Render per-token analyses as a DataFrame of HTML-formatted markdown cells."""
    return pd.DataFrame([
        {
            "Token": t["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
            "German translations": "<br>".join([
                f"<b>{lemma}</b>:\n" +
                "<br>".join([
                    f"<span style='font-style: italic; color: #4A90D9; font-weight: bold;'>{tr}</span>"
                    # Show at most 10 translations, shortest first, then
                    # case-insensitive alphabetical.
                    for tr in sorted(
                        lem_data["translations"],
                        key=lambda x: (len(x), x.lower())
                    )[:10]
                ])
                for lemma, lem_data in t["lemmas"].items() if lem_data["translations"]
            ]),
            "Morphological Analysis": "<br>".join([
                f"<b>{lemma}</b>: " +
                "<br>".join(sorted(set(lem_data["analyses"])))
                for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"]
            ])
        }
        for t in token_analyses
    ])


def _build_idiom_chart(idiom_scores, detected_idiom):
    """Build a horizontal bar chart of idiom scores, highlighting the detected one.

    ``idiom_scores`` is keyed by idiom objects whose ``.value`` is the language
    code; ``detected_idiom`` is the code of the idiom to highlight.
    """
    ordered_data = []
    for idiom_code in _ORDERED_IDIOMS:
        # Match score keys by their .value language code; idioms without a
        # score are simply omitted from the chart.
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": _IDIOM_MAP[idiom_code],
                "score": round(score * 100, 1)
            })

    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]

    # Detected idiom in blue, all others greyed out.
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,
            x=score_values,
            marker_color=colors,
            orientation='h',
            width=0.4
        )
    ])

    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',
            ticksuffix='%',
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            # Trailing space keeps labels from touching the axis line.
            ticksuffix=' ',
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
            color='rgb(39, 39, 42)'
        ),
    )

    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig


def process_text(text):
    """Lemmatize *text* and return (idiom bar chart, per-token analysis table).

    Returns:
        tuple: (plotly.graph_objects.Figure, pandas.DataFrame) suitable for
        the gr.Plot and gr.DataFrame outputs.
    """
    doc = lemmatizer(text)
    token_analyses = _collect_token_analyses(doc)
    df_tokens = _build_token_table(token_analyses)
    fig = _build_idiom_chart(doc.idiom_scores, doc.idiom.value)
    return fig, df_tokens
|
|
# Gradio UI. Custom CSS forces the analysis table to use the full width with
# per-column minimums, and pins the input textbox height so it visually
# matches the ~400px plot beside it.
with gr.Blocks(
    title="Lemmatizer",
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y,
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }

    #full-width-table td,
    #full-width-table th {
        white-space: nowrap !important;
    }

    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }

    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }

    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }

    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }

    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }

    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }

    """
) as demo:

    gr.Markdown(
        "# RUMLEM - Romansh Lemmatizer Demo"
    )

    # Collapsible README, with the hosted illustration spliced in at the
    # placeholder position.
    with gr.Accordion("About This Demo", open=False):
        gr.Markdown(readme_before_image)
        gr.Image(IMAGE_URL, width=500, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
        gr.Markdown(readme_after_image)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5,
                # Fix: the #input-box CSS rules above were dead — no component
                # carried this elem_id, so the height pinning never applied.
                elem_id="input-box"
            )
            submit_btn = gr.Button("Analyze")

        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # Cell contents are HTML strings, so render the table as markdown.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,
        elem_id="full-width-table"
    )

    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    # Analyze the default example sentence as soon as the page loads.
    demo.load(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table],
    )

    # Example sentences: one TSV column per idiom, one sentence per cell.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')

    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():
                examples_data.append((sentence, col))

    examples = [sentence for sentence, _ in examples_data]
    # Prefix each label with its idiom (the TSV column name).
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

    # NOTE(review): cache_examples=False alongside cache_mode='eager' and
    # preload=0 looks contradictory — with caching disabled the latter two
    # should have no effect; confirm against the installed Gradio version.
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        cache_examples=False,
        cache_mode='eager',
        preload=0,
    )
|
|
|
|
| if __name__ == "__main__": |
| demo.launch( |
| |
| ) |
|
|