File size: 10,605 Bytes
6584989
a31fd3a
6584989
a31fd3a
594de82
a31fd3a
594de82
 
 
7c078c8
 
b7873bd
6584989
dcfa759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c41b42
 
dcfa759
 
2c41b42
 
dcfa759
594de82
8c6f172
594de82
 
 
 
 
 
 
 
 
aa54010
594de82
 
 
aa54010
594de82
aa54010
594de82
aa54010
594de82
aa54010
 
 
 
 
 
594de82
 
 
 
 
aa54010
 
 
 
 
 
594de82
aa54010
594de82
 
 
 
 
 
ccd0619
7c078c8
aa54010
bafc7d8
a4967b9
 
 
 
aa54010
a4967b9
ccd0619
594de82
aa54010
 
a4967b9
594de82
 
 
 
aa54010
a4967b9
594de82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c078c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5b2433
7c078c8
 
b5b2433
2c41b42
088a901
2c41b42
dcfa759
7c078c8
594de82
 
 
 
 
bac6ef6
594de82
 
 
7c078c8
594de82
 
7c078c8
 
 
 
 
 
 
 
 
 
594de82
 
 
 
 
7c078c8
0d56a84
 
 
 
 
 
594de82
 
 
 
a31fd3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e36e22
9d5eff9
6584989
a31fd3a
 
594de82
 
4e8239b
 
17ecb7e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
import csv
import os
import sys

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from lemmatizer import Lemmatizer

# Raise the csv module's per-field size cap so very large fields can be parsed.
# 2**31 - 1 is used rather than sys.maxsize because the limit has to fit in a
# C long, which is only 32 bits wide on some platforms.
# csv.field_size_limit(new_limit) returns the *previous* limit, so it must be
# called exactly once: feeding its return value into a second call would
# immediately restore the old (small) limit.
csv.field_size_limit(2**31 - 1)


def load_readme():
    """Return the text of the README.md next to this module, without YAML frontmatter.

    Frontmatter is the leading section enclosed by ``---`` lines. If the file
    does not start with ``---`` or no closing ``---`` line is found, the text
    is returned unchanged.
    """
    module_dir = os.path.dirname(__file__)
    with open(os.path.join(module_dir, "README.md"), "r", encoding="utf-8") as handle:
        text = handle.read()

    # Only a file that opens with the marker can carry frontmatter.
    if not text.startswith("---"):
        return text

    rows = text.split("\n")
    # Index of the closing marker, searched from the second line onwards.
    closing = next(
        (pos for pos, row in enumerate(rows) if pos > 0 and row.strip() == "---"),
        None,
    )
    if closing is None:
        return text

    # Keep what follows the closing marker, dropping leading blank lines.
    return "\n".join(rows[closing + 1:]).lstrip("\n")

# Illustration rendered between the two README halves in the "About This Demo" accordion.
IMAGE_URL = "https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
# Marker text inside README.md at which the README is cut in two.
IMAGE_PLACEHOLDER = "IMAGE_PLACEHOLDER"

readme_content = load_readme()
# partition() instead of split() + two-target unpacking: when the README does
# not contain the marker, the app still starts (whole text before the image,
# empty text after it) instead of failing at import with a ValueError.
readme_before_image, _marker, readme_after_image = readme_content.partition(IMAGE_PLACEHOLDER)


# Module-level lemmatizer instance used by process_text().
# Per Gradio's reload-mode documentation, code under `gr.NO_RELOAD` is not
# re-executed when the source file is hot-reloaded, so the instance is built
# only once during development.
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(learned_et=False)

# Column headers of the token table. Passing them explicitly keeps the header
# row intact even when the input yields no tokens.
_TOKEN_TABLE_COLUMNS = ["Token", "Lemma", "German translations", "Morphological Analysis"]

# Idiom code -> display name, listed in plotting order. The order is reversed
# relative to the on-screen order because the chart plots from bottom to top.
_IDIOM_DISPLAY_NAMES = {
    "rm-vallader": "Vallader",
    "rm-puter": "Puter",
    "rm-surmiran": "Surmiran",
    "rm-sutsilv": "Sutsilvan",
    "rm-sursilv": "Sursilvan",
    "rm-rumgr": "Rumantsch Grischun",
}

# Upper bound on the translations listed per lemma in the table.
_MAX_TRANSLATIONS_PER_LEMMA = 10

_CHART_FONT_FAMILY = '"IBM Plex Mono", ui-monospace, Consolas, monospace'
_CHART_FONT_COLOR = 'rgb(39, 39, 42)'


def _chart_font(**extra):
    """Return a fresh font dict for the chart, optionally extended (e.g. size=12)."""
    return dict(family=_CHART_FONT_FAMILY, color=_CHART_FONT_COLOR, **extra)


def _collect_token_analyses(doc):
    """Group every token's analyses and German translations by lemma text."""
    token_analyses = []
    for token in doc.tokens:
        lemmas = {}
        for lemma, analyses in token.lemmas.items():
            # Entries are keyed by lemma text, so lemma objects that share a
            # text are merged into one entry.
            entry = lemmas.setdefault(lemma.text, {"analyses": [], "translations": []})

            for analysis in analyses:
                # Fall back to "-" when the analysis cannot be rendered as a string.
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                entry["analyses"].append(analysis_str)

            # Skip missing/empty translations and the literal string "null".
            translation = getattr(lemma, "translation_de", None)
            if translation and translation != "null":
                entry["translations"].append(translation)

        token_analyses.append({"token": token.text, "lemmas": lemmas})
    return token_analyses


def _format_translations(lemmas):
    """Render the 'German translations' cell: per lemma, shortest translations first."""
    return "<br>".join(
        f"<b>{lemma}</b>:\n" +
        "<br>".join(
            f"<span style='font-style: italic; color: #4A90D9; font-weight: bold;'>{tr}</span>"
            for tr in sorted(
                data["translations"],
                key=lambda x: (len(x), x.lower())
            )[:_MAX_TRANSLATIONS_PER_LEMMA]
        )
        for lemma, data in lemmas.items() if data["translations"]
    )


def _format_analyses(lemmas):
    """Render the 'Morphological Analysis' cell: per lemma, unique analyses sorted."""
    return "<br>".join(
        f"<b>{lemma}</b>: " + "<br>".join(sorted(set(data["analyses"])))
        for lemma, data in lemmas.items() if data["analyses"]
    )


def _build_token_table(token_analyses):
    """Turn the collected token analyses into the DataFrame shown in the UI."""
    # NOTE(review): token and lemma text are interpolated into HTML unescaped;
    # this presumably relies on the front end sanitising the cells - confirm.
    rows = [
        {
            "Token": info["token"],
            "Lemma": "<br>".join(f"<b>{lemma}</b>" for lemma in info["lemmas"]),
            "German translations": _format_translations(info["lemmas"]),
            "Morphological Analysis": _format_analyses(info["lemmas"]),
        }
        for info in token_analyses
    ]
    return pd.DataFrame(rows, columns=_TOKEN_TABLE_COLUMNS)


def _build_idiom_chart(idiom_scores, detected_idiom):
    """Build the horizontal bar chart of idiom scores, highlighting the detected idiom."""
    # Index the scores by idiom code once; setdefault keeps the first entry
    # should two keys ever share a code.
    scores_by_code = {}
    for idiom, score in idiom_scores.items():
        scores_by_code.setdefault(idiom.value, score)

    # Only idioms that actually have a score are plotted.
    codes = [code for code in _IDIOM_DISPLAY_NAMES if code in scores_by_code]
    display_names = [_IDIOM_DISPLAY_NAMES[code] for code in codes]
    percentages = [round(scores_by_code[code] * 100, 1) for code in codes]
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in codes]

    fig = go.Figure(data=[
        go.Bar(
            y=display_names,
            x=percentages,
            marker_color=colors,
            orientation='h',
            width=0.4,  # bar thickness (height in horizontal orientation)
        )
    ])

    # NOTE(review): the x-axis title speaks of a number of words while the
    # plotted values are percentages - confirm the intended wording.
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=_chart_font(size=12),
            tickformat='.1f',  # one decimal place on tick labels
            ticksuffix='%',
            tickfont=_chart_font(),
        ),
        yaxis=dict(
            ticksuffix=' ',  # gap between idiom labels and bars
            tickfont=_chart_font(),
        ),
        font=_chart_font(),
    )

    # Hover text shows the idiom name and its percentage only.
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig


def process_text(text):
    """Lemmatize *text* and prepare the two outputs shown in the UI.

    Args:
        text: Input text passed to the module-level ``lemmatizer``.

    Returns:
        A ``(figure, table)`` tuple: a plotly figure with one horizontal bar
        per scored idiom (the detected idiom highlighted), and a pandas
        DataFrame with one row per token and the columns "Token", "Lemma",
        "German translations" and "Morphological Analysis". The table keeps
        these columns even when there are no tokens.
    """
    doc = lemmatizer(text)

    idiom_scores = doc.idiom_scores
    detected_idiom = doc.idiom.value

    df_tokens = _build_token_table(_collect_token_analyses(doc))
    fig = _build_idiom_chart(idiom_scores, detected_idiom)
    return fig, df_tokens

# Declarative UI definition. Components appear on the page in the order they
# are created inside this context, so statement order here is significant.
with gr.Blocks(
    title="Lemmatizer",
    # Custom CSS for the results table (elem_id "full-width-table") and for
    # the "#input-box" element.
    # NOTE(review): ".svelte-drum8y" looks like a framework-generated class
    # name and may change between Gradio versions - verify after upgrades.
    # NOTE(review): no component in this block sets elem_id="input-box", so
    # the "#input-box" rules appear to match nothing - confirm.
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y, 
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }

    #full-width-table td, 
    #full-width-table th {
        white-space: nowrap !important;
    }

    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }

    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }

    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }

    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }

    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }

    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }

    """
) as demo:


    # Page heading.
    gr.Markdown(
        "# RUMLEM - Romansh Lemmatizer Demo"
    )

    # Collapsed "about" section: README text with the illustration inserted
    # at the position of the placeholder marker.
    with gr.Accordion("About This Demo", open=False):
        gr.Markdown(readme_before_image)
        gr.Image(IMAGE_URL, width=500, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
        gr.Markdown(readme_after_image)

    # === Top Row: Input & Chart ===
    # The chart column is given twice the relative width of the input column.
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5
            )
            submit_btn = gr.Button("Analyze")

        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # === Bottom Row: Full-width Table ===
    # Cells are declared as markdown because process_text() emits inline
    # markup (<b>, <br>, <span>) in them.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping text
        elem_id="full-width-table"
    )

    # === Function Hook ===
    # Clicking "Analyze" runs the analysis on the current textbox content.
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    # The same analysis is also wired to the app's load event, with the
    # textbox content (initially the default sentence) as input.
    demo.load(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table],
    )

    
    # Example sentences are read from a TSV file located next to this module.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    # Each TSV column header is used below as the idiom label of its cells.
    df = pd.read_csv(tsv_path, sep='\t')

    # Flatten the table column by column into (sentence, idiom label) pairs.
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # Skip empty sentences
                examples_data.append((sentence, col))

    # Parallel lists: the raw sentence fills the textbox, the label shown in
    # the examples list is prefixed with the idiom.
    examples = [sentence for sentence, _ in examples_data]
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

    # Clicking an example runs process_text immediately (run_on_click=True).
    # NOTE(review): cache_mode presumably has no effect while
    # cache_examples=False - confirm against the Gradio version in use.
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        cache_examples=False,
        cache_mode='eager',
        preload=0,
    )


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch(
    # ssr_mode=False  (currently disabled; uncomment to pass it to launch())
    )