import csv
import os
import sys
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from lemmatizer import Lemmatizer
# Raise the csv module's per-field size cap so very long fields can be parsed.
# csv.field_size_limit(new) installs `new` and returns the PREVIOUS limit, so
# it must be called exactly once: feeding its return value into a second call
# would immediately restore the old limit.
# 2**31 - 1 is used instead of sys.maxsize because the latter can raise
# OverflowError on platforms where a C long is 32 bits wide.
csv.field_size_limit(2**31 - 1)
def load_readme():
    """Return the text of the README.md beside this file, without YAML frontmatter.

    When the text begins with ``---``, everything up to and including the next
    line that consists solely of ``---`` (ignoring surrounding whitespace) is
    dropped, together with any newlines that directly follow it. If no such
    closing line exists, the text is returned unchanged.
    """
    here = os.path.dirname(__file__)
    with open(os.path.join(here, "README.md"), "r", encoding="utf-8") as handle:
        text = handle.read()

    # Nothing to strip unless the file opens with a frontmatter marker.
    if not text.startswith("---"):
        return text

    rows = text.split("\n")
    # Row 0 is the opening marker, so the search starts at row 1.
    closing = next(
        (pos for pos, row in enumerate(rows) if pos > 0 and row.strip() == "---"),
        None,
    )
    if closing is None:
        return text
    return "\n".join(rows[closing + 1:]).lstrip("\n")
# Illustration displayed between the two halves of the README text.
IMAGE_URL = "https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
# Marker inside README.md at which the text is cut in two.
IMAGE_PLACEHOLDER = "IMAGE_PLACEHOLDER"

readme_content = load_readme()
# partition() yields the same two halves as split(IMAGE_PLACEHOLDER, 1) when
# the marker is present, but does not raise on unpacking when it is missing:
# the whole README then lands in the first half and the second half is "".
readme_before_image, _, readme_after_image = readme_content.partition(IMAGE_PLACEHOLDER)

# gr.NO_RELOAD guards code that Gradio's reload mode should not re-execute;
# the lemmatizer is constructed under that guard.
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(learned_et=False)
# Display name for every idiom code shown in the chart.
_IDIOM_DISPLAY_NAMES = {
    "rm-rumgr": "Rumantsch Grischun",
    "rm-sursilv": "Sursilvan",
    "rm-sutsilv": "Sutsilvan",
    "rm-surmiran": "Surmiran",
    "rm-puter": "Puter",
    "rm-vallader": "Vallader",
}
# Plot order (reversed for display since the chart plots from bottom to top).
_CHART_IDIOM_ORDER = (
    "rm-vallader",
    "rm-puter",
    "rm-surmiran",
    "rm-sutsilv",
    "rm-sursilv",
    "rm-rumgr",
)
# Column order of the token table; passed explicitly so that an input with no
# tokens still yields a table with headers.
_TOKEN_TABLE_COLUMNS = ["Token", "Lemma", "German translations", "Morphological Analysis"]
# Upper bound on translations listed per lemma.
_MAX_TRANSLATIONS_PER_LEMMA = 10
# Font settings shared by all chart text.
_CHART_FONT_FAMILY = '"IBM Plex Mono", ui-monospace, Consolas, monospace'
_CHART_FONT_COLOR = 'rgb(39, 39, 42)'


def _collect_token_analyses(doc):
    """Group each token's analyses and German translations by lemma text."""
    token_analyses = []
    for token in doc.tokens:
        lemmas = {}
        for lemma, analyses in token.lemmas.items():
            # Lemma objects sharing the same text are merged into one entry.
            entry = lemmas.setdefault(lemma.text, {"analyses": [], "translations": []})
            for analysis in analyses:
                # Best effort: fall back to "-" if stringifying the analysis
                # raises AttributeError.
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                entry["analyses"].append(analysis_str)
            # Skip lemmas without a translation or with the literal "null".
            translation = getattr(lemma, "translation_de", None)
            if translation and translation != "null":
                entry["translations"].append(translation)
        token_analyses.append({"token": token.text, "lemmas": lemmas})
    return token_analyses


def _build_token_table(token_analyses):
    """Render the collected token analyses as a DataFrame of HTML/markdown cells."""
    import html

    def esc(value):
        # Token, lemma, translation and analysis text is data, not markup:
        # escape &, < and > so it cannot be interpreted as HTML in the cell.
        return html.escape(str(value), quote=False)

    rows = []
    for item in token_analyses:
        lemmas = item["lemmas"]
        translation_parts = []
        analysis_parts = []
        for lemma, lem_data in lemmas.items():
            if lem_data["translations"]:
                # Shortest translations first, ties broken alphabetically.
                shown = sorted(
                    lem_data["translations"],
                    key=lambda x: (len(x), x.lower()),
                )[:_MAX_TRANSLATIONS_PER_LEMMA]
                translation_parts.append(
                    f"<b>{esc(lemma)}</b>:\n" +
                    "<br>".join(
                        f"<span style='font-style: italic; color: #4A90D9; font-weight: bold;'>{esc(tr)}</span>"
                        for tr in shown
                    )
                )
            if lem_data["analyses"]:
                # Duplicated analyses are shown once, in sorted order.
                analysis_parts.append(
                    f"<b>{esc(lemma)}</b>: " +
                    "<br>".join(esc(a) for a in sorted(set(lem_data["analyses"])))
                )
        rows.append({
            "Token": esc(item["token"]),
            "Lemma": "<br>".join(f"<b>{esc(lemma)}</b>" for lemma in lemmas),
            "German translations": "<br>".join(translation_parts),
            "Morphological Analysis": "<br>".join(analysis_parts),
        })
    return pd.DataFrame(rows, columns=_TOKEN_TABLE_COLUMNS)


def _build_idiom_chart(idiom_scores, detected_idiom):
    """Build the horizontal bar chart of idiom scores, highlighting the detected idiom."""
    # Index the scores by idiom code once instead of scanning the keys per idiom.
    scores_by_code = {idiom.value: score for idiom, score in idiom_scores.items()}
    # Idioms without a score are left out of the chart.
    idiom_codes = [code for code in _CHART_IDIOM_ORDER if code in scores_by_code]
    idiom_display_names = [_IDIOM_DISPLAY_NAMES[code] for code in idiom_codes]
    # Scores are plotted as percentages with one decimal place.
    score_values = [round(scores_by_code[code] * 100, 1) for code in idiom_codes]
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,
            x=score_values,
            marker_color=colors,
            orientation='h',
            width=0.4,  # bar thickness (height in horizontal orientation)
        )
    ])

    base_font = dict(family=_CHART_FONT_FAMILY, color=_CHART_FONT_COLOR)
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(base_font, size=12),
            tickformat='.1f',  # one decimal place on tick labels
            ticksuffix='%',
            tickfont=dict(base_font),
        ),
        yaxis=dict(
            ticksuffix=' ',  # space between idiom labels and bars
            tickfont=dict(base_font),
        ),
        font=dict(base_font),
    )
    # Hover text shows the idiom name and its percentage.
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig


def process_text(text):
    """Lemmatize *text* and build the two outputs displayed by the demo.

    Runs the module-level ``lemmatizer`` on ``text`` and returns a tuple
    ``(fig, df_tokens)``:

    * ``fig`` - a plotly figure with one horizontal bar per idiom score, in
      percent; the bar of the detected idiom uses a stronger colour.
    * ``df_tokens`` - a pandas DataFrame with one row per token and the columns
      "Token", "Lemma", "German translations" and "Morphological Analysis".
      Cells hold HTML/markdown fragments; text coming from the tokens, lemmas,
      translations and analyses is HTML-escaped before being embedded.
    """
    doc = lemmatizer(text)
    fig = _build_idiom_chart(doc.idiom_scores, doc.idiom.value)
    df_tokens = _build_token_table(_collect_token_analyses(doc))
    return fig, df_tokens
# Stylesheet handed to gr.Blocks: sizing rules for the results table
# (#full-width-table) and for an element with id "input-box".
# NOTE(review): no component in the code visible here sets elem_id="input-box",
# so the #input-box rules appear to have no target - confirm whether the
# "Input Text" Textbox was meant to carry that id.
_DEMO_CSS = """
/* ===== Table Styling ===== */
#full-width-table .wrap.svelte-drum8y,
#full-width-table table {
width: 100% !important;
table-layout: auto !important;
}
#full-width-table td,
#full-width-table th {
white-space: nowrap !important;
}
/* === Specific column width adjustments === */
#full-width-table table th:nth-child(1),
#full-width-table table td:nth-child(1) {
min-width: 200px !important; /* Word column */
}
#full-width-table table th:nth-child(2),
#full-width-table table td:nth-child(2) {
min-width: 200px !important; /* Lemma column */
}
#full-width-table table th:nth-child(3),
#full-width-table table td:nth-child(3) {
min-width: 200px !important; /* German translations column */
}
#full-width-table table th:nth-child(4),
#full-width-table table td:nth-child(4) {
min-width: 300px !important; /* Morphological Analysis column */
}
/* ===== Input box height control ===== */
#input-box {
display: flex !important;
flex-direction: column !important;
height: 360px !important; /* visually matches plot height ~400px */
overflow: hidden !important;
}
#input-box textarea {
flex-grow: 1 !important;
height: 100% !important;
max-height: 100% !important;
overflow-y: auto !important;
resize: none !important;
}
"""

with gr.Blocks(title="Lemmatizer", css=_DEMO_CSS) as demo:
    gr.Markdown("# RUMLEM - Romansh Lemmatizer Demo")

    # Collapsible README: text before the image, the image, text after it.
    with gr.Accordion("About This Demo", open=False):
        gr.Markdown(readme_before_image)
        gr.Image(
            IMAGE_URL,
            width=500,
            show_label=False,
            show_download_button=False,
            show_fullscreen_button=False,
            show_share_button=False,
        )
        gr.Markdown(readme_after_image)

    # Upper row: text entry with its button on the left, idiom chart on the right.
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5,
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # Below the row: per-token results table, cells rendered as markdown.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # keep Gradio from wrapping cell text
        elem_id="full-width-table",
    )

    # Run the analysis on button click and once when the page loads.
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table],
    )
    demo.load(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table],
    )

    # Example sentences come from a TSV file next to this script; each column
    # header is used as the label for the sentences in that column.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')
    # (sentence, column header) pairs, skipping missing and whitespace-only cells.
    examples_data = [
        (sentence, col)
        for col in df.columns
        for sentence in df[col].dropna()
        if sentence.strip()
    ]
    examples = [pair[0] for pair in examples_data]
    example_labels = [f"[{pair[1]}:] {pair[0]}" for pair in examples_data]

    # Clicking an example fills the textbox and runs the analysis right away.
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        cache_examples=False,
        cache_mode='eager',
        preload=0,
    )
# Launch the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    # The original kept `ssr_mode=False` commented out as an optional launch
    # argument; launch() is called with no arguments.
    demo.launch()