"""Gradio app: morphosyntactic parsing for Modern Greek dialects via Stanza."""
import gradio as gr
from gradio import update
import stanza
import pandas as pd
import requests
import traceback
from pathlib import Path

# 1. MODEL VARIANTS & INITIALIZATION

# Populated by initialize_models(): display name -> loaded stanza.Pipeline.
LESBIAN_MODELS = {}

# Display name -> Hugging Face repo id holding the trained model files.
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)": "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)": "sbompolas/Cretan",
}


def download_model_file(url, filename):
    """Stream *url* into *filename*.

    Returns True on success, False on any network/filesystem error
    (the error is printed, never raised — callers treat this as best-effort).
    """
    try:
        # timeout guards against a hung connection blocking startup forever
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return True
    except Exception as e:
        # Name the file that failed instead of the unhelpful "(unknown)".
        print(f"Download failed ({filename}): {e}")
        return False


def initialize_models():
    """Download (if missing) and load a stanza pipeline per model variant.

    Side effect: fills LESBIAN_MODELS with name -> stanza.Pipeline.
    Returns a (success: bool, message: str) pair.
    """
    try:
        base = Path("./models")
        base.mkdir(exist_ok=True)
        for name, repo in MODEL_VARIANTS.items():
            out = base / name
            out.mkdir(exist_ok=True)
            files = {
                "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
                "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
                "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
            }
            for fn, url in files.items():
                tgt = out / fn
                # Re-use cached files; abort on the first failed download.
                if not tgt.exists() and not download_model_file(url, str(tgt)):
                    return False, f"Failed to download {fn} for {name}"
            cfg = {
                'processors': 'tokenize,pos,lemma,depparse',
                'lang': 'el',
                'use_gpu': False,
                'verbose': False,
                'tokenize_model_path': str(out / "tokenizer.pt"),
                'pos_model_path': str(out / "pos.pt"),
                'lemma_model_path': str(out / "lemmatizer.pt"),
                'depparse_model_path': str(out / "depparse.pt"),
            }
            LESBIAN_MODELS[name] = stanza.Pipeline(**cfg)
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)


# Eager load at import time so the UI can report a failure message.
loaded, load_status = initialize_models()
# 2. CONLL-U / DATAFRAME

def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into 10-column CoNLL-U text.

    Each sentence gets `# sent_id` / `# text` comments, one tab-separated
    line per word, and a trailing blank line (CoNLL-U sentence separator).
    DEPS and MISC are always emitted as "_".
    """
    lines = []
    for sid, sent in enumerate(doc.sentences, 1):
        lines.append(f"# sent_id = {sid}")
        lines.append(f"# text = {sent.text}")
        for w in sent.words:
            fields = [
                str(w.id),
                w.text,
                w.lemma or "_",
                w.upos or "_",
                w.xpos or "_",
                w.feats or "_",
                str(w.head) if w.head is not None else "0",
                w.deprel or "_",
                "_",
                "_",
            ]
            lines.append("\t".join(fields))
        lines.append("")  # blank line after each sentence
    return "\n".join(lines)


def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Parse CoNLL-U text into a DataFrame for display.

    Sentence separators become all-empty rows; `# sent_id` / `# text`
    comments become rows with the comment reproduced in the ID column.
    Other comment lines (e.g. `# newdoc`) are ignored.
    """
    cols = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:  # empty separator row
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            # Only "# key = value" comments carry data; skipping the rest
            # avoids the unpack ValueError the old code raised on comments
            # without "=" (e.g. "# newdoc", "# newpar").
            if "=" in line:
                key, val = line[2:].split("=", 1)
                key, val = key.strip(), val.strip()
                if key == "sent_id":
                    rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
                elif key == "text":
                    rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts)))
    # Missing keys in partial rows become NaN; blank them for display.
    return pd.DataFrame(rows, columns=cols).fillna("")
# 3. FULL SVG BUILDER (crop top + bottom padding, arrows at start)

def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one sentence's dependency tree as an SVG string.

    sentence_data: list of token dicts (or a DataFrame) with keys
    ID/FORM/LEMMA/UPOS/FEATS/HEAD/DEPREL.  Returns SVG markup, or a
    <div> with the error message if rendering fails.

    NOTE(review): the SVG tag attributes were reconstructed — the original
    markup was lost in extraction; the geometry (spacing, arc heights,
    label positions) is preserved from the original code. Confirm visuals.
    """
    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n * base_w + (n - 1) * min_sp) / n)
        width = max(800, n * spacing + 100)
        orig_height = 500
        crop_top = 30    # px to remove from top
        bottom_pad = 30  # px to add at bottom
        height = orig_height - crop_top + bottom_pad
        word_y = height - 120
        feats_y = word_y + 35
        # deprel -> stroke color; unknown relations fall back to black.
        colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688',
        }
        svg = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg"><defs>'
        ]
        # One arrowhead marker per relation so each arc's head matches its color.
        for rel, c in colors.items():
            svg.append(
                f'<marker id="arrow-{rel}" markerWidth="10" markerHeight="10" refX="8" refY="3" orient="auto">'
                f'<path d="M0,0 L0,6 L9,3 z" fill="{c}"/></marker>'
            )
        svg.append('</defs>')
        # compute x positions (tokens laid out left to right by ID)
        xpos = {
            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
            for _, r in df.iterrows() if str(r['ID']).isdigit()
        }
        used_spans = []
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            i, h = int(r['ID']), int(r['HEAD'])
            rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
            x1 = xpos[i]
            if h == 0:
                # ROOT: vertical line dropping onto the token.
                svg.append(
                    f'<line x1="{x1}" y1="50" x2="{x1}" y2="{word_y - 15}" '
                    f'stroke="{c}" stroke-width="2" marker-end="url(#arrow-root)"/>'
                )
                mid = (word_y - 15 + 50) / 2
                svg.append(
                    f'<rect x="{x1 - 22}" y="{mid - 10}" width="44" height="16" fill="white"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid + 2}" text-anchor="middle" '
                    f'font-size="11" fill="{c}">ROOT</text>'
                )
            else:
                x2 = xpos.get(h, x1)
                span = (min(i, h), max(i, h))
                # Stack overlapping arcs: bump the level until this span
                # shares no level with any span it intersects.
                lvl = 0
                conflict = True
                while conflict:
                    conflict = False
                    for (es, el), used_lvl in used_spans:
                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                            lvl += 1
                            conflict = True
                            break
                used_spans.append((span, lvl))
                dist = abs(x2 - x1)
                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
                midx, cty = (x1 + x2) / 2, word_y - arc_h
                path_d = f'M{x1} {word_y - 15} Q{midx} {cty} {x2} {word_y - 15}'
                # marker-start: arrows drawn at the arc start (dependent side).
                svg.append(
                    f'<path d="{path_d}" fill="none" stroke="{c}" stroke-width="2" '
                    f'marker-start="url(#arrow-{rel})"/>'
                )
                # Quadratic Bezier point at t=0.5 — the arc apex, where the
                # relation label sits on a white backing rectangle.
                amx = 0.25 * x1 + 0.5 * midx + 0.25 * x2
                amy = 0.25 * (word_y - 15) + 0.5 * cty + 0.25 * (word_y - 15)
                lw = len(rel) * 6 + 8
                svg.append(
                    f'<rect x="{amx - lw / 2}" y="{amy - 10}" width="{lw}" height="14" fill="white"/>'
                )
                svg.append(
                    f'<text x="{amx}" y="{amy + 1}" text-anchor="middle" '
                    f'font-size="11" fill="{c}">{rel}</text>'
                )
        # draw tokens + annotations
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            x = xpos[int(r['ID'])]
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                f'font-size="14" font-weight="bold">{r["FORM"]}</text>'
            )
            # Annotation lines under the token: UPOS, lemma (if it differs
            # from the surface form), then each morphological feature.
            ann = []
            if r['UPOS'] and r['UPOS'] != '_':
                ann.append(f"upos={r['UPOS']}")
            if r['LEMMA'] not in ('_', r['FORM']):
                ann.append(f"lemma={r['LEMMA']}")
            if r['FEATS'] and r['FEATS'] not in ('', '_'):
                for f in r['FEATS'].split('|'):
                    if '=' in f:
                        ann.append(f)
            for i, a in enumerate(ann):
                svg.append(
                    f'<text x="{x}" y="{feats_y + i * 14}" text-anchor="middle" '
                    f'font-size="10" fill="#555">{a}</text>'
                )
        svg.append('</svg>')
        return "".join(svg)
    except Exception as e:
        return f"<div>Error creating SVG: {e}</div>"
" # 4. PROCESS & DROPDOWN-UPDATES def process_text(text, variant): if not text.strip(): return ( gr.HTML.update(value="

No data

"), gr.Dropdown.update(choices=[], value=None), [], "", pd.DataFrame() ) pipe = LESBIAN_MODELS.get(variant) if pipe is None: return ( gr.HTML.update(value="

Error: model not loaded

"), gr.Dropdown.update(choices=[], value=None), [], "", pd.DataFrame() ) doc = pipe(text) conllu = stanza_doc_to_conllu(doc) df = conllu_to_dataframe(conllu) sentences = [] for sent in doc.sentences: payload = [{ 'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_", 'UPOS': w.upos or "_", 'XPOS': w.xpos or "_", 'FEATS': w.feats or "_", 'HEAD': w.head or 0, 'DEPREL': w.deprel or "_" } for w in sent.words] sentences.append(payload) sent_ids = [str(i+1) for i in range(len(sentences))] dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None) init_svg = create_single_sentence_svg(sentences[0]) if sentences else "

No data

" return init_svg, dd_upd, sentences, conllu, df def update_svg(selected_id, sentences): try: idx = int(selected_id) - 1 return create_single_sentence_svg(sentences[idx]) except: return "

Invalid selection

" # 5. BUILD GRADIO UI def create_app(): with gr.Blocks(title="Parser for MG Dialects") as app: gr.Markdown("# Morphosyntactic Parser for MG Dialects") if not loaded: gr.Markdown(f"❌ Load error: {load_status}") with gr.Row(): with gr.Column(): txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…") mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()), value="Lesbian-only", label="Model Variant") btn = gr.Button("Parse", variant="primary") with gr.Row(): with gr.Column(): svg_out = gr.HTML("

No visualization

") sentence_dd = gr.Dropdown(label="Choose sentence", choices=[]) sentences_st = gr.State([]) with gr.Row(): with gr.Column(): conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True) table_out = gr.Dataframe(label="Token Table") btn.click( fn=process_text, inputs=[txt, mdl], outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out] ) sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out) return app if __name__ == "__main__": create_app().launch()