import gradio as gr
from gradio import update
import stanza
import pandas as pd
import requests
import traceback
from pathlib import Path

# 1. MODEL VARIANTS & INITIALIZATION

# Display name -> Hugging Face repo id; initialize_models() fills
# LESBIAN_MODELS with one ready stanza Pipeline per display name.
LESBIAN_MODELS = {}
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)": "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)": "sbompolas/Cretan",
}


def download_model_file(url, filename):
    """Stream *url* into *filename*; return True on success, False on any error."""
    try:
        # Stream in 8 KiB chunks so large .pt checkpoints never sit whole in
        # memory; the `with` block releases the HTTP connection even on error
        # (the original leaked the response object).
        with requests.get(url, stream=True) as resp:
            resp.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
        return True
    except Exception as e:
        # Report WHICH file failed; the original printed a literal "(unknown)".
        print(f"Download failed ({filename}): {e}")
        return False


def initialize_models():
    """Download all model variants and build a stanza Pipeline for each.

    Returns:
        (ok, message): ok is True only once every variant in MODEL_VARIANTS
        has its four .pt files on disk and a Pipeline stored in
        LESBIAN_MODELS; otherwise ok is False and message explains why.
    """
    try:
        base = Path("./models")
        base.mkdir(parents=True, exist_ok=True)
        for name, repo in MODEL_VARIANTS.items():
            out = base / name
            out.mkdir(exist_ok=True)
            files = {
                "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
                "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
                "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
            }
            for fn, url in files.items():
                tgt = out / fn
                # Files cached by a previous run are reused as-is.
                if not tgt.exists() and not download_model_file(url, str(tgt)):
                    return False, f"Failed to download {fn} for {name}"
            cfg = {
                'processors': 'tokenize,pos,lemma,depparse',
                'lang': 'el',
                'use_gpu': False,
                'verbose': False,
                'tokenize_model_path': str(out / "tokenizer.pt"),
                'pos_model_path': str(out / "pos.pt"),
                'lemma_model_path': str(out / "lemmatizer.pt"),
                'depparse_model_path': str(out / "depparse.pt"),
            }
            LESBIAN_MODELS[name] = stanza.Pipeline(**cfg)
        # Success is reported only after the loop, i.e. after EVERY variant
        # has been loaded — not just the first one.
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)


loaded, load_status = initialize_models()
# 2. CoNLL-U / DataFrame helpers


def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into CoNLL-U text.

    Emits "# sent_id" and "# text" comments, ten tab-separated columns per
    token, and a blank separator line after every sentence.
    """
    lines = []
    for sid, sent in enumerate(doc.sentences, 1):
        lines.append(f"# sent_id = {sid}")
        lines.append(f"# text = {sent.text}")
        for w in sent.words:
            fields = [
                str(w.id),
                w.text,
                w.lemma or "_",
                w.upos or "_",
                w.xpos or "_",
                w.feats or "_",
                str(w.head) if w.head is not None else "0",
                w.deprel or "_",
                "_",  # DEPS: enhanced dependencies are not produced here
                "_",  # MISC
            ]
            lines.append("\t".join(fields))
        lines.append("")  # blank line after each sentence
    return "\n".join(lines)


def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Parse CoNLL-U text into a display DataFrame.

    Comment lines (# sent_id / # text) and sentence separators become rows
    of their own so the rendered table mirrors the raw CoNLL-U layout.
    Token lines with fewer than 10 columns are silently dropped.
    """
    cols = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:  # empty separator row
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            # Only "# key = value" comments carry data; skip any other
            # comment instead of crashing on the missing "=" (the original
            # raised ValueError on e.g. "# newdoc").
            if "=" in line:
                key, val = line[2:].split("=", 1)
                key, val = key.strip(), val.strip()
                if key == "sent_id":
                    rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
                elif key == "text":
                    rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts)))
    # Comment/separator rows only set some keys; fillna blanks the rest.
    return pd.DataFrame(rows, columns=cols).fillna("")
# 3. FULL SVG BUILDER (crop top + bottom padding, arrows at start)


def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an inline SVG dependency graph.

    Args:
        sentence_data: list of token dicts (ID/FORM/LEMMA/UPOS/XPOS/FEATS/
            HEAD/DEPREL) or an equivalent DataFrame.
        sentence_num, total_sentences: kept for interface compatibility.

    Returns:
        SVG markup as a string, or an error ``<p>`` on failure.

    NOTE(review): the original drawing markup was lost during extraction
    (the source contained an unbalanced ``svg = [ f'')``).  This
    reconstruction keeps the surviving layout constants (30px top crop,
    30px bottom padding, the deprel colour table, word/feature baselines)
    and draws words, POS tags and quadratic dependency arcs with the
    arrowhead at the arc start — confirm visuals against the deployed app.
    """
    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        if n == 0:
            return "<p>No tokens to display</p>"
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n * base_w + (n - 1) * min_sp) / n)
        width = max(800, n * spacing + 100)
        orig_height = 500
        crop_top = 30    # px to remove from top
        bottom_pad = 30  # px to add at bottom
        height = orig_height - crop_top + bottom_pad
        word_y = height - 120
        feats_y = word_y + 35
        colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688',
        }
        svg = [
            f'<svg width="{width}" height="{height}" '
            f'viewBox="0 0 {width} {height}" xmlns="http://www.w3.org/2000/svg">',
            # Arrowhead marker, referenced at the START of each edge.
            '<defs><marker id="arrow" markerWidth="8" markerHeight="8" '
            'refX="4" refY="4" orient="auto">'
            '<path d="M0,0 L8,4 L0,8 z" fill="#555555"/></marker></defs>',
        ]
        # Horizontal centre of each token, keyed by its CoNLL-U ID.
        x_of = {}
        for i, (_, tok) in enumerate(df.iterrows()):
            x = 50 + i * spacing + spacing / 2
            x_of[int(tok['ID'])] = x
            svg.append(f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                       f'font-size="16" font-weight="bold">{tok["FORM"]}</text>')
            svg.append(f'<text x="{x}" y="{feats_y}" text-anchor="middle" '
                       f'font-size="12" fill="#666666">{tok["UPOS"]}</text>')
        # Dependency arcs: quadratic curves above the words; the root token
        # gets a vertical drop line from the (cropped) top instead.
        for _, tok in df.iterrows():
            dep_id = int(tok['ID'])
            head_id = int(tok['HEAD'])
            rel = str(tok['DEPREL'])
            color = colors.get(rel.split(':')[0], '#555555')
            if head_id == 0:
                x = x_of[dep_id]
                svg.append(f'<line x1="{x}" y1="20" x2="{x}" y2="{word_y - 25}" '
                           f'stroke="{color}" stroke-width="1.5" marker-start="url(#arrow)"/>')
                svg.append(f'<text x="{x + 4}" y="45" font-size="11" fill="{color}">root</text>')
            elif head_id in x_of:
                x1, x2 = x_of[head_id], x_of[dep_id]
                mid = (x1 + x2) / 2
                # Longer spans arc higher (smaller y), clamped below the top edge.
                peak = max(30, word_y - 35 - abs(x2 - x1) / 3)
                svg.append(f'<path d="M {x1} {word_y - 25} Q {mid} {peak} {x2} {word_y - 25}" '
                           f'fill="none" stroke="{color}" stroke-width="1.5" '
                           f'marker-start="url(#arrow)"/>')
                svg.append(f'<text x="{mid}" y="{peak + 12}" text-anchor="middle" '
                           f'font-size="11" fill="{color}">{rel}</text>')
        svg.append('</svg>')
        return "".join(svg)
    except Exception as e:
        return f"<p>Error creating SVG: {e}</p>"


# 4. PROCESS & DROPDOWN UPDATES


def process_text(text, variant):
    """Parse *text* with the model chosen in the UI.

    Returns the five values wired to the Gradio outputs: SVG HTML for the
    first sentence, a dropdown update listing sentence numbers, the
    per-sentence token payloads (State), the CoNLL-U text, and the token
    DataFrame.  Uses gr.update(): the gr.HTML.update / gr.Dropdown.update
    class methods of the original were removed in Gradio 4.
    """
    if not text.strip():
        return (
            gr.update(value="<p>No data</p>"),
            gr.update(choices=[], value=None),
            [], "", pd.DataFrame(),
        )
    pipe = LESBIAN_MODELS.get(variant)
    if pipe is None:
        return (
            gr.update(value="<p>Error: model not loaded</p>"),
            gr.update(choices=[], value=None),
            [], "", pd.DataFrame(),
        )
    doc = pipe(text)
    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)
    sentences = []
    for sent in doc.sentences:
        payload = [{
            'ID': w.id,
            'FORM': w.text,
            'LEMMA': w.lemma or "_",
            'UPOS': w.upos or "_",
            'XPOS': w.xpos or "_",
            'FEATS': w.feats or "_",
            'HEAD': w.head or 0,
            'DEPREL': w.deprel or "_",
        } for w in sent.words]
        sentences.append(payload)
    sent_ids = [str(i + 1) for i in range(len(sentences))]
    dd_upd = gr.update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"
    return init_svg, dd_upd, sentences, conllu, df


def update_svg(selected_id, sentences):
    """Redraw the SVG for the 1-based sentence number chosen in the dropdown."""
    try:
        idx = int(selected_id) - 1
        return create_single_sentence_svg(sentences[idx])
    except (TypeError, ValueError, IndexError):
        # Narrowed from a bare `except:` so unrelated bugs still surface.
        return "<p>Invalid selection</p>"


# 5. BUILD GRADIO UI


def create_app():
    """Assemble and return the Gradio Blocks interface."""
    with gr.Blocks(title="Parser for MG Dialects") as app:
        gr.Markdown("# Morphosyntactic Parser for MG Dialects")
        if not loaded:
            gr.Markdown(f"❌ Load error: {load_status}")
        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(label="Input Text", lines=4,
                                 placeholder="Εισάγετε κείμενο…")
                # Default to the first declared variant.  The original
                # hard-coded value="Lesbian-only", which is NOT one of the
                # radio choices (keys carry the UD suffix), so no model was
                # preselected and process_text got an unknown variant.
                mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
                               value=next(iter(MODEL_VARIANTS)),
                               label="Model Variant")
                btn = gr.Button("Parse", variant="primary")
        with gr.Row():
            with gr.Column():
                svg_out = gr.HTML("<p>No visualization</p>")
                sentence_dd = gr.Dropdown(label="Choose sentence", choices=[])
                sentences_st = gr.State([])
        with gr.Row():
            with gr.Column():
                conllu_out = gr.Textbox(label="CoNLL-U", lines=10,
                                        show_copy_button=True)
                table_out = gr.Dataframe(label="Token Table")
        btn.click(
            fn=process_text,
            inputs=[txt, mdl],
            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out],
        )
        sentence_dd.change(fn=update_svg,
                           inputs=[sentence_dd, sentences_st],
                           outputs=svg_out)
    return app


if __name__ == "__main__":
    create_app().launch()