File size: 12,268 Bytes
9391975
45e2824
defdd5b
fcb6cc9
8c61d1b
6394ec4
8c61d1b
a600f21
08f1dd3
 
67b8f64
45e2824
a567e39
 
36170de
51ce54e
45e2824
9a96586
67b8f64
fe64a8e
6394ec4
 
45e2824
6394ec4
fe64a8e
 
 
67b8f64
fe64a8e
 
45e2824
fe64a8e
e4bdfd5
 
67b8f64
 
45e2824
 
67b8f64
 
 
 
45e2824
cbb8031
4670ce3
45e2824
67b8f64
45e2824
 
a97b4db
 
 
45e2824
 
 
67b8f64
45e2824
67b8f64
45e2824
fe64a8e
 
 
9a96586
45e2824
3662a0e
e13e21e
67b8f64
08f1dd3
96e10be
fb5b190
e4bdfd5
fb5b190
6394ec4
 
defdd5b
67b8f64
 
f73ba75
67b8f64
e4bdfd5
defdd5b
fb5b190
67b8f64
fb5b190
96e10be
103058d
4670ce3
cbb8031
08f1dd3
67b8f64
 
4670ce3
383a058
08f1dd3
e13e21e
 
 
 
 
 
cbb8031
08f1dd3
a97b4db
e13e21e
 
 
a9ac3b9
67b8f64
08f1dd3
 
 
 
e13e21e
 
 
 
67b8f64
 
 
 
e13e21e
67b8f64
 
e13e21e
 
67b8f64
e13e21e
 
 
 
 
08f1dd3
 
 
67b8f64
 
08f1dd3
e13e21e
08f1dd3
e13e21e
0eced5a
e13e21e
08f1dd3
67b8f64
08f1dd3
67b8f64
e13e21e
67b8f64
 
4670ce3
08f1dd3
 
e13e21e
67b8f64
 
e13e21e
 
 
 
67b8f64
 
 
 
 
 
 
 
 
 
 
 
 
 
08f1dd3
e13e21e
67b8f64
 
08f1dd3
 
 
67b8f64
 
08f1dd3
 
 
67b8f64
 
 
 
 
08f1dd3
67b8f64
e13e21e
08f1dd3
 
67b8f64
 
 
 
 
 
 
 
 
 
08f1dd3
67b8f64
e13e21e
67b8f64
 
e13e21e
67b8f64
 
 
 
08f1dd3
67b8f64
 
 
 
 
 
 
 
 
 
 
9b63c71
67b8f64
 
08f1dd3
 
 
e13e21e
67b8f64
45e2824
 
 
3662a0e
a9ac3b9
d7c677a
e13e21e
3662a0e
67b8f64
a97b4db
3662a0e
cbb8031
d7c677a
e13e21e
3662a0e
67b8f64
fb5b190
67b8f64
f73ba75
d7c677a
fb5b190
08f1dd3
67b8f64
 
 
 
 
 
 
 
08f1dd3
d7c677a
 
a97b4db
67b8f64
a97b4db
 
e13e21e
d7c677a
a97b4db
d7c677a
08f1dd3
d7c677a
 
cbb8031
d7c677a
e13e21e
67b8f64
08f1dd3
d7c677a
51ce54e
 
67b8f64
d7c677a
 
 
 
 
67b8f64
 
 
d7c677a
 
 
67b8f64
 
 
d7c677a
 
 
0eced5a
a97b4db
 
67b8f64
 
 
 
 
 
d7c677a
 
 
a97b4db
d7c677a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import gradio as gr
from gradio import update
import stanza
import pandas as pd
import requests
import traceback
from pathlib import Path

# 1. MODEL VARIANTS & INITIALIZATION

# Filled by initialize_models(): display-name -> loaded stanza.Pipeline.
LESBIAN_MODELS = {}
# Display-name (shown in the UI radio) -> Hugging Face repo id holding the
# four .pt processor files for that dialect model.
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)":           "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)":      "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)":      "sbompolas/Cretan"
}

def download_model_file(url, filename):
    """Stream *url* to *filename* in 8 KiB chunks.

    Returns True on success, False on any failure. Fixes vs. the previous
    version: a connect/read timeout so a stalled connection cannot hang
    startup forever, and removal of any partially written file on failure
    (otherwise initialize_models' ``tgt.exists()`` check would treat the
    truncated model file as complete on the next run).
    """
    try:
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"Download failed for {url}: {e}")
        # Drop any partial file so a retry re-downloads it.
        Path(filename).unlink(missing_ok=True)
        return False

def initialize_models():
    """Download any missing model files and build one stanza Pipeline per variant.

    Populates LESBIAN_MODELS in place and returns ``(ok, status_message)``.
    Files already present under ./models/<variant>/ are not re-downloaded.
    """
    try:
        model_root = Path("./models")
        model_root.mkdir(exist_ok=True)
        # Processor files fetched per variant, in this order.
        component_files = ("tokenizer.pt", "lemmatizer.pt", "pos.pt", "depparse.pt")
        for variant_name, repo_id in MODEL_VARIANTS.items():
            variant_dir = model_root / variant_name
            variant_dir.mkdir(exist_ok=True)
            for component in component_files:
                target = variant_dir / component
                if target.exists():
                    continue
                url = f"https://huggingface.co/{repo_id}/resolve/main/{component}"
                if not download_model_file(url, str(target)):
                    return False, f"Failed to download {component} for {variant_name}"
            LESBIAN_MODELS[variant_name] = stanza.Pipeline(
                processors='tokenize,pos,lemma,depparse',
                lang='el',
                use_gpu=False,
                verbose=False,
                tokenize_model_path=str(variant_dir / "tokenizer.pt"),
                pos_model_path=str(variant_dir / "pos.pt"),
                lemma_model_path=str(variant_dir / "lemmatizer.pt"),
                depparse_model_path=str(variant_dir / "depparse.pt"),
            )
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)

# Eagerly load all pipelines at import time; create_app() surfaces
# `load_status` in the UI when `loaded` is False.
loaded, load_status = initialize_models()


# 2. CONLL-U / DATAFRAME

def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into CoNLL-U text.

    Emits ``# sent_id`` / ``# text`` comments per sentence, the ten
    standard columns per word (DEPS and MISC always "_"), and a blank
    line after each sentence.
    """
    out = []
    for sent_num, sentence in enumerate(doc.sentences, start=1):
        out.append(f"# sent_id = {sent_num}")
        out.append(f"# text = {sentence.text}")
        for word in sentence.words:
            head = "0" if word.head is None else str(word.head)
            out.append("\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",
                "_",
            ]))
        out.append("")  # sentence separator
    return "\n".join(out)

def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert CoNLL-U text into a 10-column token DataFrame.

    ``# sent_id`` / ``# text`` comments become header rows carried in the
    ID column, blank lines become empty separator rows, and token lines
    fill all columns.  Fix: comment lines without "=" (e.g. ``# newpar``,
    legal CoNLL-U) previously crashed the two-value unpack; they are now
    skipped.  Also parses the comment key after ``"#"`` rather than
    assuming an exact ``"# "`` prefix.
    """
    cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:
            # empty separator row
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            if "=" not in line:
                # e.g. "# newpar" — nothing to display, skip instead of raising
                continue
            key, val = line.lstrip("#").split("=", 1)
            key, val = key.strip(), val.strip()
            if key == "sent_id":
                rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
            elif key == "text":
                rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts[:10])))
    # Header rows only set ID/FORM; fillna blanks the remaining columns.
    return pd.DataFrame(rows, columns=cols).fillna("")


# 3. FULL SVG BUILDER (crop top + bottom padding, arrows at start)

def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an SVG dependency graph.

    sentence_data: a list of per-token dicts (or a DataFrame) with keys
    ID/FORM/LEMMA/UPOS/XPOS/FEATS/HEAD/DEPREL as built by process_text.
    sentence_num / total_sentences are accepted but unused below — kept
    for interface compatibility.

    Returns the ``<svg>…</svg>`` markup string, or an error ``<p>``
    snippet if anything raises (e.g. an empty token list divides by zero
    in the spacing formula).
    """
    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        # Horizontal layout: at least base_w px per token, min_sp px between tokens.
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
        width = max(800, n*spacing + 100)
        # The viewBox starts at crop_top, hiding the top margin of the
        # original 500-px canvas; bottom_pad leaves room for annotations.
        orig_height = 500
        crop_top   = 30  # px to remove from top
        bottom_pad = 30  # px to add at bottom
        height     = orig_height - crop_top + bottom_pad

        word_y   = height - 120  # baseline of the token text
        feats_y  = word_y + 35   # first annotation row under each token

        # Stroke/label colour per relation; unknown relations fall back to black.
        colors = {
            'root':'#000000','nsubj':'#2980b9','obj':'#27ae60','det':'#e67e22',
            'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
            'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
            'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
            'advmod':'#9c27b0','obl':'#795548','iobj':'#607d8b','fixed':'#ff5722',
            'aux':'#ff9800','acl':'#4caf50','appos':'#673ab7','compound':'#009688'
        }

        svg = [
            f'<svg width="{width}" height="{height}" viewBox="0 {crop_top} {width} {orig_height}" '
            'xmlns="http://www.w3.org/2000/svg" style="background:white;border:1px solid #eee;"><defs>'
        ]
        # One arrowhead <marker> per relation colour, referenced via
        # marker-start on each arc path below (arrow sits at the arc start).
        for rel, c in colors.items():
            svg.append(
                f'<marker id="m_{rel}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
            )
        svg.append('</defs><g>')

        # compute x positions (only numeric IDs; comment/separator rows skipped)
        xpos = {
            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
            for _, r in df.iterrows() if str(r['ID']).isdigit()
        }

        used_spans = []
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit(): 
                continue
            i, h = int(r['ID']), int(r['HEAD'])
            rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
            x1 = xpos[i]
            if h == 0:
                # ROOT line: vertical stroke up to y=50 with a boxed "ROOT" label midway
                svg.append(
                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                    f'stroke="{c}" stroke-width="1.5"/>'
                )
                mid = (word_y-15 + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
            else:
                x2 = xpos.get(h, x1)
                span = (min(i, h), max(i, h))
                # Stack overlapping arcs: raise this arc's level until it no
                # longer shares a level with any already-placed arc whose
                # token span overlaps this one.
                lvl = 0
                conflict = True
                while conflict:
                    conflict = False
                    for (es, el), used_lvl in used_spans:
                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                            lvl += 1
                            conflict = True
                            break
                used_spans.append((span, lvl))
                dist = abs(x2 - x1)
                # Arc height grows with horizontal distance (capped at 100)
                # plus 35 px per stacking level.
                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
                midx, cty = (x1 + x2) / 2, word_y - arc_h
                path_d = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                svg.append(
                    f'<path d="{path_d}" stroke="{c}" fill="none" stroke-width="1.5" '
                    f'marker-start="url(#m_{rel})"/>'
                )
                # Quadratic Bézier point at t=0.5 (0.25·P0 + 0.5·C + 0.25·P2):
                # the arc apex, where the relation label box is centred.
                amx = 0.25*x1 + 0.5*midx + 0.25*x2
                amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                lw = len(rel)*6 + 8  # rough label width: 6 px per char + padding
                svg.append(
                    f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>'
                )

        # draw tokens + annotations (upos, lemma when it differs, and FEATS pairs)
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            x = xpos[int(r['ID'])]
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>'
            )
            ann = []
            if r['UPOS'] and r['UPOS'] != '_': ann.append(f"upos={r['UPOS']}")
            if r['LEMMA'] not in ('_', r['FORM']): ann.append(f"lemma={r['LEMMA']}")
            if r['FEATS'] and r['FEATS'] not in ('', '_'):
                for f in r['FEATS'].split('|'):
                    if '=' in f:
                        ann.append(f)
            # One small grey line per annotation, 12 px apart below the token.
            for i, a in enumerate(ann):
                svg.append(
                    f'<text x="{x}" y="{feats_y + i*12}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{a}</text>'
                )

        svg.append('</g></svg>')
        return "".join(svg)
    except Exception as e:
        return f"<p>Error creating SVG: {e}</p>"


# 4. PROCESS & DROPDOWN-UPDATES

def process_text(text, variant):
    """Run the selected pipeline on *text*.

    Returns (svg_html, dropdown_update, sentences_state, conllu_text,
    token_dataframe).  Fix: the early-return paths used the classmethods
    ``gr.HTML.update`` / ``gr.Dropdown.update``, which were removed in
    Gradio 4.x (AttributeError at runtime), while the success path already
    used the modern ``update(...)`` helper imported at the top of the
    file.  All paths now return a plain value for the HTML component and
    ``update(...)`` for the dropdown, matching the success path.
    """
    if not text.strip():
        return (
            "<p>No data</p>",
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )
    pipe = LESBIAN_MODELS.get(variant)
    if pipe is None:
        return (
            "<p>Error: model not loaded</p>",
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )
    doc = pipe(text)
    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)

    # Flatten each sentence into the per-token dict payload the SVG builder expects.
    sentences = []
    for sent in doc.sentences:
        payload = [{
            'ID':     w.id,
            'FORM':   w.text,
            'LEMMA':  w.lemma or "_",
            'UPOS':   w.upos or "_",
            'XPOS':   w.xpos or "_",
            'FEATS':  w.feats or "_",
            'HEAD':   w.head or 0,
            'DEPREL': w.deprel or "_"
        } for w in sent.words]
        sentences.append(payload)

    # Dropdown choices are 1-based sentence numbers as strings.
    sent_ids = [str(i+1) for i in range(len(sentences))]
    dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"

    return init_svg, dd_upd, sentences, conllu, df

def update_svg(selected_id, sentences):
    """Re-render the SVG for the sentence picked in the dropdown.

    selected_id is the 1-based sentence number as a string (or None when
    the dropdown is cleared).  Fix: the bare ``except:`` also swallowed
    KeyboardInterrupt/SystemExit; only the expected failures are caught.
    """
    try:
        idx = int(selected_id) - 1
        return create_single_sentence_svg(sentences[idx])
    except (TypeError, ValueError, IndexError):
        # None / non-numeric selection, or index past the parsed sentences
        return "<p>Invalid selection</p>"


# 5. BUILD GRADIO UI

def create_app():
    """Build the Gradio Blocks UI and wire the parse/select events.

    Fix: the model radio's default was ``value="Lesbian-only"``, which is
    not one of its choices (the MODEL_VARIANTS keys are the long display
    names such as "Lesbian-only (UD_Greek-Lesbian)"), so the default
    selection was invalid.  The default is now the first actual choice.
    """
    with gr.Blocks(title="Parser for MG Dialects") as app:
        gr.Markdown("# Morphosyntactic Parser for MG Dialects")

        if not loaded:
            gr.Markdown(f"❌ Load error: {load_status}")

        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
                variant_choices = list(MODEL_VARIANTS.keys())
                mdl = gr.Radio(choices=variant_choices,
                               value=variant_choices[0],
                               label="Model Variant")
                btn = gr.Button("Parse", variant="primary")

        with gr.Row():
            with gr.Column():
                svg_out      = gr.HTML("<p>No visualization</p>")
                sentence_dd  = gr.Dropdown(label="Choose sentence", choices=[])
                sentences_st = gr.State([])  # per-sentence token payloads from process_text

        with gr.Row():
            with gr.Column():
                conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                table_out  = gr.Dataframe(label="Token Table")

        btn.click(
            fn=process_text,
            inputs=[txt, mdl],
            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out]
        )
        sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out)

    return app

# Entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_app().launch()