|
|
import gradio as gr |
|
|
from gradio import update |
|
|
import stanza |
|
|
import pandas as pd |
|
|
import requests |
|
|
import traceback |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
|
|
|
# Cache of loaded stanza pipelines, keyed by the human-readable variant
# names below; populated once by initialize_models() at import time.
LESBIAN_MODELS = {}

# Display name (shown in the UI radio) -> Hugging Face repo id holding the
# four .pt processor files for that model variant.
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)": "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)": "sbompolas/Cretan"
}
|
|
|
|
|
def download_model_file(url, filename):
    """Download one model file from *url* to the local path *filename*.

    The response is streamed in 8 KiB chunks so large ``.pt`` files are
    never held in memory at once.

    Args:
        url: Direct download URL (a Hugging Face ``resolve/main`` link).
        filename: Local filesystem path to write to.

    Returns:
        True on success, False on any network or filesystem failure.
        Errors are printed rather than raised so the caller can decide
        whether to abort the whole initialization.
    """
    try:
        # A timeout prevents the whole app from hanging forever on a
        # stalled connection (the original call had none).
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return True
    except Exception as e:
        # Name the URL so a failed file is identifiable in the logs
        # (the old message just said "unknown").
        print(f"Download failed for {url}: {e}")
        return False
|
|
|
|
|
def initialize_models():
    """Download (if needed) and load one stanza pipeline per model variant.

    For each entry in MODEL_VARIANTS, fetches the four processor files
    from Hugging Face into ``./models/<variant>/`` (skipping files that
    already exist) and builds a CPU stanza Pipeline, stored in the
    module-level LESBIAN_MODELS dict.

    Returns:
        (True, "Models loaded") on success, or (False, <reason>) on the
        first failed download or any unexpected exception.
    """
    processor_files = ("tokenizer.pt", "lemmatizer.pt", "pos.pt", "depparse.pt")
    try:
        model_root = Path("./models")
        model_root.mkdir(exist_ok=True)
        for variant_name, repo_id in MODEL_VARIANTS.items():
            variant_dir = model_root / variant_name
            variant_dir.mkdir(exist_ok=True)
            # Fetch any processor file that is not already cached locally.
            for file_name in processor_files:
                target = variant_dir / file_name
                if target.exists():
                    continue
                url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
                if not download_model_file(url, str(target)):
                    return False, f"Failed to download {file_name} for {variant_name}"
            LESBIAN_MODELS[variant_name] = stanza.Pipeline(
                processors='tokenize,pos,lemma,depparse',
                lang='el',
                use_gpu=False,
                verbose=False,
                tokenize_model_path=str(variant_dir / "tokenizer.pt"),
                pos_model_path=str(variant_dir / "pos.pt"),
                lemma_model_path=str(variant_dir / "lemmatizer.pt"),
                depparse_model_path=str(variant_dir / "depparse.pt"),
            )
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)
|
|
|
|
|
loaded, load_status = initialize_models() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into CoNLL-U formatted text.

    Every sentence is preceded by ``# sent_id`` and ``# text`` comment
    lines and followed by a blank separator line. Each word becomes one
    10-column tab-separated row; missing annotations are rendered as
    "_" and a missing head as "0".
    """
    out = []
    sent_id = 0
    for sent in doc.sentences:
        sent_id += 1
        out.append(f"# sent_id = {sent_id}")
        out.append(f"# text = {sent.text}")
        for word in sent.words:
            head = "0" if word.head is None else str(word.head)
            row = "\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",   # DEPS: enhanced dependencies not produced here
                "_",   # MISC
            ])
            out.append(row)
        out.append("")  # blank line terminates the sentence block
    return "\n".join(out)
|
|
|
|
|
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Parse CoNLL-U text into a flat DataFrame for UI display.

    ``# sent_id`` / ``# text`` comment lines are kept as header rows in
    the ID column, blank sentence separators become all-empty rows, and
    token lines fill all ten CoNLL-U columns.

    Args:
        conllu: CoNLL-U formatted text (as produced by
            stanza_doc_to_conllu, but any valid CoNLL-U is accepted).

    Returns:
        DataFrame with columns ID..MISC; missing cells are "".
    """
    cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:
            # Keep sentence boundaries visible as empty rows.
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            # BUG FIX: split("=", 1) raised ValueError on comment lines
            # without "=" (e.g. "# newpar", which is legal CoNLL-U).
            # partition() never fails; unknown comments are skipped.
            key, sep, val = line[2:].partition("=")
            if sep:
                key, val = key.strip(), val.strip()
                if key == "sent_id":
                    rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
                elif key == "text":
                    rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts)))
    # fillna("") blanks the columns the comment rows never set.
    return pd.DataFrame(rows, columns=cols).fillna("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as a dependency-tree SVG string.

    Word forms sit on a baseline near the bottom of the canvas; each
    dependency relation is drawn as a quadratic arc above the words with
    a small colored label box at its apex, and the root word gets a
    vertical line up to a "ROOT" label. Lemma/UPOS/FEATS annotations are
    printed in small grey text under each word.

    Args:
        sentence_data: list of per-word dicts with keys ID, FORM, LEMMA,
            UPOS, XPOS, FEATS, HEAD, DEPREL (or an equivalent DataFrame).
            NOTE(review): IDs are assumed to be consecutive 1..n ints —
            rows with non-digit IDs (e.g. MWT ranges) are skipped.
        sentence_num: currently unused; kept for caller compatibility.
        total_sentences: currently unused; kept for caller compatibility.

    Returns:
        An ``<svg>…</svg>`` markup string, or an error ``<p>`` paragraph
        if anything goes wrong (malformed rows, missing keys, etc.).
    """
    try:
        # Accept either a list of dicts or a ready-made DataFrame.
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        # Horizontal layout: at least base_w px per word, plus min_sp gap.
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
        width = max(800, n*spacing + 100)
        orig_height = 500
        # The viewBox crops crop_top px off the top of the nominal canvas
        # and bottom_pad leaves room for the feature annotation rows.
        crop_top = 30
        bottom_pad = 30
        height = orig_height - crop_top + bottom_pad

        word_y = height - 120   # baseline y for the word forms
        feats_y = word_y + 35   # first annotation row under the words

        # One fixed color per UD relation; unknown relations fall back to
        # black ('#000') via colors.get() below.
        colors = {
            'root':'#000000','nsubj':'#2980b9','obj':'#27ae60','det':'#e67e22',
            'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
            'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
            'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
            'advmod':'#9c27b0','obl':'#795548','iobj':'#607d8b','fixed':'#ff5722',
            'aux':'#ff9800','acl':'#4caf50','appos':'#673ab7','compound':'#009688'
        }

        # SVG header plus one <marker> (arrowhead) definition per relation
        # color, referenced by the arc paths via marker-start.
        svg = [
            f'<svg width="{width}" height="{height}" viewBox="0 {crop_top} {width} {orig_height}" '
            'xmlns="http://www.w3.org/2000/svg" style="background:white;border:1px solid #eee;"><defs>'
        ]
        for rel, c in colors.items():
            svg.append(
                f'<marker id="m_{rel}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
            )
        svg.append('</defs><g>')

        # x coordinate of each word's center, keyed by its integer ID.
        xpos = {
            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
            for _, r in df.iterrows() if str(r['ID']).isdigit()
        }

        # (span, level) pairs of already-drawn arcs, used to stack
        # overlapping arcs at increasing heights.
        used_spans = []
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            i, h = int(r['ID']), int(r['HEAD'])
            rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
            x1 = xpos[i]
            if h == 0:
                # Root word: vertical line up to a boxed "ROOT" label.
                svg.append(
                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                    f'stroke="{c}" stroke-width="1.5"/>'
                )
                mid = (word_y-15 + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
            else:
                # Dependent word: quadratic arc from word to its head.
                # Missing heads fall back to the word's own x (degenerate arc).
                x2 = xpos.get(h, x1)
                span = (min(i, h), max(i, h))
                # Find the lowest stacking level whose arcs this span does
                # not overlap; each conflict pushes the arc one level up.
                lvl = 0
                conflict = True
                while conflict:
                    conflict = False
                    for (es, el), used_lvl in used_spans:
                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                            lvl += 1
                            conflict = True
                            break
                used_spans.append((span, lvl))
                dist = abs(x2 - x1)
                # Arc height grows with distance (capped) plus 35px per level.
                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
                midx, cty = (x1 + x2) / 2, word_y - arc_h
                path_d = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                svg.append(
                    f'<path d="{path_d}" stroke="{c}" fill="none" stroke-width="1.5" '
                    f'marker-start="url(#m_{rel})"/>'
                )
                # Label box at the Bezier midpoint (t=0.5 of the quadratic).
                amx = 0.25*x1 + 0.5*midx + 0.25*x2
                amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                lw = len(rel)*6 + 8   # rough text width: ~6px per character
                svg.append(
                    f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>'
                )

        # Second pass: word forms on the baseline plus small annotation
        # rows (upos, lemma when it differs from the form, and each FEATS
        # key=value pair) stacked below each word.
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            x = xpos[int(r['ID'])]
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>'
            )
            ann = []
            if r['UPOS'] and r['UPOS'] != '_': ann.append(f"upos={r['UPOS']}")
            if r['LEMMA'] not in ('_', r['FORM']): ann.append(f"lemma={r['LEMMA']}")
            if r['FEATS'] and r['FEATS'] not in ('', '_'):
                for f in r['FEATS'].split('|'):
                    if '=' in f:
                        ann.append(f)
            for i, a in enumerate(ann):
                svg.append(
                    f'<text x="{x}" y="{feats_y + i*12}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{a}</text>'
                )

        svg.append('</g></svg>')
        return "".join(svg)
    except Exception as e:
        # Render failures as visible HTML rather than crashing the UI.
        return f"<p>Error creating SVG: {e}</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_text(text, variant):
    """Parse *text* with the chosen model variant and build all UI outputs.

    Args:
        text: Raw input text (Greek).
        variant: Key into MODEL_VARIANTS / LESBIAN_MODELS.

    Returns:
        A 5-tuple matching the Gradio outputs list:
        (SVG html, sentence-dropdown update, per-sentence word dicts for
        gr.State, CoNLL-U string, token DataFrame).
    """
    def _error(message):
        # Shared output shape for the two early-exit paths below.
        # BUG FIX: the originals called gr.HTML.update / gr.Dropdown.update,
        # a class-level API removed in Gradio 4.x (AttributeError at
        # runtime). The module-level `update` (already imported and used
        # for the success path) works on both 3.x and 4.x.
        return (
            update(value=message),
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )

    if not text.strip():
        return _error("<p>No data</p>")
    pipe = LESBIAN_MODELS.get(variant)
    if pipe is None:
        return _error("<p>Error: model not loaded</p>")

    doc = pipe(text)
    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)

    # Flatten each sentence into the dict rows the SVG renderer expects.
    sentences = []
    for sent in doc.sentences:
        payload = [{
            'ID': w.id,
            'FORM': w.text,
            'LEMMA': w.lemma or "_",
            'UPOS': w.upos or "_",
            'XPOS': w.xpos or "_",
            'FEATS': w.feats or "_",
            'HEAD': w.head or 0,
            'DEPREL': w.deprel or "_"
        } for w in sent.words]
        sentences.append(payload)

    # Dropdown lists 1-based sentence numbers; preselect the first one.
    sent_ids = [str(i+1) for i in range(len(sentences))]
    dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"

    return init_svg, dd_upd, sentences, conllu, df
|
|
|
|
|
def update_svg(selected_id, sentences): |
|
|
try: |
|
|
idx = int(selected_id) - 1 |
|
|
return create_single_sentence_svg(sentences[idx]) |
|
|
except: |
|
|
return "<p>Invalid selection</p>" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_app():
    """Build the Gradio Blocks UI for the dialect parser.

    Layout: input textbox + model radio + parse button, then the SVG
    visualization with a sentence selector, then the raw CoNLL-U text
    and a token table.

    Returns:
        The constructed (unlaunched) gr.Blocks application.
    """
    variant_names = list(MODEL_VARIANTS.keys())
    with gr.Blocks(title="Parser for MG Dialects") as app:
        gr.Markdown("# Morphosyntactic Parser for MG Dialects")

        # Surface model-loading failures directly in the page.
        if not loaded:
            gr.Markdown(f"❌ Load error: {load_status}")

        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
                # BUG FIX: the default was "Lesbian-only", which is not one
                # of the choices (keys are e.g. "Lesbian-only
                # (UD_Greek-Lesbian)"), so no option was preselected and
                # parsing with the default failed the LESBIAN_MODELS lookup.
                mdl = gr.Radio(choices=variant_names, value=variant_names[0], label="Model Variant")
                btn = gr.Button("Parse", variant="primary")

        with gr.Row():
            with gr.Column():
                svg_out = gr.HTML("<p>No visualization</p>")
                sentence_dd = gr.Dropdown(label="Choose sentence", choices=[])
                # Per-session storage of the parsed sentences for update_svg.
                sentences_st = gr.State([])

        with gr.Row():
            with gr.Column():
                conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                table_out = gr.Dataframe(label="Token Table")

        btn.click(
            fn=process_text,
            inputs=[txt, mdl],
            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out]
        )
        sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out)

    return app
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    create_app().launch()
|
|
|