|
|
import gradio as gr |
|
|
from gradio import update |
|
|
import stanza |
|
|
import pandas as pd |
|
|
import requests |
|
|
import traceback |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
|
|
|
# Cache of loaded stanza pipelines, keyed by the human-readable variant
# names below; populated once by initialize_models() at import time.
LESBIAN_MODELS = {}

# Display name (shown in the UI radio) -> Hugging Face repo id holding the
# four .pt processor files for that model variant.
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)": "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)": "sbompolas/Cretan"
}
|
|
|
|
|
def download_model_file(url, filename):
    """Download one model file from *url* to the local path *filename*.

    The response is streamed in 8 KiB chunks so large ``.pt`` files are
    never held in memory at once.

    Args:
        url: Direct download URL (a Hugging Face ``resolve/main`` link).
        filename: Local filesystem path to write to.

    Returns:
        True on success, False on any network or filesystem failure.
        Errors are printed rather than raised so the caller can decide
        whether to abort the whole initialization.
    """
    try:
        # A timeout prevents the whole app from hanging forever on a
        # stalled connection (the original call had none).
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return True
    except Exception as e:
        # Name the URL so a failed file is identifiable in the logs
        # (the old message just said "unknown").
        print(f"Download failed for {url}: {e}")
        return False
|
|
|
|
|
def initialize_models():
    """Download (if needed) and load one stanza pipeline per model variant.

    For each entry in MODEL_VARIANTS, fetches the four processor files
    from Hugging Face into ``./models/<variant>/`` (skipping files that
    already exist) and builds a CPU stanza Pipeline, stored in the
    module-level LESBIAN_MODELS dict.

    Returns:
        (True, "Models loaded") on success, or (False, <reason>) on the
        first failed download or any unexpected exception.
    """
    processor_files = ("tokenizer.pt", "lemmatizer.pt", "pos.pt", "depparse.pt")
    try:
        model_root = Path("./models")
        model_root.mkdir(exist_ok=True)
        for variant_name, repo_id in MODEL_VARIANTS.items():
            variant_dir = model_root / variant_name
            variant_dir.mkdir(exist_ok=True)
            # Fetch any processor file that is not already cached locally.
            for file_name in processor_files:
                target = variant_dir / file_name
                if target.exists():
                    continue
                url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
                if not download_model_file(url, str(target)):
                    return False, f"Failed to download {file_name} for {variant_name}"
            LESBIAN_MODELS[variant_name] = stanza.Pipeline(
                processors='tokenize,pos,lemma,depparse',
                lang='el',
                use_gpu=False,
                verbose=False,
                tokenize_model_path=str(variant_dir / "tokenizer.pt"),
                pos_model_path=str(variant_dir / "pos.pt"),
                lemma_model_path=str(variant_dir / "lemmatizer.pt"),
                depparse_model_path=str(variant_dir / "depparse.pt"),
            )
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)
|
|
|
|
|
loaded, load_status = initialize_models() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into CoNLL-U formatted text.

    Every sentence is preceded by ``# sent_id`` and ``# text`` comment
    lines and followed by a blank separator line. Each word becomes one
    10-column tab-separated row; missing annotations are rendered as
    "_" and a missing head as "0".
    """
    out = []
    sent_id = 0
    for sent in doc.sentences:
        sent_id += 1
        out.append(f"# sent_id = {sent_id}")
        out.append(f"# text = {sent.text}")
        for word in sent.words:
            head = "0" if word.head is None else str(word.head)
            row = "\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",   # DEPS: enhanced dependencies not produced here
                "_",   # MISC
            ])
            out.append(row)
        out.append("")  # blank line terminates the sentence block
    return "\n".join(out)
|
|
|
|
|
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Parse CoNLL-U text into a flat DataFrame for UI display.

    ``# sent_id`` / ``# text`` comment lines are kept as header rows in
    the ID column, blank sentence separators become all-empty rows, and
    token lines fill all ten CoNLL-U columns.

    Args:
        conllu: CoNLL-U formatted text (as produced by
            stanza_doc_to_conllu, but any valid CoNLL-U is accepted).

    Returns:
        DataFrame with columns ID..MISC; missing cells are "".
    """
    cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:
            # Keep sentence boundaries visible as empty rows.
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            # BUG FIX: split("=", 1) raised ValueError on comment lines
            # without "=" (e.g. "# newpar", which is legal CoNLL-U).
            # partition() never fails; unknown comments are skipped.
            key, sep, val = line[2:].partition("=")
            if sep:
                key, val = key.strip(), val.strip()
                if key == "sent_id":
                    rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
                elif key == "text":
                    rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts)))
    # fillna("") blanks the columns the comment rows never set.
    return pd.DataFrame(rows, columns=cols).fillna("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as a dependency-tree SVG string.

    Word forms sit on a baseline near the bottom of the canvas; each
    dependency relation is drawn as a quadratic arc above the words with
    a small colored label box at its apex, and the root word gets a
    vertical line up to a "ROOT" label. Lemma/UPOS/FEATS annotations are
    printed in small grey text under each word.

    Args:
        sentence_data: list of per-word dicts with keys ID, FORM, LEMMA,
            UPOS, XPOS, FEATS, HEAD, DEPREL (or an equivalent DataFrame).
            NOTE(review): IDs are assumed to be consecutive 1..n ints —
            rows with non-digit IDs (e.g. MWT ranges) are skipped.
        sentence_num: currently unused; kept for caller compatibility.
        total_sentences: currently unused; kept for caller compatibility.

    Returns:
        An ``<svg>…</svg>`` markup string, or an error ``<p>`` paragraph
        if anything goes wrong (malformed rows, missing keys, etc.).
    """
    try:
        # Accept either a list of dicts or a ready-made DataFrame.
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        # Horizontal layout: at least base_w px per word, plus min_sp gap.
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
        width = max(800, n*spacing + 100)
        orig_height = 500
        # The viewBox crops crop_top px off the top of the nominal canvas
        # and bottom_pad leaves room for the feature annotation rows.
        crop_top = 30
        bottom_pad = 30
        height = orig_height - crop_top + bottom_pad

        word_y = height - 120   # baseline y for the word forms
        feats_y = word_y + 35   # first annotation row under the words

        # One fixed color per UD relation; unknown relations fall back to
        # black ('#000') via colors.get() below.
        colors = {
            'root':'#000000','nsubj':'#2980b9','obj':'#27ae60','det':'#e67e22',
            'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
            'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
            'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
            'advmod':'#9c27b0','obl':'#795548','iobj':'#607d8b','fixed':'#ff5722',
            'aux':'#ff9800','acl':'#4caf50','appos':'#673ab7','compound':'#009688'
        }

        # SVG header plus one <marker> (arrowhead) definition per relation
        # color, referenced by the arc paths via marker-start.
        svg = [
            f'<svg width="{width}" height="{height}" viewBox="0 {crop_top} {width} {orig_height}" '
            'xmlns="http://www.w3.org/2000/svg" style="background:white;border:1px solid #eee;"><defs>'
        ]
        for rel, c in colors.items():
            svg.append(
                f'<marker id="m_{rel}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
            )
        svg.append('</defs><g>')

        # x coordinate of each word's center, keyed by its integer ID.
        xpos = {
            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
            for _, r in df.iterrows() if str(r['ID']).isdigit()
        }

        # (span, level) pairs of already-drawn arcs, used to stack
        # overlapping arcs at increasing heights.
        used_spans = []
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            i, h = int(r['ID']), int(r['HEAD'])
            rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
            x1 = xpos[i]
            if h == 0:
                # Root word: vertical line up to a boxed "ROOT" label.
                svg.append(
                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                    f'stroke="{c}" stroke-width="1.5"/>'
                )
                mid = (word_y-15 + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
            else:
                # Dependent word: quadratic arc from word to its head.
                # Missing heads fall back to the word's own x (degenerate arc).
                x2 = xpos.get(h, x1)
                span = (min(i, h), max(i, h))
                # Find the lowest stacking level whose arcs this span does
                # not overlap; each conflict pushes the arc one level up.
                lvl = 0
                conflict = True
                while conflict:
                    conflict = False
                    for (es, el), used_lvl in used_spans:
                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                            lvl += 1
                            conflict = True
                            break
                used_spans.append((span, lvl))
                dist = abs(x2 - x1)
                # Arc height grows with distance (capped) plus 35px per level.
                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
                midx, cty = (x1 + x2) / 2, word_y - arc_h
                path_d = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                svg.append(
                    f'<path d="{path_d}" stroke="{c}" fill="none" stroke-width="1.5" '
                    f'marker-start="url(#m_{rel})"/>'
                )
                # Label box at the Bezier midpoint (t=0.5 of the quadratic).
                amx = 0.25*x1 + 0.5*midx + 0.25*x2
                amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                lw = len(rel)*6 + 8   # rough text width: ~6px per character
                svg.append(
                    f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>'
                )

        # Second pass: word forms on the baseline plus small annotation
        # rows (upos, lemma when it differs from the form, and each FEATS
        # key=value pair) stacked below each word.
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            x = xpos[int(r['ID'])]
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>'
            )
            ann = []
            if r['UPOS'] and r['UPOS'] != '_': ann.append(f"upos={r['UPOS']}")
            if r['LEMMA'] not in ('_', r['FORM']): ann.append(f"lemma={r['LEMMA']}")
            if r['FEATS'] and r['FEATS'] not in ('', '_'):
                for f in r['FEATS'].split('|'):
                    if '=' in f:
                        ann.append(f)
            for i, a in enumerate(ann):
                svg.append(
                    f'<text x="{x}" y="{feats_y + i*12}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{a}</text>'
                )

        svg.append('</g></svg>')
        return "".join(svg)
    except Exception as e:
        # Render failures as visible HTML rather than crashing the UI.
        return f"<p>Error creating SVG: {e}</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_text(text, variant):
    """Parse *text* with the chosen model variant and build all UI outputs.

    Args:
        text: Raw input text (Greek).
        variant: Key into MODEL_VARIANTS / LESBIAN_MODELS.

    Returns:
        A 5-tuple matching the Gradio outputs list:
        (SVG html, sentence-dropdown update, per-sentence word dicts for
        gr.State, CoNLL-U string, token DataFrame).
    """
    def _error(message):
        # Shared output shape for the two early-exit paths below.
        # BUG FIX: the originals called gr.HTML.update / gr.Dropdown.update,
        # a class-level API removed in Gradio 4.x (AttributeError at
        # runtime). The module-level `update` (already imported and used
        # for the success path) works on both 3.x and 4.x.
        return (
            update(value=message),
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )

    if not text.strip():
        return _error("<p>No data</p>")
    pipe = LESBIAN_MODELS.get(variant)
    if pipe is None:
        return _error("<p>Error: model not loaded</p>")

    doc = pipe(text)
    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)

    # Flatten each sentence into the dict rows the SVG renderer expects.
    sentences = []
    for sent in doc.sentences:
        payload = [{
            'ID': w.id,
            'FORM': w.text,
            'LEMMA': w.lemma or "_",
            'UPOS': w.upos or "_",
            'XPOS': w.xpos or "_",
            'FEATS': w.feats or "_",
            'HEAD': w.head or 0,
            'DEPREL': w.deprel or "_"
        } for w in sent.words]
        sentences.append(payload)

    # Dropdown lists 1-based sentence numbers; preselect the first one.
    sent_ids = [str(i+1) for i in range(len(sentences))]
    dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"

    return init_svg, dd_upd, sentences, conllu, df
|
|
|
|
|
def update_svg(selected_id, sentences): |
|
|
try: |
|
|
idx = int(selected_id) - 1 |
|
|
return create_single_sentence_svg(sentences[idx]) |
|
|
except: |
|
|
return "<p>Invalid selection</p>" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_app():
    """Build the Gradio Blocks UI for the dialect parser.

    Layout: input textbox + model radio + parse button, then the SVG
    visualization with a sentence selector, then the raw CoNLL-U text
    and a token table.

    Returns:
        The constructed (unlaunched) gr.Blocks application.
    """
    variant_names = list(MODEL_VARIANTS.keys())
    with gr.Blocks(title="Parser for MG Dialects") as app:
        gr.Markdown("# Morphosyntactic Parser for MG Dialects")

        # Surface model-loading failures directly in the page.
        if not loaded:
            gr.Markdown(f"❌ Load error: {load_status}")

        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
                # BUG FIX: the default was "Lesbian-only", which is not one
                # of the choices (keys are e.g. "Lesbian-only
                # (UD_Greek-Lesbian)"), so no option was preselected and
                # parsing with the default failed the LESBIAN_MODELS lookup.
                mdl = gr.Radio(choices=variant_names, value=variant_names[0], label="Model Variant")
                btn = gr.Button("Parse", variant="primary")

        with gr.Row():
            with gr.Column():
                svg_out = gr.HTML("<p>No visualization</p>")
                sentence_dd = gr.Dropdown(label="Choose sentence", choices=[])
                # Per-session storage of the parsed sentences for update_svg.
                sentences_st = gr.State([])

        with gr.Row():
            with gr.Column():
                conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                table_out = gr.Dataframe(label="Token Table")

        btn.click(
            fn=process_text,
            inputs=[txt, mdl],
            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out]
        )
        sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out)

    return app
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    create_app().launch()
|
|
|