File size: 12,268 Bytes
9391975
45e2824
defdd5b
fcb6cc9
8c61d1b
6394ec4
8c61d1b
a600f21
08f1dd3
 
67b8f64
45e2824
a567e39
 
36170de
51ce54e
45e2824
9a96586
67b8f64
fe64a8e
6394ec4
 
45e2824
6394ec4
fe64a8e
 
 
67b8f64
fe64a8e
 
45e2824
fe64a8e
e4bdfd5
 
67b8f64
 
45e2824
 
67b8f64
 
 
 
45e2824
cbb8031
4670ce3
45e2824
67b8f64
45e2824
 
a97b4db
 
 
45e2824
 
 
67b8f64
45e2824
67b8f64
45e2824
fe64a8e
 
 
9a96586
45e2824
3662a0e
e13e21e
67b8f64
08f1dd3
96e10be
fb5b190
e4bdfd5
fb5b190
6394ec4
 
defdd5b
67b8f64
 
f73ba75
67b8f64
e4bdfd5
defdd5b
fb5b190
67b8f64
fb5b190
96e10be
103058d
4670ce3
cbb8031
08f1dd3
67b8f64
 
4670ce3
383a058
08f1dd3
e13e21e
 
 
 
 
 
cbb8031
08f1dd3
a97b4db
e13e21e
 
 
a9ac3b9
67b8f64
08f1dd3
 
 
 
e13e21e
 
 
 
67b8f64
 
 
 
e13e21e
67b8f64
 
e13e21e
 
67b8f64
e13e21e
 
 
 
 
08f1dd3
 
 
67b8f64
 
08f1dd3
e13e21e
08f1dd3
e13e21e
0eced5a
e13e21e
08f1dd3
67b8f64
08f1dd3
67b8f64
e13e21e
67b8f64
 
4670ce3
08f1dd3
 
e13e21e
67b8f64
 
e13e21e
 
 
 
67b8f64
 
 
 
 
 
 
 
 
 
 
 
 
 
08f1dd3
e13e21e
67b8f64
 
08f1dd3
 
 
67b8f64
 
08f1dd3
 
 
67b8f64
 
 
 
 
08f1dd3
67b8f64
e13e21e
08f1dd3
 
67b8f64
 
 
 
 
 
 
 
 
 
08f1dd3
67b8f64
e13e21e
67b8f64
 
e13e21e
67b8f64
 
 
 
08f1dd3
67b8f64
 
 
 
 
 
 
 
 
 
 
9b63c71
67b8f64
 
08f1dd3
 
 
e13e21e
67b8f64
45e2824
 
 
3662a0e
a9ac3b9
d7c677a
e13e21e
3662a0e
67b8f64
a97b4db
3662a0e
cbb8031
d7c677a
e13e21e
3662a0e
67b8f64
fb5b190
67b8f64
f73ba75
d7c677a
fb5b190
08f1dd3
67b8f64
 
 
 
 
 
 
 
08f1dd3
d7c677a
 
a97b4db
67b8f64
a97b4db
 
e13e21e
d7c677a
a97b4db
d7c677a
08f1dd3
d7c677a
 
cbb8031
d7c677a
e13e21e
67b8f64
08f1dd3
d7c677a
51ce54e
 
67b8f64
d7c677a
 
 
 
 
67b8f64
 
 
d7c677a
 
 
67b8f64
 
 
d7c677a
 
 
0eced5a
a97b4db
 
67b8f64
 
 
 
 
 
d7c677a
 
 
a97b4db
d7c677a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import gradio as gr
from gradio import update
import stanza
import pandas as pd
import requests
import traceback
from pathlib import Path

# 1. MODEL VARIANTS & INITIALIZATION

# Filled by initialize_models(): display-name -> loaded stanza.Pipeline.
LESBIAN_MODELS = {}
# Display-name (shown in the UI radio) -> Hugging Face repo id holding the
# four .pt processor files for that dialect model.
MODEL_VARIANTS = {
    "Lesbian-only (UD_Greek-Lesbian)":           "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-augmented (UD_Greek-Lesbian+NGUD)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
    "Standard Modern Greek (UD_Greek-GUD)":      "sbompolas/GUD",
    "Cretan-only (UD_Greek-Cretan)":      "sbompolas/Cretan"
}

def download_model_file(url, filename):
    """Stream *url* to *filename* in 8 KiB chunks.

    Returns True on success, False on any failure. Fixes vs. the previous
    version: a connect/read timeout so a stalled connection cannot hang
    startup forever, and removal of any partially written file on failure
    (otherwise initialize_models' ``tgt.exists()`` check would treat the
    truncated model file as complete on the next run).
    """
    try:
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"Download failed for {url}: {e}")
        # Drop any partial file so a retry re-downloads it.
        Path(filename).unlink(missing_ok=True)
        return False

def initialize_models():
    """Download any missing model files and build one stanza Pipeline per variant.

    Populates LESBIAN_MODELS in place and returns ``(ok, status_message)``.
    Files already present under ./models/<variant>/ are not re-downloaded.
    """
    try:
        model_root = Path("./models")
        model_root.mkdir(exist_ok=True)
        # Processor files fetched per variant, in this order.
        component_files = ("tokenizer.pt", "lemmatizer.pt", "pos.pt", "depparse.pt")
        for variant_name, repo_id in MODEL_VARIANTS.items():
            variant_dir = model_root / variant_name
            variant_dir.mkdir(exist_ok=True)
            for component in component_files:
                target = variant_dir / component
                if target.exists():
                    continue
                url = f"https://huggingface.co/{repo_id}/resolve/main/{component}"
                if not download_model_file(url, str(target)):
                    return False, f"Failed to download {component} for {variant_name}"
            LESBIAN_MODELS[variant_name] = stanza.Pipeline(
                processors='tokenize,pos,lemma,depparse',
                lang='el',
                use_gpu=False,
                verbose=False,
                tokenize_model_path=str(variant_dir / "tokenizer.pt"),
                pos_model_path=str(variant_dir / "pos.pt"),
                lemma_model_path=str(variant_dir / "lemmatizer.pt"),
                depparse_model_path=str(variant_dir / "depparse.pt"),
            )
        return True, "Models loaded"
    except Exception as e:
        traceback.print_exc()
        return False, str(e)

# Eagerly load all pipelines at import time; create_app() surfaces
# `load_status` in the UI when `loaded` is False.
loaded, load_status = initialize_models()


# 2. CONLL-U / DATAFRAME

def stanza_doc_to_conllu(doc) -> str:
    """Serialize a stanza Document into CoNLL-U text.

    Emits ``# sent_id`` / ``# text`` comments per sentence, the ten
    standard columns per word (DEPS and MISC always "_"), and a blank
    line after each sentence.
    """
    out = []
    for sent_num, sentence in enumerate(doc.sentences, start=1):
        out.append(f"# sent_id = {sent_num}")
        out.append(f"# text = {sentence.text}")
        for word in sentence.words:
            head = "0" if word.head is None else str(word.head)
            out.append("\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",
                "_",
            ]))
        out.append("")  # sentence separator
    return "\n".join(out)

def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert CoNLL-U text into a 10-column token DataFrame.

    ``# sent_id`` / ``# text`` comments become header rows carried in the
    ID column, blank lines become empty separator rows, and token lines
    fill all columns.  Fix: comment lines without "=" (e.g. ``# newpar``,
    legal CoNLL-U) previously crashed the two-value unpack; they are now
    skipped.  Also parses the comment key after ``"#"`` rather than
    assuming an exact ``"# "`` prefix.
    """
    cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
    rows = []
    for line in conllu.splitlines():
        if not line:
            # empty separator row
            rows.append({c: "" for c in cols})
            continue
        if line.startswith("#"):
            if "=" not in line:
                # e.g. "# newpar" — nothing to display, skip instead of raising
                continue
            key, val = line.lstrip("#").split("=", 1)
            key, val = key.strip(), val.strip()
            if key == "sent_id":
                rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
            elif key == "text":
                rows.append({'ID': f"# text = {val}", 'FORM': ""})
            continue
        parts = line.split("\t")
        if len(parts) >= 10:
            rows.append(dict(zip(cols, parts[:10])))
    # Header rows only set ID/FORM; fillna blanks the remaining columns.
    return pd.DataFrame(rows, columns=cols).fillna("")


# 3. FULL SVG BUILDER (crop top + bottom padding, arrows at start)

def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an SVG dependency graph.

    sentence_data: a list of per-token dicts (or a DataFrame) with keys
    ID/FORM/LEMMA/UPOS/XPOS/FEATS/HEAD/DEPREL as built by process_text.
    sentence_num / total_sentences are accepted but unused below — kept
    for interface compatibility.

    Returns the ``<svg>…</svg>`` markup string, or an error ``<p>``
    snippet if anything raises (e.g. an empty token list divides by zero
    in the spacing formula).
    """
    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        n = len(df)
        # Horizontal layout: at least base_w px per token, min_sp px between tokens.
        base_w, min_sp = 100, 30
        spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
        width = max(800, n*spacing + 100)
        # The viewBox starts at crop_top, hiding the top margin of the
        # original 500-px canvas; bottom_pad leaves room for annotations.
        orig_height = 500
        crop_top   = 30  # px to remove from top
        bottom_pad = 30  # px to add at bottom
        height     = orig_height - crop_top + bottom_pad

        word_y   = height - 120  # baseline of the token text
        feats_y  = word_y + 35   # first annotation row under each token

        # Stroke/label colour per relation; unknown relations fall back to black.
        colors = {
            'root':'#000000','nsubj':'#2980b9','obj':'#27ae60','det':'#e67e22',
            'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
            'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
            'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
            'advmod':'#9c27b0','obl':'#795548','iobj':'#607d8b','fixed':'#ff5722',
            'aux':'#ff9800','acl':'#4caf50','appos':'#673ab7','compound':'#009688'
        }

        svg = [
            f'<svg width="{width}" height="{height}" viewBox="0 {crop_top} {width} {orig_height}" '
            'xmlns="http://www.w3.org/2000/svg" style="background:white;border:1px solid #eee;"><defs>'
        ]
        # One arrowhead <marker> per relation colour, referenced via
        # marker-start on each arc path below (arrow sits at the arc start).
        for rel, c in colors.items():
            svg.append(
                f'<marker id="m_{rel}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
            )
        svg.append('</defs><g>')

        # compute x positions (only numeric IDs; comment/separator rows skipped)
        xpos = {
            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
            for _, r in df.iterrows() if str(r['ID']).isdigit()
        }

        used_spans = []
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit(): 
                continue
            i, h = int(r['ID']), int(r['HEAD'])
            rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
            x1 = xpos[i]
            if h == 0:
                # ROOT line: vertical stroke up to y=50 with a boxed "ROOT" label midway
                svg.append(
                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                    f'stroke="{c}" stroke-width="1.5"/>'
                )
                mid = (word_y-15 + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
            else:
                x2 = xpos.get(h, x1)
                span = (min(i, h), max(i, h))
                # Stack overlapping arcs: raise this arc's level until it no
                # longer shares a level with any already-placed arc whose
                # token span overlaps this one.
                lvl = 0
                conflict = True
                while conflict:
                    conflict = False
                    for (es, el), used_lvl in used_spans:
                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                            lvl += 1
                            conflict = True
                            break
                used_spans.append((span, lvl))
                dist = abs(x2 - x1)
                # Arc height grows with horizontal distance (capped at 100)
                # plus 35 px per stacking level.
                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
                midx, cty = (x1 + x2) / 2, word_y - arc_h
                path_d = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                svg.append(
                    f'<path d="{path_d}" stroke="{c}" fill="none" stroke-width="1.5" '
                    f'marker-start="url(#m_{rel})"/>'
                )
                # Quadratic Bézier point at t=0.5 (0.25·P0 + 0.5·C + 0.25·P2):
                # the arc apex, where the relation label box is centred.
                amx = 0.25*x1 + 0.5*midx + 0.25*x2
                amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                lw = len(rel)*6 + 8  # rough label width: 6 px per char + padding
                svg.append(
                    f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                    f'fill="white" stroke="{c}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>'
                )

        # draw tokens + annotations (upos, lemma when it differs, and FEATS pairs)
        for _, r in df.iterrows():
            if not str(r['ID']).isdigit():
                continue
            x = xpos[int(r['ID'])]
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>'
            )
            ann = []
            if r['UPOS'] and r['UPOS'] != '_': ann.append(f"upos={r['UPOS']}")
            if r['LEMMA'] not in ('_', r['FORM']): ann.append(f"lemma={r['LEMMA']}")
            if r['FEATS'] and r['FEATS'] not in ('', '_'):
                for f in r['FEATS'].split('|'):
                    if '=' in f:
                        ann.append(f)
            # One small grey line per annotation, 12 px apart below the token.
            for i, a in enumerate(ann):
                svg.append(
                    f'<text x="{x}" y="{feats_y + i*12}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{a}</text>'
                )

        svg.append('</g></svg>')
        return "".join(svg)
    except Exception as e:
        return f"<p>Error creating SVG: {e}</p>"


# 4. PROCESS & DROPDOWN-UPDATES

def process_text(text, variant):
    """Run the selected pipeline on *text*.

    Returns (svg_html, dropdown_update, sentences_state, conllu_text,
    token_dataframe).  Fix: the early-return paths used the classmethods
    ``gr.HTML.update`` / ``gr.Dropdown.update``, which were removed in
    Gradio 4.x (AttributeError at runtime), while the success path already
    used the modern ``update(...)`` helper imported at the top of the
    file.  All paths now return a plain value for the HTML component and
    ``update(...)`` for the dropdown, matching the success path.
    """
    if not text.strip():
        return (
            "<p>No data</p>",
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )
    pipe = LESBIAN_MODELS.get(variant)
    if pipe is None:
        return (
            "<p>Error: model not loaded</p>",
            update(choices=[], value=None),
            [], "", pd.DataFrame()
        )
    doc = pipe(text)
    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)

    # Flatten each sentence into the per-token dict payload the SVG builder expects.
    sentences = []
    for sent in doc.sentences:
        payload = [{
            'ID':     w.id,
            'FORM':   w.text,
            'LEMMA':  w.lemma or "_",
            'UPOS':   w.upos or "_",
            'XPOS':   w.xpos or "_",
            'FEATS':  w.feats or "_",
            'HEAD':   w.head or 0,
            'DEPREL': w.deprel or "_"
        } for w in sent.words]
        sentences.append(payload)

    # Dropdown choices are 1-based sentence numbers as strings.
    sent_ids = [str(i+1) for i in range(len(sentences))]
    dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"

    return init_svg, dd_upd, sentences, conllu, df

def update_svg(selected_id, sentences):
    """Re-render the SVG for the sentence picked in the dropdown.

    selected_id is the 1-based sentence number as a string (or None when
    the dropdown is cleared).  Fix: the bare ``except:`` also swallowed
    KeyboardInterrupt/SystemExit; only the expected failures are caught.
    """
    try:
        idx = int(selected_id) - 1
        return create_single_sentence_svg(sentences[idx])
    except (TypeError, ValueError, IndexError):
        # None / non-numeric selection, or index past the parsed sentences
        return "<p>Invalid selection</p>"


# 5. BUILD GRADIO UI

def create_app():
    """Build the Gradio Blocks UI and wire the parse/select events.

    Fix: the model radio's default was ``value="Lesbian-only"``, which is
    not one of its choices (the MODEL_VARIANTS keys are the long display
    names such as "Lesbian-only (UD_Greek-Lesbian)"), so the default
    selection was invalid.  The default is now the first actual choice.
    """
    with gr.Blocks(title="Parser for MG Dialects") as app:
        gr.Markdown("# Morphosyntactic Parser for MG Dialects")

        if not loaded:
            gr.Markdown(f"❌ Load error: {load_status}")

        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
                variant_choices = list(MODEL_VARIANTS.keys())
                mdl = gr.Radio(choices=variant_choices,
                               value=variant_choices[0],
                               label="Model Variant")
                btn = gr.Button("Parse", variant="primary")

        with gr.Row():
            with gr.Column():
                svg_out      = gr.HTML("<p>No visualization</p>")
                sentence_dd  = gr.Dropdown(label="Choose sentence", choices=[])
                sentences_st = gr.State([])  # per-sentence token payloads from process_text

        with gr.Row():
            with gr.Column():
                conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                table_out  = gr.Dataframe(label="Token Table")

        btn.click(
            fn=process_text,
            inputs=[txt, mdl],
            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out]
        )
        sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out)

    return app

# Entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_app().launch()