Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,8 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
|
|
|
| 9 |
LESBIAN_MODELS = {}
|
| 10 |
MODEL_VARIANTS = {
|
| 11 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
@@ -59,6 +61,9 @@ def initialize_models():
|
|
| 59 |
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
|
|
|
|
|
|
|
|
|
| 62 |
def stanza_doc_to_conllu(doc) -> str:
|
| 63 |
lines = []
|
| 64 |
for sid, sent in enumerate(doc.sentences, 1):
|
|
@@ -78,22 +83,37 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 78 |
|
| 79 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
|
| 80 |
rows = []
|
| 81 |
-
for
|
| 82 |
-
|
|
|
|
| 83 |
if rows and rows[-1] != {}:
|
| 84 |
rows.append({})
|
| 85 |
continue
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
continue
|
| 91 |
-
|
|
|
|
| 92 |
if len(parts) >= 10:
|
| 93 |
rows.append({
|
| 94 |
-
'ID':
|
| 95 |
-
'
|
| 96 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
})
|
| 98 |
return pd.DataFrame(rows)
|
| 99 |
|
|
@@ -101,14 +121,19 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
|
|
| 101 |
if df.empty:
|
| 102 |
return "No data to visualize"
|
| 103 |
viz = []
|
| 104 |
-
for
|
| 105 |
-
|
|
|
|
|
|
|
| 106 |
continue
|
| 107 |
-
|
|
|
|
|
|
|
| 108 |
if viz:
|
| 109 |
-
viz.append("")
|
| 110 |
-
viz.append(f"{
|
| 111 |
continue
|
|
|
|
| 112 |
w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
|
| 113 |
if h != '0':
|
| 114 |
try:
|
|
@@ -120,8 +145,159 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
|
|
| 120 |
viz.append(f"{w} ({p}) --{d}--> ROOT")
|
| 121 |
return "\n".join(viz)
|
| 122 |
|
| 123 |
-
|
| 124 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
def process_text(text, variant):
|
| 127 |
if not text.strip():
|
|
@@ -146,12 +322,16 @@ def process_text(text, variant):
|
|
| 146 |
|
| 147 |
sentences = []
|
| 148 |
for sent in doc.sentences:
|
| 149 |
-
payload = [
|
| 150 |
-
'ID':
|
| 151 |
-
'
|
| 152 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
'DEPREL': w.deprel or "_"
|
| 154 |
-
} for w in sent.words
|
| 155 |
sentences.append(payload)
|
| 156 |
|
| 157 |
sent_ids = [str(i+1) for i in range(len(sentences))]
|
|
@@ -169,11 +349,14 @@ def process_text(text, variant):
|
|
| 169 |
|
| 170 |
def update_svg(selected_id, sentences):
|
| 171 |
try:
|
| 172 |
-
idx = int(selected_id)-1
|
| 173 |
return create_single_sentence_svg(sentences[idx])
|
| 174 |
except:
|
| 175 |
return "<p>Invalid selection</p>"
|
| 176 |
|
|
|
|
|
|
|
|
|
|
| 177 |
def create_app():
|
| 178 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 179 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
|
@@ -183,11 +366,16 @@ def create_app():
|
|
| 183 |
|
| 184 |
with gr.Row():
|
| 185 |
with gr.Column():
|
| 186 |
-
txt = gr.Textbox(
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
btn = gr.Button("Parse", variant="primary")
|
| 192 |
|
| 193 |
with gr.Row():
|
|
@@ -198,13 +386,17 @@ def create_app():
|
|
| 198 |
|
| 199 |
with gr.Row():
|
| 200 |
with gr.Column():
|
| 201 |
-
conllu_out = gr.Textbox(
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
| 204 |
table_out = gr.Dataframe(label="Token Table")
|
| 205 |
-
text_out = gr.Textbox(
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
| 208 |
|
| 209 |
btn.click(
|
| 210 |
fn=process_text,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# 1. MODEL VARIANTS & INITIALIZATION
|
| 10 |
+
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
| 13 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
|
|
| 61 |
|
| 62 |
loaded, load_status = initialize_models()
|
| 63 |
|
| 64 |
+
|
| 65 |
+
# 2. CONLL-U / DATAFRAME / TEXT‐VIZ
|
| 66 |
+
|
| 67 |
def stanza_doc_to_conllu(doc) -> str:
|
| 68 |
lines = []
|
| 69 |
for sid, sent in enumerate(doc.sentences, 1):
|
|
|
|
| 83 |
|
| 84 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert CoNLL-U text into a DataFrame, one row per relevant line.

    Three kinds of rows are produced:

    * ``# key = value`` comment lines become a header row whose ``ID`` is
      ``"# key ="`` and whose ``FORM`` is the value.
    * Token lines (at least 10 tab-separated fields) become a row holding
      the ten standard CoNLL-U columns; extra fields are ignored.
    * A blank (or whitespace-only) line that follows at least one row
      becomes an empty ``{}`` row, i.e. an all-NaN separator between
      sentences.  Consecutive blank lines collapse into one separator.

    Comment lines without ``=`` and lines with fewer than 10 fields are
    skipped.

    Args:
        conllu: CoNLL-U formatted text.

    Returns:
        A DataFrame built from the collected rows (empty for empty input).
    """
    columns = (
        'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
        'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
    )
    rows = []
    for line in conllu.splitlines():
        # Blank line -> separator between sentences.  Whitespace-only lines
        # count as blank too, otherwise they would be silently dropped and
        # two sentences would run together.
        if not line.strip():
            if rows and rows[-1] != {}:
                rows.append({})
            continue

        # Comment line -> sentence header, e.g. "# sent_id = 2".
        if line.startswith("#"):
            if "=" in line:
                # Strip the marker itself rather than a fixed two characters,
                # so "#sent_id = 2" does not lose the first letter of its key.
                key, val = line.lstrip("#").split("=", 1)
                rows.append({
                    'ID': f"# {key.strip()} =",
                    'FORM': val.strip(),
                })
            continue

        parts = line.split("\t")
        if len(parts) >= 10:
            # zip() stops after the ten named columns.
            rows.append(dict(zip(columns, parts)))
    return pd.DataFrame(rows)
|
| 119 |
|
|
|
|
| 121 |
if df.empty:
|
| 122 |
return "No data to visualize"
|
| 123 |
viz = []
|
| 124 |
+
for _, row in df.iterrows():
|
| 125 |
+
rid = row.get("ID")
|
| 126 |
+
# skip outright blank-rows skeleton
|
| 127 |
+
if pd.isna(rid):
|
| 128 |
continue
|
| 129 |
+
|
| 130 |
+
# sentence header
|
| 131 |
+
if isinstance(rid, str) and rid.startswith("#"):
|
| 132 |
if viz:
|
| 133 |
+
viz.append("") # blank line before new sentence
|
| 134 |
+
viz.append(f"{rid} {row.get('FORM')}")
|
| 135 |
continue
|
| 136 |
+
|
| 137 |
w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
|
| 138 |
if h != '0':
|
| 139 |
try:
|
|
|
|
| 145 |
viz.append(f"{w} ({p}) --{d}--> ROOT")
|
| 146 |
return "\n".join(viz)
|
| 147 |
|
| 148 |
+
|
| 149 |
+
# 3. FULL SVG BUILDER
|
| 150 |
+
|
| 151 |
+
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an inline SVG dependency diagram.

    Each dependent is connected to its head by a curved arc (a vertical
    line labelled ROOT when the head is 0); the word form and its
    annotations (upos, lemma, xpos, morphological features) are printed
    underneath.

    Args:
        sentence_data: A list of token dicts or a DataFrame providing the
            columns ``ID``, ``FORM``, ``LEMMA``, ``UPOS``, ``XPOS``,
            ``FEATS``, ``HEAD`` and ``DEPREL``.
        sentence_num: Accepted for interface compatibility; not used.
        total_sentences: Accepted for interface compatibility; not used.

    Returns:
        The SVG markup as a string, or a short ``<p>...</p>`` message when
        there is nothing to draw or rendering fails.
    """
    # Local import keeps the block self-contained; all text placed inside
    # the SVG comes from user input and must be XML-escaped.
    from html import escape

    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        word_count = len(df)
        if word_count == 0:
            # Avoids a division by zero in the spacing computation below.
            return "<p>No tokens to display</p>"

        tokens = df.to_dict('records')

        base_word_width = 100
        min_spacing = 30
        word_spacing = max(
            base_word_width,
            (word_count * base_word_width + min_spacing * (word_count - 1)) / word_count
        )

        width = max(800, word_count * word_spacing + 100)
        height = 500

        word_y = height - 120
        pos_y = word_y + 20
        features_start_y = pos_y + 15

        deprel_colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688'
        }
        fallback_color = '#000'

        svg = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" '
            'style="background: white; border: 1px solid #eee;">',
            '<defs>'
        ]
        # One arrowhead per known relation, plus a "default" arrowhead so
        # that relations missing from the colour table still get one.
        markers = dict(deprel_colors, default=fallback_color)
        for marker_name, color in markers.items():
            svg.append(
                f'<marker id="arrow_{marker_name}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4 Z" fill="{color}"/>'
                '</marker>'
            )
        svg.append('</defs><g>')

        # calculate x positions
        word_positions = {}
        for row in tokens:
            wid = int(row['ID'])
            word_positions[wid] = 50 + (wid - 1) * word_spacing

        # draw arcs/lines
        arc_top = word_y - 15
        used_spans = []
        for row in tokens:
            wid = int(row['ID'])
            hid = int(row['HEAD'])  # handles both 0 and '0'
            rel = str(row['DEPREL'])

            x1 = word_positions[wid]
            col = deprel_colors.get(rel, fallback_color)

            if hid == 0:
                # root arrow
                svg.append(
                    f'<line x1="{x1}" y1="{arc_top}" x2="{x1}" y2="50" '
                    f'stroke="{col}" stroke-width="1.5"/>'
                )
                mid = (arc_top + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{col}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{col}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
                continue

            x2 = word_positions.get(hid, x1)
            span = (min(wid, hid), max(wid, hid))

            # Raise the arc one level at a time until it no longer overlaps
            # an already-placed arc on the same level.
            lvl = 0
            conflict = True
            while conflict:
                conflict = False
                for (es, el), used_lvl in used_spans:
                    if used_lvl == lvl and not (span[1] < es or span[0] > el):
                        lvl += 1
                        conflict = True
                        break
            used_spans.append((span, lvl))

            dist = abs(x2 - x1)
            base_h = min(40 + dist * 0.15, 100)
            arc_h = base_h + lvl * 35
            midx = (x1 + x2) / 2
            cty = word_y - arc_h
            marker = rel if rel in deprel_colors else 'default'
            svg.append(
                f'<path d="M {x1} {arc_top} Q {midx} {cty} {x2} {arc_top}" '
                f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{marker})"/>'
            )

            # label box, centred on the curve's midpoint (quadratic Bezier at t = 0.5)
            amx = 0.25*x1 + 0.5*midx + 0.25*x2
            amy = 0.25*arc_top + 0.5*cty + 0.25*arc_top
            lw = len(rel)*6 + 8
            svg.append(
                f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                f'fill="white" stroke="{col}" rx="2"/>'
            )
            svg.append(
                f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                f'fill="{col}" font-family="Arial" font-size="8" font-weight="bold">'
                f'{escape(rel)}</text>'
            )

        # draw words + annotations
        for row in tokens:
            x = word_positions[int(row['ID'])]
            word = row['FORM']
            pos = row['UPOS']
            lemma = row['LEMMA']
            feats = row['FEATS']
            xpos = row['XPOS']

            # word text
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                'font-family="Arial" font-size="13" font-weight="bold">'
                f'{escape(str(word))}</text>'
            )

            # annotations underneath
            ann = []
            if pos and pos != '_':
                ann.append(f"upos={pos}")
            if lemma and lemma not in ('_', word):
                ann.append(f"lemma={lemma}")
            if xpos and xpos != '_':
                ann.append(f"xpos={xpos}")
            if feats and feats not in ('', '_'):
                ann.extend(fpair for fpair in feats.split('|') if '=' in fpair)

            for i, a in enumerate(ann):
                y0 = features_start_y + i*12
                svg.append(
                    f'<text x="{x}" y="{y0}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{escape(a)}</text>'
                )

        svg.append('</g></svg>')
        return "".join(svg)

    except Exception as e:
        # Best-effort UI boundary: show the failure instead of crashing the app.
        return f"<p>Error creating SVG: {escape(str(e))}</p>"
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
# 4. PROCESS & DROPDOWN-UPDATES
|
| 301 |
|
| 302 |
def process_text(text, variant):
|
| 303 |
if not text.strip():
|
|
|
|
| 322 |
|
| 323 |
sentences = []
|
| 324 |
for sent in doc.sentences:
|
| 325 |
+
payload = [{
|
| 326 |
+
'ID': w.id,
|
| 327 |
+
'FORM': w.text,
|
| 328 |
+
'LEMMA': w.lemma or "_",
|
| 329 |
+
'UPOS': w.upos or "_",
|
| 330 |
+
'XPOS': w.xpos or "_",
|
| 331 |
+
'FEATS': w.feats or "_",
|
| 332 |
+
'HEAD': w.head or 0,
|
| 333 |
'DEPREL': w.deprel or "_"
|
| 334 |
+
} for w in sent.words]
|
| 335 |
sentences.append(payload)
|
| 336 |
|
| 337 |
sent_ids = [str(i+1) for i in range(len(sentences))]
|
|
|
|
| 349 |
|
| 350 |
def update_svg(selected_id, sentences):
    """Return the SVG for the sentence chosen by its 1-based id.

    Args:
        selected_id: 1-based sentence number; anything ``int()`` accepts.
        sentences: Indexable collection of per-sentence token payloads.

    Returns:
        The markup produced by ``create_single_sentence_svg`` for the chosen
        sentence, or ``"<p>Invalid selection</p>"`` when the id is not a
        number or does not address an existing sentence.
    """
    try:
        idx = int(selected_id) - 1
        if idx < 0:
            # A zero or negative id would otherwise wrap around and silently
            # show a sentence counted from the end of the list.
            raise IndexError(selected_id)
        sentence = sentences[idx]
    except (TypeError, ValueError, IndexError, KeyError):
        return "<p>Invalid selection</p>"
    return create_single_sentence_svg(sentence)
|
| 356 |
|
| 357 |
+
|
| 358 |
+
# 5. BUILD GRADIO UI
|
| 359 |
+
|
| 360 |
def create_app():
|
| 361 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 362 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
|
|
|
| 366 |
|
| 367 |
with gr.Row():
|
| 368 |
with gr.Column():
|
| 369 |
+
txt = gr.Textbox(
|
| 370 |
+
label="Input Text",
|
| 371 |
+
lines=4,
|
| 372 |
+
placeholder="Εισάγετε κείμενο…"
|
| 373 |
+
)
|
| 374 |
+
mdl = gr.Radio(
|
| 375 |
+
choices=list(MODEL_VARIANTS.keys()),
|
| 376 |
+
value="Lesbian-only",
|
| 377 |
+
label="Model Variant"
|
| 378 |
+
)
|
| 379 |
btn = gr.Button("Parse", variant="primary")
|
| 380 |
|
| 381 |
with gr.Row():
|
|
|
|
| 386 |
|
| 387 |
with gr.Row():
|
| 388 |
with gr.Column():
|
| 389 |
+
conllu_out = gr.Textbox(
|
| 390 |
+
label="CoNLL-U",
|
| 391 |
+
lines=10,
|
| 392 |
+
show_copy_button=True
|
| 393 |
+
)
|
| 394 |
table_out = gr.Dataframe(label="Token Table")
|
| 395 |
+
text_out = gr.Textbox(
|
| 396 |
+
label="Text-based Dependencies",
|
| 397 |
+
lines=8,
|
| 398 |
+
show_copy_button=True
|
| 399 |
+
)
|
| 400 |
|
| 401 |
btn.click(
|
| 402 |
fn=process_text,
|