Update app.py
Browse files
app.py
CHANGED
|
@@ -31,7 +31,7 @@ def initialize_models():
|
|
| 31 |
base = Path("./models")
|
| 32 |
base.mkdir(exist_ok=True)
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
-
out = base/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
|
@@ -40,7 +40,7 @@ def initialize_models():
|
|
| 40 |
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 41 |
}
|
| 42 |
for fn, url in files.items():
|
| 43 |
-
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
| 46 |
cfg = {
|
|
@@ -78,29 +78,26 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 78 |
w.deprel or "_", "_", "_"
|
| 79 |
]
|
| 80 |
lines.append("\t".join(fields))
|
| 81 |
-
lines.append("")
|
| 82 |
return "\n".join(lines)
|
| 83 |
|
| 84 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
|
|
|
|
| 85 |
rows = []
|
| 86 |
for line in conllu.splitlines():
|
| 87 |
-
# empty line → separator between sentences
|
| 88 |
if not line:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
continue
|
| 92 |
-
|
| 93 |
-
# comment line → sentence header
|
| 94 |
if line.startswith("#"):
|
| 95 |
-
# ex: "# sent_id = 2"
|
| 96 |
if "=" in line:
|
| 97 |
key, val = line[2:].split("=", 1)
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
'FORM':
|
| 101 |
-
|
|
|
|
| 102 |
continue
|
| 103 |
-
|
| 104 |
parts = line.split("\t")
|
| 105 |
if len(parts) >= 10:
|
| 106 |
rows.append({
|
|
@@ -115,25 +112,22 @@ def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
|
|
| 115 |
'DEPS': parts[8],
|
| 116 |
'MISC': parts[9]
|
| 117 |
})
|
| 118 |
-
|
|
|
|
| 119 |
|
| 120 |
def create_dependency_visualization(df: pd.DataFrame) -> str:
|
| 121 |
if df.empty:
|
| 122 |
return "No data to visualize"
|
| 123 |
viz = []
|
| 124 |
for _, row in df.iterrows():
|
| 125 |
-
rid = row
|
| 126 |
-
#
|
| 127 |
-
if pd.isna(rid):
|
| 128 |
-
continue
|
| 129 |
-
|
| 130 |
-
# sentence header
|
| 131 |
if isinstance(rid, str) and rid.startswith("#"):
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
| 135 |
continue
|
| 136 |
-
|
| 137 |
w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
|
| 138 |
if h != '0':
|
| 139 |
try:
|
|
@@ -161,10 +155,8 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
|
|
| 161 |
|
| 162 |
width = max(800, word_count * word_spacing + 100)
|
| 163 |
height = 500
|
| 164 |
-
|
| 165 |
word_y = height - 120
|
| 166 |
-
|
| 167 |
-
features_start_y = pos_y + 15
|
| 168 |
|
| 169 |
deprel_colors = {
|
| 170 |
'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
|
|
@@ -189,24 +181,22 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
|
|
| 189 |
)
|
| 190 |
svg.append('</defs><g>')
|
| 191 |
|
| 192 |
-
#
|
| 193 |
-
word_positions = {
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
|
| 198 |
# draw arcs/lines
|
| 199 |
used_spans = []
|
| 200 |
for _, row in df.iterrows():
|
| 201 |
-
|
| 202 |
-
|
|
|
|
| 203 |
rel = row['DEPREL']
|
| 204 |
-
|
| 205 |
x1 = word_positions[wid]
|
| 206 |
col = deprel_colors.get(rel, '#000')
|
| 207 |
-
|
| 208 |
if hid == 0:
|
| 209 |
-
# root arrow
|
| 210 |
svg.append(
|
| 211 |
f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
|
| 212 |
f'stroke="{col}" stroke-width="1.5"/>'
|
|
@@ -233,17 +223,13 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
|
|
| 233 |
conflict = True
|
| 234 |
break
|
| 235 |
used_spans.append((span, lvl))
|
| 236 |
-
|
| 237 |
dist = abs(x2 - x1)
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
midx = (x1 + x2) / 2
|
| 241 |
-
cty = word_y - arc_h
|
| 242 |
svg.append(
|
| 243 |
-
f'<path d="M
|
| 244 |
f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
|
| 245 |
)
|
| 246 |
-
# label box
|
| 247 |
amx = 0.25*x1 + 0.5*midx + 0.25*x2
|
| 248 |
amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
|
| 249 |
lw = len(rel)*6 + 8
|
|
@@ -258,31 +244,21 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
|
|
| 258 |
|
| 259 |
# draw words + annotations
|
| 260 |
for _, row in df.iterrows():
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
pos = row['UPOS']
|
| 265 |
-
lemma = row['LEMMA']
|
| 266 |
-
feats = row['FEATS']
|
| 267 |
-
xpos = row['XPOS']
|
| 268 |
-
|
| 269 |
-
# word text
|
| 270 |
svg.append(
|
| 271 |
f'<text x="{x}" y="{word_y}" text-anchor="middle" '
|
| 272 |
'font-family="Arial" font-size="13" font-weight="bold">'
|
| 273 |
-
f'{
|
| 274 |
)
|
| 275 |
-
|
| 276 |
-
# annotations underneath
|
| 277 |
ann = []
|
| 278 |
-
if
|
| 279 |
-
if
|
| 280 |
-
if
|
| 281 |
-
|
| 282 |
-
for fpair in feats.split('|'):
|
| 283 |
if '=' in fpair:
|
| 284 |
ann.append(fpair)
|
| 285 |
-
|
| 286 |
for i, a in enumerate(ann):
|
| 287 |
y0 = features_start_y + i*12
|
| 288 |
svg.append(
|
|
@@ -315,7 +291,7 @@ def process_text(text, variant):
|
|
| 315 |
[], "", pd.DataFrame(), ""
|
| 316 |
)
|
| 317 |
|
| 318 |
-
doc
|
| 319 |
conllu = stanza_doc_to_conllu(doc)
|
| 320 |
df = conllu_to_dataframe(conllu)
|
| 321 |
text_v = create_dependency_visualization(df)
|
|
@@ -394,7 +370,7 @@ def create_app():
|
|
| 394 |
table_out = gr.Dataframe(label="Token Table")
|
| 395 |
text_out = gr.Textbox(
|
| 396 |
label="Text-based Dependencies",
|
| 397 |
-
lines=
|
| 398 |
show_copy_button=True
|
| 399 |
)
|
| 400 |
|
|
|
|
| 31 |
base = Path("./models")
|
| 32 |
base.mkdir(exist_ok=True)
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
+
out = base / name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
|
|
|
| 40 |
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 41 |
}
|
| 42 |
for fn, url in files.items():
|
| 43 |
+
tgt = out / fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
| 46 |
cfg = {
|
|
|
|
| 78 |
w.deprel or "_", "_", "_"
|
| 79 |
]
|
| 80 |
lines.append("\t".join(fields))
|
| 81 |
+
lines.append("") # blank line after each sentence
|
| 82 |
return "\n".join(lines)
|
| 83 |
|
| 84 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
|
| 85 |
+
cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
|
| 86 |
rows = []
|
| 87 |
for line in conllu.splitlines():
|
|
|
|
| 88 |
if not line:
|
| 89 |
+
# empty row between sentences
|
| 90 |
+
rows.append({c: "" for c in cols})
|
| 91 |
continue
|
|
|
|
|
|
|
| 92 |
if line.startswith("#"):
|
|
|
|
| 93 |
if "=" in line:
|
| 94 |
key, val = line[2:].split("=", 1)
|
| 95 |
+
key, val = key.strip(), val.strip()
|
| 96 |
+
if key == "sent_id":
|
| 97 |
+
rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
|
| 98 |
+
elif key == "text":
|
| 99 |
+
rows.append({'ID': f"# text = {val}", 'FORM': ""})
|
| 100 |
continue
|
|
|
|
| 101 |
parts = line.split("\t")
|
| 102 |
if len(parts) >= 10:
|
| 103 |
rows.append({
|
|
|
|
| 112 |
'DEPS': parts[8],
|
| 113 |
'MISC': parts[9]
|
| 114 |
})
|
| 115 |
+
df = pd.DataFrame(rows, columns=cols)
|
| 116 |
+
return df.fillna("") # replace NaN with empty strings
|
| 117 |
|
| 118 |
def create_dependency_visualization(df: pd.DataFrame) -> str:
|
| 119 |
if df.empty:
|
| 120 |
return "No data to visualize"
|
| 121 |
viz = []
|
| 122 |
for _, row in df.iterrows():
|
| 123 |
+
rid = row['ID']
|
| 124 |
+
# sentence header rows
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
if isinstance(rid, str) and rid.startswith("#"):
|
| 126 |
+
# only before each new sent_id do we add a blank line
|
| 127 |
+
if rid.startswith("# sent_id") and viz:
|
| 128 |
+
viz.append("")
|
| 129 |
+
viz.append(rid)
|
| 130 |
continue
|
|
|
|
| 131 |
w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
|
| 132 |
if h != '0':
|
| 133 |
try:
|
|
|
|
| 155 |
|
| 156 |
width = max(800, word_count * word_spacing + 100)
|
| 157 |
height = 500
|
|
|
|
| 158 |
word_y = height - 120
|
| 159 |
+
features_start_y = word_y + 35 # space for UPOS + lemma
|
|
|
|
| 160 |
|
| 161 |
deprel_colors = {
|
| 162 |
'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
|
|
|
|
| 181 |
)
|
| 182 |
svg.append('</defs><g>')
|
| 183 |
|
| 184 |
+
# x positions
|
| 185 |
+
word_positions = {
|
| 186 |
+
int(r['ID']): 50 + (int(r['ID']) - 1) * word_spacing
|
| 187 |
+
for _, r in df.iterrows() if str(r['ID']).isdigit()
|
| 188 |
+
}
|
| 189 |
|
| 190 |
# draw arcs/lines
|
| 191 |
used_spans = []
|
| 192 |
for _, row in df.iterrows():
|
| 193 |
+
if not str(row['ID']).isdigit():
|
| 194 |
+
continue
|
| 195 |
+
wid, hid = int(row['ID']), int(row['HEAD'])
|
| 196 |
rel = row['DEPREL']
|
|
|
|
| 197 |
x1 = word_positions[wid]
|
| 198 |
col = deprel_colors.get(rel, '#000')
|
|
|
|
| 199 |
if hid == 0:
|
|
|
|
| 200 |
svg.append(
|
| 201 |
f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
|
| 202 |
f'stroke="{col}" stroke-width="1.5"/>'
|
|
|
|
| 223 |
conflict = True
|
| 224 |
break
|
| 225 |
used_spans.append((span, lvl))
|
|
|
|
| 226 |
dist = abs(x2 - x1)
|
| 227 |
+
arc_h = min(40 + dist * 0.15, 100) + lvl * 35
|
| 228 |
+
midx, cty = (x1 + x2) / 2, word_y - arc_h
|
|
|
|
|
|
|
| 229 |
svg.append(
|
| 230 |
+
f'<path d="M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}" '
|
| 231 |
f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
|
| 232 |
)
|
|
|
|
| 233 |
amx = 0.25*x1 + 0.5*midx + 0.25*x2
|
| 234 |
amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
|
| 235 |
lw = len(rel)*6 + 8
|
|
|
|
| 244 |
|
| 245 |
# draw words + annotations
|
| 246 |
for _, row in df.iterrows():
|
| 247 |
+
if not str(row['ID']).isdigit():
|
| 248 |
+
continue
|
| 249 |
+
x = word_positions[int(row['ID'])]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
svg.append(
|
| 251 |
f'<text x="{x}" y="{word_y}" text-anchor="middle" '
|
| 252 |
'font-family="Arial" font-size="13" font-weight="bold">'
|
| 253 |
+
f'{row["FORM"]}</text>'
|
| 254 |
)
|
|
|
|
|
|
|
| 255 |
ann = []
|
| 256 |
+
if row['UPOS'] and row['UPOS'] != '_': ann.append(f"upos={row['UPOS']}")
|
| 257 |
+
if row['LEMMA'] and row['LEMMA'] not in ('_', row['FORM']): ann.append(f"lemma={row['LEMMA']}")
|
| 258 |
+
if row['FEATS'] and row['FEATS'] not in ('', '_'):
|
| 259 |
+
for fpair in row['FEATS'].split('|'):
|
|
|
|
| 260 |
if '=' in fpair:
|
| 261 |
ann.append(fpair)
|
|
|
|
| 262 |
for i, a in enumerate(ann):
|
| 263 |
y0 = features_start_y + i*12
|
| 264 |
svg.append(
|
|
|
|
| 291 |
[], "", pd.DataFrame(), ""
|
| 292 |
)
|
| 293 |
|
| 294 |
+
doc = pipe(text)
|
| 295 |
conllu = stanza_doc_to_conllu(doc)
|
| 296 |
df = conllu_to_dataframe(conllu)
|
| 297 |
text_v = create_dependency_visualization(df)
|
|
|
|
| 370 |
table_out = gr.Dataframe(label="Token Table")
|
| 371 |
text_out = gr.Textbox(
|
| 372 |
label="Text-based Dependencies",
|
| 373 |
+
lines=10,
|
| 374 |
show_copy_button=True
|
| 375 |
)
|
| 376 |
|