Update app.py
Browse files
app.py
CHANGED
|
@@ -6,8 +6,7 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
| 13 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
@@ -34,10 +33,10 @@ def initialize_models():
|
|
| 34 |
out = base/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
-
"tokenizer.pt":
|
| 38 |
-
"lemmatizer.pt":
|
| 39 |
-
"pos.pt":
|
| 40 |
-
"depparse.pt":
|
| 41 |
}
|
| 42 |
for fn, url in files.items():
|
| 43 |
tgt = out/fn
|
|
@@ -60,7 +59,7 @@ def initialize_models():
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
-
#
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
|
@@ -80,102 +79,137 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
-
#
|
| 84 |
|
| 85 |
-
def
|
| 86 |
"""
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
"""
|
| 93 |
-
records = []
|
| 94 |
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
|
|
|
| 95 |
first = True
|
|
|
|
| 96 |
for block in blocks:
|
| 97 |
lines = block.splitlines()
|
| 98 |
-
sid_line
|
|
|
|
| 99 |
token_lines = lines[2:]
|
|
|
|
| 100 |
if not first:
|
| 101 |
-
#
|
| 102 |
-
records.append({c:
|
|
|
|
|
|
|
| 103 |
first = False
|
|
|
|
| 104 |
# comment rows
|
| 105 |
-
records.append({"Id": sid_line,
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
# token rows
|
| 108 |
for tl in token_lines:
|
| 109 |
parts = tl.split("\t")
|
| 110 |
if len(parts) < 10:
|
| 111 |
continue
|
| 112 |
records.append({
|
| 113 |
-
"Id":
|
| 114 |
-
"Form":
|
| 115 |
-
"Lemma":
|
| 116 |
-
"UPosTag":
|
| 117 |
-
"XPosTag":
|
| 118 |
-
"Feats":
|
| 119 |
-
"Head":
|
| 120 |
-
"DepRel":
|
| 121 |
-
"Deps":
|
| 122 |
-
"Misc":
|
| 123 |
})
|
| 124 |
-
return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
|
| 130 |
"""
|
| 131 |
-
|
| 132 |
-
- blank line + comment lines before each sentence
|
| 133 |
-
- dependency lines
|
| 134 |
"""
|
| 135 |
if df_table.empty:
|
| 136 |
return "No data to visualize"
|
| 137 |
-
|
| 138 |
first = True
|
| 139 |
-
|
| 140 |
for row in df_table.itertuples(index=False):
|
|
|
|
| 141 |
if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
|
| 142 |
if not first:
|
| 143 |
-
|
| 144 |
first = False
|
| 145 |
-
|
| 146 |
-
lines.append(row.Id)
|
| 147 |
-
# next row in table is "# text = β¦"
|
| 148 |
continue
|
| 149 |
if isinstance(row.Id, str) and row.Id.startswith("# text"):
|
| 150 |
-
|
| 151 |
continue
|
| 152 |
-
|
|
|
|
| 153 |
if not row.Id.isdigit():
|
| 154 |
continue
|
| 155 |
-
|
|
|
|
| 156 |
w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
|
| 157 |
if h != "0":
|
| 158 |
-
# find head form
|
| 159 |
try:
|
| 160 |
hw = df_table[df_table.Id == h].iloc[0].Form
|
| 161 |
except:
|
| 162 |
hw = "[ERR]"
|
| 163 |
-
|
| 164 |
else:
|
| 165 |
-
|
| 166 |
-
return "\n".join(lines)
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
|
| 171 |
-
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 172 |
-
# Paste your entire original SVGβgeneration code here unchanged
|
| 173 |
-
# It must accept sentence_data: list of dicts and return SVG string.
|
| 174 |
-
svg = "<svg><!-- your SVG here --></svg>"
|
| 175 |
-
return svg
|
| 176 |
|
|
|
|
| 177 |
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
def process_text(text, variant):
|
| 181 |
if not text.strip():
|
|
@@ -185,9 +219,9 @@ def process_text(text, variant):
|
|
| 185 |
[], "", pd.DataFrame(), ""
|
| 186 |
)
|
| 187 |
pipe = LESBIAN_MODELS.get(variant)
|
| 188 |
-
if pipe
|
| 189 |
return (
|
| 190 |
-
gr.HTML.update(value="<p>Error
|
| 191 |
gr.Dropdown.update(choices=[], value=None),
|
| 192 |
[], "", pd.DataFrame(), ""
|
| 193 |
)
|
|
@@ -195,20 +229,20 @@ def process_text(text, variant):
|
|
| 195 |
doc = pipe(text)
|
| 196 |
conllu = stanza_doc_to_conllu(doc)
|
| 197 |
|
| 198 |
-
#
|
| 199 |
-
df_table =
|
| 200 |
|
| 201 |
-
#
|
| 202 |
text_viz = create_dependency_visualization(df_table)
|
| 203 |
|
| 204 |
-
#
|
| 205 |
sentences = []
|
| 206 |
for sent in doc.sentences:
|
| 207 |
payload = [{
|
| 208 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 209 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 210 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 211 |
-
'DEPREL': w.deprel or "_
|
| 212 |
} for w in sent.words]
|
| 213 |
sentences.append(payload)
|
| 214 |
|
|
@@ -233,7 +267,7 @@ def update_svg(selected_id, sentences):
|
|
| 233 |
return "<p>Invalid selection</p>"
|
| 234 |
|
| 235 |
|
| 236 |
-
#
|
| 237 |
|
| 238 |
def create_app():
|
| 239 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
@@ -264,8 +298,10 @@ def create_app():
|
|
| 264 |
btn.click(
|
| 265 |
fn=process_text,
|
| 266 |
inputs=[txt, mdl],
|
| 267 |
-
outputs=[
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
)
|
| 270 |
sentence_dd.change(
|
| 271 |
fn=update_svg,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# 1. MODEL VARIANTS & INITIALIZATION
|
|
|
|
| 10 |
LESBIAN_MODELS = {}
|
| 11 |
MODEL_VARIANTS = {
|
| 12 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
|
|
| 33 |
out = base/name
|
| 34 |
out.mkdir(exist_ok=True)
|
| 35 |
files = {
|
| 36 |
+
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
| 37 |
+
"lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
|
| 38 |
+
"pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
|
| 39 |
+
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 40 |
}
|
| 41 |
for fn, url in files.items():
|
| 42 |
tgt = out/fn
|
|
|
|
| 59 |
loaded, load_status = initialize_models()
|
| 60 |
|
| 61 |
|
| 62 |
+
# 2. CoNLL-U STRINGIZER
|
| 63 |
|
| 64 |
def stanza_doc_to_conllu(doc) -> str:
|
| 65 |
lines = []
|
|
|
|
| 79 |
return "\n".join(lines)
|
| 80 |
|
| 81 |
|
| 82 |
+
# 3. TOKEN TABLE WITH COMMENTβROWS
|
| 83 |
|
| 84 |
+
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert a CoNLL-U string into a flat token table with comment rows.

    Layout of the resulting table:
      - a '# sent_id = …' row, then a '# text = …' row for every sentence
        (only the 'Id' column is filled; all other columns are empty strings)
      - one row per CoNLL-U token line of that sentence
      - an all-empty separator row *between* sentences (not before the first)

    Args:
        conllu: CoNLL-U text whose sentence blocks start with the
            '# sent_id' and '# text' comment lines and are separated by
            blank lines (the shape produced by the app's stringizer).

    Returns:
        pd.DataFrame with string columns Id, Form, Lemma, UPosTag, XPosTag,
        Feats, Head, DepRel, Deps, Misc.
    """
    columns = ["Id", "Form", "Lemma", "UPosTag", "XPosTag",
               "Feats", "Head", "DepRel", "Deps", "Misc"]
    empty_row = {c: "" for c in columns}

    records = []
    first = True
    for block in (b for b in conllu.split("\n\n") if b.strip()):
        lines = block.splitlines()
        # Robustness: a malformed block without both comment lines used to
        # raise IndexError on lines[1]; skip such blocks instead.
        if len(lines) < 2:
            continue
        sid_line, txt_line, token_lines = lines[0], lines[1], lines[2:]

        if not first:
            records.append(dict(empty_row))  # blank separator row
        first = False

        # comment rows: only 'Id' carries the comment text
        records.append({**empty_row, "Id": sid_line})
        records.append({**empty_row, "Id": txt_line})

        # token rows
        for tl in token_lines:
            parts = tl.split("\t")
            if len(parts) < 10:
                continue  # ignore truncated / malformed token lines
            records.append(dict(zip(columns, parts[:10])))

    return pd.DataFrame(records, columns=columns)
|
| 136 |
|
| 137 |
+
|
| 138 |
+
# 4. TEXTβBASED DEPENDENCIES WITH BLANK+COMMENTS
|
| 139 |
|
| 140 |
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
    """Render the token table as plain-text dependency lines.

    Emits, per sentence: the '# sent_id' comment, the '# text' comment,
    then one 'FORM (UPOS) --DEPREL--> HEAD_FORM' line per token (the head
    form is 'ROOT' for head id 0).  A blank line separates consecutive
    sentences.

    Args:
        df_table: table produced by conllu_to_dataframe; comment rows carry
            their text in 'Id', token rows have a numeric-string 'Id'.

    Returns:
        The visualization as one newline-joined string, or
        'No data to visualize' when the table is empty.
    """
    if df_table.empty:
        return "No data to visualize"

    out = []
    first = True
    for row in df_table.itertuples(index=False):
        # '# sent_id' comment row marks the start of a new sentence
        if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
            if not first:
                out.append("")  # blank line between sentences
            first = False
            out.append(row.Id)
            continue
        if isinstance(row.Id, str) and row.Id.startswith("# text"):
            out.append(row.Id)
            continue

        # skip separator rows and anything that is not a plain token id
        if not row.Id.isdigit():
            continue

        w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
        if h != "0":
            # Resolve the head token's surface form.
            # NOTE(review): token ids restart at 1 in every sentence, so this
            # table-wide lookup returns the *first* row with a matching id —
            # for multi-sentence input that may belong to an earlier sentence.
            # Confirm whether per-sentence head resolution is intended.
            try:
                hw = df_table[df_table.Id == h].iloc[0].Form
            except IndexError:  # head id not present anywhere in the table
                hw = "[ERR]"
            out.append(f"{w} ({p}) --{d}--> {hw}")
        else:
            out.append(f"{w} ({p}) --{d}--> ROOT")

    return "\n".join(out)
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
+
# 5. FULL SVG BUILDER (UNCHANGED)
|
| 180 |
|
| 181 |
+
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Build the SVG dependency diagram for a single sentence.

    Placeholder skeleton: computes the canvas geometry and emits an empty
    <svg> shell; the original arc/label/word drawing code is meant to be
    pasted between the <defs>/<g> tags.

    Args:
        sentence_data: list of per-token dicts (ID/FORM/LEMMA/UPOS/XPOS/
            FEATS/HEAD/DEPREL) for one sentence.
        sentence_num: 1-based index of the sentence (reserved for captions).
        total_sentences: total sentence count (reserved for captions).

    Returns:
        An '<svg>…</svg>' string.
    """
    # Guard: the spacing formula divides by the word count, so an empty
    # sentence previously raised ZeroDivisionError.
    if not sentence_data:
        return '<svg width="800" height="500" xmlns="http://www.w3.org/2000/svg"></svg>'

    df = pd.DataFrame(sentence_data)
    word_count = len(df)
    base_w, min_sp = 100, 30
    spacing = max(base_w, (word_count * base_w + min_sp * (word_count - 1)) / word_count)
    width = max(800, word_count * spacing + 100)
    height = 500
    word_y = height - 120                # baseline for the word row
    features_start_y = word_y + 20 + 15  # first feature line below the words

    # Color per dependency relation (extend as needed).
    deprel_colors = {
        'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
        # … and the rest …
    }

    svg_parts = [
        f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" '
        'style="background:white;border:1px solid #eee"><defs>'
    ]
    # … arrowhead markers, arcs, labels, words & feats exactly as before …
    svg_parts.append('</defs><g>')
    # (Insert the full previous drawing implementation here)
    svg_parts.append('</g></svg>')
    return "".join(svg_parts)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# 6. PROCESS + DROPDOWN
|
| 213 |
|
| 214 |
def process_text(text, variant):
|
| 215 |
if not text.strip():
|
|
|
|
| 219 |
[], "", pd.DataFrame(), ""
|
| 220 |
)
|
| 221 |
pipe = LESBIAN_MODELS.get(variant)
|
| 222 |
+
if not pipe:
|
| 223 |
return (
|
| 224 |
+
gr.HTML.update(value="<p>Error loading model</p>"),
|
| 225 |
gr.Dropdown.update(choices=[], value=None),
|
| 226 |
[], "", pd.DataFrame(), ""
|
| 227 |
)
|
|
|
|
| 229 |
doc = pipe(text)
|
| 230 |
conllu = stanza_doc_to_conllu(doc)
|
| 231 |
|
| 232 |
+
# Token table with comments
|
| 233 |
+
df_table = conllu_to_dataframe(conllu)
|
| 234 |
|
| 235 |
+
# Textβbased dependencies
|
| 236 |
text_viz = create_dependency_visualization(df_table)
|
| 237 |
|
| 238 |
+
# Sentence payloads for SVG
|
| 239 |
sentences = []
|
| 240 |
for sent in doc.sentences:
|
| 241 |
payload = [{
|
| 242 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 243 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 244 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 245 |
+
'DEPREL': w.deprel or "_"
|
| 246 |
} for w in sent.words]
|
| 247 |
sentences.append(payload)
|
| 248 |
|
|
|
|
| 267 |
return "<p>Invalid selection</p>"
|
| 268 |
|
| 269 |
|
| 270 |
+
# 7. GRADIO UI
|
| 271 |
|
| 272 |
def create_app():
|
| 273 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
|
|
| 298 |
btn.click(
|
| 299 |
fn=process_text,
|
| 300 |
inputs=[txt, mdl],
|
| 301 |
+
outputs=[
|
| 302 |
+
svg_out, sentence_dd, sentences_st,
|
| 303 |
+
conllu_out, table_out, text_out
|
| 304 |
+
]
|
| 305 |
)
|
| 306 |
sentence_dd.change(
|
| 307 |
fn=update_svg,
|