Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,7 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
#
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
@@ -34,20 +34,18 @@ def initialize_models():
|
|
| 34 |
out = models_dir/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
-
"tokenizer.pt":
|
| 38 |
-
"lemmatizer.pt":
|
| 39 |
-
"pos.pt":
|
| 40 |
-
"depparse.pt":
|
| 41 |
}
|
| 42 |
-
for fn,
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
| 46 |
cfg = {
|
| 47 |
'processors': 'tokenize,pos,lemma,depparse',
|
| 48 |
-
'lang': 'el',
|
| 49 |
-
'use_gpu': False,
|
| 50 |
-
'verbose': False,
|
| 51 |
'tokenize_model_path': str(out/"tokenizer.pt"),
|
| 52 |
'pos_model_path': str(out/"pos.pt"),
|
| 53 |
'lemma_model_path': str(out/"lemmatizer.pt"),
|
|
@@ -62,95 +60,139 @@ def initialize_models():
|
|
| 62 |
loaded, load_status = initialize_models()
|
| 63 |
|
| 64 |
|
| 65 |
-
#
|
| 66 |
|
| 67 |
def stanza_doc_to_conllu(doc) -> str:
|
| 68 |
lines = []
|
| 69 |
-
for sid, sent in enumerate(doc.sentences,
|
| 70 |
lines.append(f"# sent_id = {sid}")
|
| 71 |
lines.append(f"# text = {sent.text}")
|
| 72 |
for w in sent.words:
|
| 73 |
fields = [
|
| 74 |
-
str(w.id),
|
| 75 |
-
w.
|
| 76 |
-
w.
|
| 77 |
-
w.upos or "_",
|
| 78 |
-
w.xpos or "_",
|
| 79 |
-
w.feats or "_",
|
| 80 |
str(w.head) if w.head is not None else "0",
|
| 81 |
-
w.deprel or "_",
|
| 82 |
-
"_",
|
| 83 |
-
"_"
|
| 84 |
]
|
| 85 |
lines.append("\t".join(fields))
|
| 86 |
-
lines.append("")
|
| 87 |
return "\n".join(lines)
|
| 88 |
|
|
|
|
|
|
|
|
|
|
| 89 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
|
| 90 |
-
"""
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
])
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
"""
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
| 118 |
"""
|
| 119 |
-
if
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
lines = []
|
| 123 |
first = True
|
| 124 |
-
for row in df.itertuples(index=False):
|
| 125 |
-
if row.Id == "1":
|
| 126 |
-
if not first:
|
| 127 |
-
lines.append("") # blank line between sentences
|
| 128 |
-
first = False
|
| 129 |
-
|
| 130 |
-
w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
|
| 131 |
-
if h != "0":
|
| 132 |
-
try:
|
| 133 |
-
hw = df[df.Id == h].iloc[0].Form
|
| 134 |
-
except:
|
| 135 |
-
hw = "[ERR]"
|
| 136 |
-
lines.append(f"{w} ({p}) --{d}--> {hw}")
|
| 137 |
-
else:
|
| 138 |
-
lines.append(f"{w} ({p}) --{d}--> ROOT")
|
| 139 |
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
# βββ 3. FULL SVG BUILDER βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 145 |
"""
|
| 146 |
-
Paste your original
|
| 147 |
-
It
|
| 148 |
"""
|
| 149 |
-
# β¦ your
|
| 150 |
return "<svg><!-- your SVG here --></svg>"
|
| 151 |
|
| 152 |
|
| 153 |
-
#
|
| 154 |
|
| 155 |
def process_text(text, variant):
|
| 156 |
if not text.strip():
|
|
@@ -167,12 +209,17 @@ def process_text(text, variant):
|
|
| 167 |
[], "", pd.DataFrame(), ""
|
| 168 |
)
|
| 169 |
|
| 170 |
-
|
|
|
|
| 171 |
conllu = stanza_doc_to_conllu(doc)
|
|
|
|
|
|
|
| 172 |
df = conllu_to_dataframe(conllu)
|
| 173 |
-
text_v = create_dependency_visualization(df)
|
| 174 |
|
| 175 |
-
#
|
|
|
|
|
|
|
|
|
|
| 176 |
sentences = []
|
| 177 |
for sent in doc.sentences:
|
| 178 |
payload = [{
|
|
@@ -198,13 +245,13 @@ def process_text(text, variant):
|
|
| 198 |
|
| 199 |
def update_svg(selected_id, sentences):
|
| 200 |
try:
|
| 201 |
-
idx = int(selected_id)
|
| 202 |
return create_single_sentence_svg(sentences[idx])
|
| 203 |
except:
|
| 204 |
return "<p>Invalid selection</p>"
|
| 205 |
|
| 206 |
|
| 207 |
-
#
|
| 208 |
|
| 209 |
def create_app():
|
| 210 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
@@ -215,16 +262,10 @@ def create_app():
|
|
| 215 |
|
| 216 |
with gr.Row():
|
| 217 |
with gr.Column():
|
| 218 |
-
txt = gr.Textbox(
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
)
|
| 223 |
-
mdl = gr.Radio(
|
| 224 |
-
choices=list(MODEL_VARIANTS.keys()),
|
| 225 |
-
value="Lesbian-only",
|
| 226 |
-
label="Model Variant"
|
| 227 |
-
)
|
| 228 |
btn = gr.Button("Parse", variant="primary")
|
| 229 |
|
| 230 |
with gr.Row():
|
|
@@ -235,29 +276,14 @@ def create_app():
|
|
| 235 |
|
| 236 |
with gr.Row():
|
| 237 |
with gr.Column():
|
| 238 |
-
conllu_out = gr.Textbox(
|
| 239 |
-
label="CoNLL-U",
|
| 240 |
-
lines=10,
|
| 241 |
-
show_copy_button=True
|
| 242 |
-
)
|
| 243 |
table_out = gr.Dataframe(label="Token Table")
|
| 244 |
-
text_out = gr.Textbox(
|
| 245 |
-
label="Text-based Dependencies",
|
| 246 |
-
lines=8,
|
| 247 |
-
show_copy_button=True
|
| 248 |
-
)
|
| 249 |
|
| 250 |
btn.click(
|
| 251 |
fn=process_text,
|
| 252 |
inputs=[txt, mdl],
|
| 253 |
-
outputs=[
|
| 254 |
-
svg_out,
|
| 255 |
-
sentence_dd,
|
| 256 |
-
sentences_st,
|
| 257 |
-
conllu_out,
|
| 258 |
-
table_out,
|
| 259 |
-
text_out
|
| 260 |
-
]
|
| 261 |
)
|
| 262 |
sentence_dd.change(
|
| 263 |
fn=update_svg,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# 1. MODEL VARIANTS & INITIALIZATION
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
|
|
| 34 |
out = models_dir/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
+
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
| 38 |
+
"lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
|
| 39 |
+
"pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
|
| 40 |
+
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 41 |
}
|
| 42 |
+
for fn,url in files.items():
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
| 46 |
cfg = {
|
| 47 |
'processors': 'tokenize,pos,lemma,depparse',
|
| 48 |
+
'lang': 'el', 'use_gpu': False, 'verbose': False,
|
|
|
|
|
|
|
| 49 |
'tokenize_model_path': str(out/"tokenizer.pt"),
|
| 50 |
'pos_model_path': str(out/"pos.pt"),
|
| 51 |
'lemma_model_path': str(out/"lemmatizer.pt"),
|
|
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
+
# 2. CONLL-U TO COARSE OUTPUT
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a parsed document to CoNLL-U text.

    Args:
        doc: Object exposing ``sentences``; each sentence provides ``text``
            and ``words``, and each word provides ``id``, ``text``,
            ``lemma``, ``upos``, ``xpos``, ``feats``, ``head`` and
            ``deprel`` (presumably a Stanza document - confirm with caller).

    Returns:
        One block per sentence: a ``# sent_id`` comment (numbered from 1),
        a ``# text`` comment, one tab-separated 10-column row per word, and
        a terminating blank line. An empty document yields ``""``.
    """
    lines = []
    for sid, sent in enumerate(doc.sentences, 1):
        lines.append(f"# sent_id = {sid}")
        # A comment must stay on a single physical line: the consumers of
        # this output split blocks with splitlines(), so a line break inside
        # the sentence text would otherwise leak into the token rows.
        single_line_text = " ".join(str(sent.text).splitlines())
        lines.append(f"# text = {single_line_text}")
        for w in sent.words:
            fields = [
                str(w.id),
                w.text,
                w.lemma or "_",
                w.upos or "_",
                w.xpos or "_",
                w.feats or "_",
                # A missing head is written as 0, i.e. attached to the root.
                str(w.head) if w.head is not None else "0",
                w.deprel or "_",
                "_",  # DEPS is not produced here
                "_",  # MISC is not produced here
            ]
            lines.append("\t".join(fields))
        lines.append("")  # blank line after each sentence
    return "\n".join(lines)
|
| 81 |
|
| 82 |
+
|
| 83 |
+
# 3. TOKEN TABLE: insert comment-rows + empty-rows per sentence
|
| 84 |
+
|
| 85 |
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert CoNLL-U text into a display table with per-sentence headers.

    The text is split into sentence blocks on blank lines. For each block:
      - (if not the first block) a row of empty strings is inserted;
      - every comment line (starting with ``#``) becomes a row whose ``Id``
        cell holds the whole comment and whose other cells are empty;
      - every token line with at least ten tab-separated fields becomes a
        row; shorter lines are ignored.

    Args:
        conllu: CoNLL-U formatted text; may be empty.

    Returns:
        A DataFrame of strings with the ten CoNLL-U columns. It is empty
        (but still has those columns) when the input contains no blocks.
    """
    columns = [
        "Id", "Form", "Lemma", "UPosTag", "XPosTag",
        "Feats", "Head", "DepRel", "Deps", "Misc",
    ]
    empty_row = dict.fromkeys(columns, "")
    records = []

    blocks = [b for b in conllu.split("\n\n") if b.strip()]
    for index, block in enumerate(blocks):
        if index:
            # Visual separator between consecutive sentences.
            records.append(dict(empty_row))

        for line in block.splitlines():
            # Classify each line by its content instead of assuming the
            # first two lines are comments, so a block with fewer or more
            # comment lines neither raises IndexError nor loses tokens.
            if line.startswith("#"):
                records.append({**empty_row, "Id": line})
                continue

            parts = line.split("\t")
            if len(parts) < len(columns):
                continue
            # zip() stops after the tenth field; any extras are ignored.
            records.append(dict(zip(columns, parts)))

    return pd.DataFrame(records, columns=columns)
|
| 137 |
|
| 138 |
+
|
| 139 |
+
# 4. TEXT-BASED DEPENDENCIES: blank + comment per sentence
|
| 140 |
+
|
| 141 |
+
def create_dependency_visualization(conllu: str) -> str:
    """Render CoNLL-U text as one ``form (UPOS) --deprel--> head`` line per token.

    The text is split into sentence blocks on blank lines. For each block
    the output contains a blank separator line (if not the first block),
    the block's comment lines, then one line per token.

    Args:
        conllu: CoNLL-U formatted text; may be empty.

    Returns:
        The rendered lines joined with newlines. A head of ``"0"`` is shown
        as ``ROOT``; a head id that does not occur in the same sentence is
        shown as ``[ERR]`` rather than being mistaken for the root. Token
        lines with fewer than eight fields are skipped.
    """
    out = []
    blocks = [b for b in conllu.split("\n\n") if b.strip()]

    for index, block in enumerate(blocks):
        if index:
            out.append("")  # blank line separator

        # Classify lines by content so that a block with fewer than two
        # comment lines does not raise IndexError or lose its tokens.
        comments = []
        rows = []
        for line in block.splitlines():
            if line.startswith("#"):
                comments.append(line)
            else:
                rows.append(line.split("\t"))
        out.extend(comments)

        # Head ids are resolved per sentence, since ids restart in each block.
        id2form = {row[0]: row[1] for row in rows if len(row) >= 2}

        for row in rows:
            if len(row) < 8:
                continue
            form, upos, head, deprel = row[1], row[3], row[6], row[7]
            target = "ROOT" if head == "0" else id2form.get(head, "[ERR]")
            out.append(f"{form} ({upos}) --{deprel}--> {target}")

    return "\n".join(out)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# 5. SVG BUILDER (unchanged)
|
| 185 |
|
|
|
|
| 186 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """
    Placeholder for the per-sentence SVG builder.

    Paste your entire original SVG-generation code here unchanged.
    It takes sentence_data: List[dict] and returns an <svg>...</svg> string.

    As written, this stub ignores all three arguments and always returns
    the same fixed placeholder markup.
    """
    # ... your SVG builder from the attached file ...
    return "<svg><!-- your SVG here --></svg>"
|
| 193 |
|
| 194 |
|
| 195 |
+
# 6. PROCESS + DROPDOWN
|
| 196 |
|
| 197 |
def process_text(text, variant):
|
| 198 |
if not text.strip():
|
|
|
|
| 209 |
[], "", pd.DataFrame(), ""
|
| 210 |
)
|
| 211 |
|
| 212 |
+
# parse
|
| 213 |
+
doc = pipe(text)
|
| 214 |
conllu = stanza_doc_to_conllu(doc)
|
| 215 |
+
|
| 216 |
+
# build token table
|
| 217 |
df = conllu_to_dataframe(conllu)
|
|
|
|
| 218 |
|
| 219 |
+
# text-based deps
|
| 220 |
+
text_v = create_dependency_visualization(conllu)
|
| 221 |
+
|
| 222 |
+
# prepare sentence payloads for SVG
|
| 223 |
sentences = []
|
| 224 |
for sent in doc.sentences:
|
| 225 |
payload = [{
|
|
|
|
| 245 |
|
| 246 |
def update_svg(selected_id, sentences):
    """Return the SVG for the sentence chosen by its 1-based id.

    Args:
        selected_id: 1-based sentence id, as an int or a numeric string.
        sentences: Sequence of per-sentence payloads passed on to
            ``create_single_sentence_svg``.

    Returns:
        The SVG markup for the selected sentence, or
        ``"<p>Invalid selection</p>"`` when the id is missing, non-numeric
        or out of range, or when the SVG builder fails.
    """
    invalid = "<p>Invalid selection</p>"

    try:
        idx = int(selected_id) - 1
        # Reject ids below 1 explicitly: a negative index would otherwise
        # silently select a sentence counted from the end of the list.
        if idx < 0:
            return invalid
        sentence = sentences[idx]
    except (TypeError, ValueError, IndexError, KeyError):
        return invalid

    try:
        return create_single_sentence_svg(sentence)
    except Exception:
        # Keep the original best-effort fallback for builder failures, but
        # let KeyboardInterrupt/SystemExit propagate (no bare except).
        return invalid
|
| 252 |
|
| 253 |
|
| 254 |
+
# 7. GRADIO UI
|
| 255 |
|
| 256 |
def create_app():
|
| 257 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
|
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
with gr.Column():
|
| 265 |
+
txt = gr.Textbox(label="Input Text", lines=4,
|
| 266 |
+
placeholder="ΞΞΉΟάγΡΟΞ΅ κΡίμΡνοβ¦")
|
| 267 |
+
mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
|
| 268 |
+
value="Lesbian-only", label="Model Variant")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
btn = gr.Button("Parse", variant="primary")
|
| 270 |
|
| 271 |
with gr.Row():
|
|
|
|
| 276 |
|
| 277 |
with gr.Row():
|
| 278 |
with gr.Column():
|
| 279 |
+
conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
table_out = gr.Dataframe(label="Token Table")
|
| 281 |
+
text_out = gr.Textbox(label="Text-based Dependencies", lines=8, show_copy_button=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
btn.click(
|
| 284 |
fn=process_text,
|
| 285 |
inputs=[txt, mdl],
|
| 286 |
+
outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out, text_out]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
)
|
| 288 |
sentence_dd.change(
|
| 289 |
fn=update_svg,
|