Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,7 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
# 1. MODEL VARIANTS & INITIALIZATION
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
@@ -28,18 +28,18 @@ def download_model_file(url, filename):
|
|
| 28 |
|
| 29 |
def initialize_models():
|
| 30 |
try:
|
| 31 |
-
|
| 32 |
-
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
-
out =
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
-
"tokenizer.pt":
|
| 38 |
-
"lemmatizer.pt":
|
| 39 |
-
"pos.pt":
|
| 40 |
-
"depparse.pt":
|
| 41 |
}
|
| 42 |
-
for fn,url in files.items():
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
|
@@ -60,11 +60,11 @@ def initialize_models():
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
-
# 2.
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
| 67 |
-
for sid, sent in enumerate(doc.sentences,1):
|
| 68 |
lines.append(f"# sent_id = {sid}")
|
| 69 |
lines.append(f"# text = {sent.text}")
|
| 70 |
for w in sent.words:
|
|
@@ -73,126 +73,109 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 73 |
w.lemma or "_", w.upos or "_",
|
| 74 |
w.xpos or "_", w.feats or "_",
|
| 75 |
str(w.head) if w.head is not None else "0",
|
| 76 |
-
w.deprel or "_","_","_"
|
| 77 |
]
|
| 78 |
lines.append("\t".join(fields))
|
| 79 |
lines.append("") # blank line after each sentence
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
-
# 3. TOKEN TABLE
|
| 84 |
|
| 85 |
-
def
|
| 86 |
"""
|
| 87 |
-
|
| 88 |
-
- (
|
| 89 |
-
-
|
| 90 |
-
-
|
| 91 |
-
|
| 92 |
"""
|
| 93 |
-
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
| 94 |
records = []
|
|
|
|
| 95 |
first = True
|
| 96 |
-
|
| 97 |
for block in blocks:
|
| 98 |
lines = block.splitlines()
|
| 99 |
-
sid_line
|
| 100 |
-
text_line = lines[1]
|
| 101 |
token_lines = lines[2:]
|
| 102 |
-
|
| 103 |
if not first:
|
| 104 |
# blank row
|
| 105 |
-
records.append({c:"" for c in
|
| 106 |
-
["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]
|
| 107 |
-
})
|
| 108 |
first = False
|
| 109 |
-
|
| 110 |
# comment rows
|
| 111 |
-
records.append({"Id": sid_line,
|
| 112 |
-
|
| 113 |
-
records.append({"Id": text_line, "Form":"", "Lemma":"", "UPosTag":"",
|
| 114 |
-
"XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
| 115 |
-
|
| 116 |
# token rows
|
| 117 |
for tl in token_lines:
|
| 118 |
parts = tl.split("\t")
|
| 119 |
if len(parts) < 10:
|
| 120 |
continue
|
| 121 |
records.append({
|
| 122 |
-
"Id":
|
| 123 |
-
"Form":
|
| 124 |
-
"Lemma":
|
| 125 |
"UPosTag": parts[3],
|
| 126 |
"XPosTag": parts[4],
|
| 127 |
-
"Feats":
|
| 128 |
-
"Head":
|
| 129 |
-
"DepRel":
|
| 130 |
-
"Deps":
|
| 131 |
-
"Misc":
|
| 132 |
})
|
|
|
|
| 133 |
|
| 134 |
-
return pd.DataFrame(records, columns=[
|
| 135 |
-
"Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"
|
| 136 |
-
])
|
| 137 |
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def create_dependency_visualization(conllu: str) -> str:
|
| 142 |
"""
|
| 143 |
-
|
| 144 |
-
- blank line
|
| 145 |
-
-
|
| 146 |
-
- parse lines
|
| 147 |
"""
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
first = True
|
| 151 |
-
|
| 152 |
-
for
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
#
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
w
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
out.append(f"{w} ({upos}) --{deprel}--> ROOT")
|
| 180 |
-
|
| 181 |
-
return "\n".join(out)
|
| 182 |
|
| 183 |
|
| 184 |
-
# 5. SVG BUILDER (unchanged)
|
| 185 |
|
| 186 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
# … your SVG builder from the attached file …
|
| 192 |
-
return "<svg><!-- your SVG here --></svg>"
|
| 193 |
|
| 194 |
|
| 195 |
-
# 6. PROCESS
|
| 196 |
|
| 197 |
def process_text(text, variant):
|
| 198 |
if not text.strip():
|
|
@@ -209,24 +192,23 @@ def process_text(text, variant):
|
|
| 209 |
[], "", pd.DataFrame(), ""
|
| 210 |
)
|
| 211 |
|
| 212 |
-
# parse
|
| 213 |
doc = pipe(text)
|
| 214 |
conllu = stanza_doc_to_conllu(doc)
|
| 215 |
|
| 216 |
-
#
|
| 217 |
-
|
| 218 |
|
| 219 |
-
# text-based
|
| 220 |
-
|
| 221 |
|
| 222 |
-
# prepare
|
| 223 |
sentences = []
|
| 224 |
for sent in doc.sentences:
|
| 225 |
payload = [{
|
| 226 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 227 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 228 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 229 |
-
'DEPREL': w.deprel or "_
|
| 230 |
} for w in sent.words]
|
| 231 |
sentences.append(payload)
|
| 232 |
|
|
@@ -239,24 +221,23 @@ def process_text(text, variant):
|
|
| 239 |
dd_upd,
|
| 240 |
sentences,
|
| 241 |
conllu,
|
| 242 |
-
|
| 243 |
-
|
| 244 |
)
|
| 245 |
|
| 246 |
def update_svg(selected_id, sentences):
|
| 247 |
try:
|
| 248 |
-
idx = int(selected_id)-1
|
| 249 |
return create_single_sentence_svg(sentences[idx])
|
| 250 |
except:
|
| 251 |
return "<p>Invalid selection</p>"
|
| 252 |
|
| 253 |
|
| 254 |
-
# 7. GRADIO UI
|
| 255 |
|
| 256 |
def create_app():
|
| 257 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 258 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
| 259 |
-
|
| 260 |
if not loaded:
|
| 261 |
gr.Markdown(f"❌ Load error: {load_status}")
|
| 262 |
|
|
@@ -283,7 +264,8 @@ def create_app():
|
|
| 283 |
btn.click(
|
| 284 |
fn=process_text,
|
| 285 |
inputs=[txt, mdl],
|
| 286 |
-
outputs=[svg_out, sentence_dd, sentences_st,
|
|
|
|
| 287 |
)
|
| 288 |
sentence_dd.change(
|
| 289 |
fn=update_svg,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
|
|
| 28 |
|
| 29 |
def initialize_models():
|
| 30 |
try:
|
| 31 |
+
base = Path("./models")
|
| 32 |
+
base.mkdir(exist_ok=True)
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
+
out = base/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
+
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
| 38 |
+
"lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
|
| 39 |
+
"pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
|
| 40 |
+
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 41 |
}
|
| 42 |
+
for fn, url in files.items():
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
|
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
+
# ─── 2. CoNLL-U STRINGIZER ───────────────────────────────────────────────
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
| 67 |
+
for sid, sent in enumerate(doc.sentences, 1):
|
| 68 |
lines.append(f"# sent_id = {sid}")
|
| 69 |
lines.append(f"# text = {sent.text}")
|
| 70 |
for w in sent.words:
|
|
|
|
| 73 |
w.lemma or "_", w.upos or "_",
|
| 74 |
w.xpos or "_", w.feats or "_",
|
| 75 |
str(w.head) if w.head is not None else "0",
|
| 76 |
+
w.deprel or "_", "_", "_"
|
| 77 |
]
|
| 78 |
lines.append("\t".join(fields))
|
| 79 |
lines.append("") # blank line after each sentence
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
+
# ─── 3. TOKEN TABLE with comment-rows ────────────────────────────────────
|
| 84 |
|
| 85 |
+
def conllu_to_dataframe_table(conllu: str) -> pd.DataFrame:
|
| 86 |
"""
|
| 87 |
+
Insert:
|
| 88 |
+
- blank row (except first)
|
| 89 |
+
- # sent_id = β¦
|
| 90 |
+
- # text = β¦
|
| 91 |
+
before each sentence's tokens.
|
| 92 |
"""
|
|
|
|
| 93 |
records = []
|
| 94 |
+
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
| 95 |
first = True
|
|
|
|
| 96 |
for block in blocks:
|
| 97 |
lines = block.splitlines()
|
| 98 |
+
sid_line, txt_line = lines[0], lines[1]
|
|
|
|
| 99 |
token_lines = lines[2:]
|
|
|
|
| 100 |
if not first:
|
| 101 |
# blank row
|
| 102 |
+
records.append({c: "" for c in ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]})
|
|
|
|
|
|
|
| 103 |
first = False
|
|
|
|
| 104 |
# comment rows
|
| 105 |
+
records.append({"Id": sid_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
| 106 |
+
records.append({"Id": txt_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
|
|
|
|
|
|
|
|
|
| 107 |
# token rows
|
| 108 |
for tl in token_lines:
|
| 109 |
parts = tl.split("\t")
|
| 110 |
if len(parts) < 10:
|
| 111 |
continue
|
| 112 |
records.append({
|
| 113 |
+
"Id": parts[0],
|
| 114 |
+
"Form": parts[1],
|
| 115 |
+
"Lemma": parts[2],
|
| 116 |
"UPosTag": parts[3],
|
| 117 |
"XPosTag": parts[4],
|
| 118 |
+
"Feats": parts[5],
|
| 119 |
+
"Head": parts[6],
|
| 120 |
+
"DepRel": parts[7],
|
| 121 |
+
"Deps": parts[8],
|
| 122 |
+
"Misc": parts[9]
|
| 123 |
})
|
| 124 |
+
return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
# ─── 4. TEXT-BASED DEPENDENCIES with blank + comments ────────────────────
|
| 128 |
|
| 129 |
+
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
|
|
|
|
|
|
|
| 130 |
"""
|
| 131 |
+
Walk token-table rows, emitting:
|
| 132 |
+
- blank line + comment lines before each sentence
|
| 133 |
+
- dependency lines
|
|
|
|
| 134 |
"""
|
| 135 |
+
if df_table.empty:
|
| 136 |
+
return "No data to visualize"
|
| 137 |
+
lines = []
|
| 138 |
first = True
|
| 139 |
+
# We detect new sentence by comment rows in Id column starting with '# sent_id'
|
| 140 |
+
for row in df_table.itertuples(index=False):
|
| 141 |
+
if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
|
| 142 |
+
if not first:
|
| 143 |
+
lines.append("") # blank separator
|
| 144 |
+
first = False
|
| 145 |
+
# emit sent_id and text
|
| 146 |
+
lines.append(row.Id)
|
| 147 |
+
# next row in table is "# text = β¦"
|
| 148 |
+
continue
|
| 149 |
+
if isinstance(row.Id, str) and row.Id.startswith("# text"):
|
| 150 |
+
lines.append(row.Id)
|
| 151 |
+
continue
|
| 152 |
+
# skip blank/comment rows
|
| 153 |
+
if not row.Id.isdigit():
|
| 154 |
+
continue
|
| 155 |
+
# actual token row
|
| 156 |
+
w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
|
| 157 |
+
if h != "0":
|
| 158 |
+
# find head form
|
| 159 |
+
try:
|
| 160 |
+
hw = df_table[df_table.Id == h].iloc[0].Form
|
| 161 |
+
except:
|
| 162 |
+
hw = "[ERR]"
|
| 163 |
+
lines.append(f"{w} ({p}) --{d}--> {hw}")
|
| 164 |
+
else:
|
| 165 |
+
lines.append(f"{w} ({p}) --{d}--> ROOT")
|
| 166 |
+
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
|
| 169 |
+
# ─── 5. SVG BUILDER (unchanged) ──────────────────────────────────────────
|
| 170 |
|
| 171 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 172 |
+
# Paste your entire original SVG-generation code here unchanged
|
| 173 |
+
# It must accept sentence_data: list of dicts and return SVG string.
|
| 174 |
+
svg = "<svg><!-- your SVG here --></svg>"
|
| 175 |
+
return svg
|
|
|
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
+
# ─── 6. PROCESS & DROPDOWN LOGIC ─────────────────────────────────────────
|
| 179 |
|
| 180 |
def process_text(text, variant):
|
| 181 |
if not text.strip():
|
|
|
|
| 192 |
[], "", pd.DataFrame(), ""
|
| 193 |
)
|
| 194 |
|
|
|
|
| 195 |
doc = pipe(text)
|
| 196 |
conllu = stanza_doc_to_conllu(doc)
|
| 197 |
|
| 198 |
+
# token table with comments
|
| 199 |
+
df_table = conllu_to_dataframe_table(conllu)
|
| 200 |
|
| 201 |
+
# text-based dependencies
|
| 202 |
+
text_viz = create_dependency_visualization(df_table)
|
| 203 |
|
| 204 |
+
# prepare for SVG dropdown
|
| 205 |
sentences = []
|
| 206 |
for sent in doc.sentences:
|
| 207 |
payload = [{
|
| 208 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 209 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 210 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 211 |
+
'DEPREL': w.deprel or "_"
|
| 212 |
} for w in sent.words]
|
| 213 |
sentences.append(payload)
|
| 214 |
|
|
|
|
| 221 |
dd_upd,
|
| 222 |
sentences,
|
| 223 |
conllu,
|
| 224 |
+
df_table,
|
| 225 |
+
text_viz
|
| 226 |
)
|
| 227 |
|
| 228 |
def update_svg(selected_id, sentences):
|
| 229 |
try:
|
| 230 |
+
idx = int(selected_id) - 1
|
| 231 |
return create_single_sentence_svg(sentences[idx])
|
| 232 |
except:
|
| 233 |
return "<p>Invalid selection</p>"
|
| 234 |
|
| 235 |
|
| 236 |
+
# ─── 7. GRADIO UI ────────────────────────────────────────────────────────
|
| 237 |
|
| 238 |
def create_app():
|
| 239 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 240 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
|
|
|
| 241 |
if not loaded:
|
| 242 |
gr.Markdown(f"❌ Load error: {load_status}")
|
| 243 |
|
|
|
|
| 264 |
btn.click(
|
| 265 |
fn=process_text,
|
| 266 |
inputs=[txt, mdl],
|
| 267 |
+
outputs=[svg_out, sentence_dd, sentences_st,
|
| 268 |
+
conllu_out, table_out, text_out]
|
| 269 |
)
|
| 270 |
sentence_dd.change(
|
| 271 |
fn=update_svg,
|