Update app.py
Browse files
app.py
CHANGED
|
@@ -9,173 +9,916 @@ import requests
|
|
| 9 |
from pathlib import Path
|
| 10 |
import json
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
MODEL_CONFIGS = {
|
| 15 |
-
"Lesbian-only": {
|
| 16 |
-
"files": {
|
| 17 |
-
"tokenizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/tokenizer.pt",
|
| 18 |
-
"lemmatizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/lemmatizer.pt",
|
| 19 |
-
"pos.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/pos.pt",
|
| 20 |
-
"depparse.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/depparse.pt"
|
| 21 |
-
}
|
| 22 |
-
},
|
| 23 |
-
"Lesbian-synthetic-data": {
|
| 24 |
-
"files": {
|
| 25 |
-
"tokenizer.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/tokenizer.pt",
|
| 26 |
-
"lemmatizer.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/lemmatizer.pt",
|
| 27 |
-
"pos.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/pos.pt",
|
| 28 |
-
"depparse.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/depparse.pt"
|
| 29 |
-
}
|
| 30 |
-
}
|
| 31 |
-
}
|
| 32 |
|
| 33 |
def download_model_file(url, filename):
|
|
|
|
| 34 |
try:
|
|
|
|
| 35 |
response = requests.get(url, stream=True)
|
| 36 |
response.raise_for_status()
|
|
|
|
| 37 |
with open(filename, 'wb') as f:
|
| 38 |
for chunk in response.iter_content(chunk_size=8192):
|
| 39 |
f.write(chunk)
|
|
|
|
| 40 |
return True
|
| 41 |
except Exception as e:
|
| 42 |
print(f"Failed to download {filename}: {e}")
|
| 43 |
return False
|
| 44 |
|
| 45 |
-
def
|
| 46 |
-
|
|
|
|
| 47 |
models_dir.mkdir(exist_ok=True)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
if not local_path.exists():
|
| 52 |
if not download_model_file(url, str(local_path)):
|
| 53 |
-
return False, f"Failed to download {
|
|
|
|
| 54 |
return True, models_dir
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
'
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
def stanza_doc_to_conllu(doc):
|
|
|
|
| 81 |
conllu_lines = []
|
|
|
|
| 82 |
for sent_idx, sentence in enumerate(doc.sentences):
|
|
|
|
| 83 |
conllu_lines.append(f"# sent_id = {sent_idx + 1}")
|
| 84 |
conllu_lines.append(f"# text = {sentence.text}")
|
|
|
|
| 85 |
for word in sentence.words:
|
|
|
|
| 86 |
fields = [
|
| 87 |
-
str(word.id),
|
| 88 |
-
word.
|
| 89 |
-
word.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
]
|
| 91 |
conllu_lines.append("\t".join(fields))
|
|
|
|
|
|
|
| 92 |
conllu_lines.append("")
|
|
|
|
| 93 |
return "\n".join(conllu_lines)
|
| 94 |
|
| 95 |
-
def
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
parts = line.split('\t')
|
| 107 |
if len(parts) >= 10:
|
| 108 |
-
|
| 109 |
-
'ID': parts[0],
|
| 110 |
-
'
|
| 111 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
})
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
if not text.strip():
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
try:
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
except Exception as e:
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
df = dfs[index]
|
| 143 |
-
tree = create_simple_tree(df)
|
| 144 |
-
return tree, f"Sentence {index+1} of {len(dfs)}"
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def create_gradio_app():
|
| 147 |
-
with gr.Blocks() as
|
| 148 |
-
gr.Markdown("
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
app = create_gradio_app()
|
| 181 |
-
app
|
|
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
import json
|
| 11 |
|
| 12 |
+
# Module-level slot for the loaded Stanza pipeline. It stays None until
# initialize_lesbian_greek_model() assigns a pipeline to it.
LESBIAN_GREEK_MODEL = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def download_model_file(url, filename, timeout=30):
    """Download a model file from Hugging Face.

    The body is streamed into ``<filename>.part`` and renamed onto
    ``filename`` only after every chunk has been written, so an interrupted
    transfer never leaves a truncated file under the final name (callers
    that skip the download when the file exists would otherwise keep a
    corrupt model forever).

    Args:
        url: Address of the file to fetch.
        filename: Destination path on disk.
        timeout: Seconds to wait for the connection and for each read
            before giving up.

    Returns:
        True if the file was downloaded completely, False otherwise.
    """
    target = Path(filename)
    partial = target.with_name(target.name + ".part")
    try:
        print(f"Downloading {filename}...")
        # The context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        partial.replace(target)
        print(f"Successfully downloaded {filename}")
        return True
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download {filename}: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            partial.unlink()
        except OSError:
            pass
        return False
|
| 30 |
|
| 31 |
+
def setup_lesbian_greek_models():
    """Ensure the four Lesbian Greek model files exist locally.

    Files already present under ``./lesbian_greek_models`` are left
    untouched; each missing one is fetched with ``download_model_file``.

    Returns:
        ``(True, models_dir)`` when every file is available, otherwise
        ``(False, message)`` naming the first file that failed to download.
    """
    models_dir = Path("./lesbian_greek_models")
    models_dir.mkdir(exist_ok=True)

    # Local file name -> download URL in the Hugging Face repository.
    model_files = {
        "tokenizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/tokenizer.pt",
        "lemmatizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/lemmatizer.pt",
        "pos.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/pos.pt",
        "depparse.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/depparse.pt"
    }

    for local_name, url in model_files.items():
        destination = models_dir / local_name
        if destination.exists():
            continue
        fetched = download_model_file(url, str(destination))
        if not fetched:
            return False, f"Failed to download {local_name}"

    return True, models_dir
|
| 52 |
|
| 53 |
+
def _custom_model_paths(models_dir):
    """Return the Stanza ``*_model_path`` keyword arguments for the downloaded files."""
    return {
        'tokenize_model_path': str(models_dir / "tokenizer.pt"),
        'pos_model_path': str(models_dir / "pos.pt"),
        'lemma_model_path': str(models_dir / "lemmatizer.pt"),
        'depparse_model_path': str(models_dir / "depparse.pt")
    }


def _load_with_temp_resources(models_dir):
    """Attempt 1: explicit model paths plus a stub resources.json in ./temp_stanza."""
    temp_stanza_dir = Path("./temp_stanza")
    temp_stanza_dir.mkdir(exist_ok=True)

    # Minimal resources.json that declares Greek with a "default" package.
    temp_resources = {
        "el": {
            "lang_name": "Greek",
            "packages": {
                "default": {
                    "processors": {
                        "tokenize": "default.pt",
                        "pos": "default.pt",
                        "lemma": "default.pt",
                        "depparse": "default.pt"
                    }
                }
            },
            "default_packages": {
                "tokenize": "default",
                "pos": "default",
                "lemma": "default",
                "depparse": "default"
            }
        }
    }

    temp_resources_path = temp_stanza_dir / "resources.json"
    with open(temp_resources_path, 'w', encoding='utf-8') as f:
        json.dump(temp_resources, f, indent=2)

    os.environ['STANZA_RESOURCES_DIR'] = str(temp_stanza_dir)

    model_paths = _custom_model_paths(models_dir)
    print("Trying direct model paths with temp resources...")
    for key, path in model_paths.items():
        print(f" {key}: {path}")
        if not Path(path).exists():
            raise FileNotFoundError(f"Model file not found: {path}")

    config = {
        'processors': 'tokenize,pos,lemma,depparse',
        'lang': 'el',
        'use_gpu': False,
        'verbose': False,
        'download_method': None,
        **model_paths
    }
    return stanza.Pipeline(**config)


def _load_as_package(models_dir):
    """Attempt 2: copy the models into ./stanza_resources as a "lesbian" package."""
    import shutil

    print("Trying improved package approach...")

    stanza_dir = Path("./stanza_resources")
    stanza_dir.mkdir(exist_ok=True)
    os.environ['STANZA_RESOURCES_DIR'] = str(stanza_dir)

    lang_dir = stanza_dir / "el"
    lang_dir.mkdir(exist_ok=True)

    # Processor name -> downloaded file; every copy is stored as lesbian.pt.
    model_files = {
        "tokenize": "tokenizer.pt",
        "pos": "pos.pt",
        "lemma": "lemmatizer.pt",
        "depparse": "depparse.pt"
    }
    for proc, file_name in model_files.items():
        proc_dir = lang_dir / proc
        proc_dir.mkdir(exist_ok=True)

        src_file = models_dir / file_name
        dst_file = proc_dir / "lesbian.pt"
        if src_file.exists():
            shutil.copy2(str(src_file), str(dst_file))
            print(f"Copied {file_name} to {dst_file}")

    resources_json = {
        "el": {
            "lang_name": "Greek",
            "packages": {
                "lesbian": {
                    "processors": {
                        "tokenize": "lesbian.pt",
                        "pos": "lesbian.pt",
                        "lemma": "lesbian.pt",
                        "depparse": "lesbian.pt"
                    }
                }
            },
            "default_packages": {
                "tokenize": "lesbian",
                "pos": "lesbian",
                "lemma": "lesbian",
                "depparse": "lesbian"
            }
        }
    }

    resources_path = stanza_dir / "resources.json"
    with open(resources_path, 'w', encoding='utf-8') as f:
        json.dump(resources_json, f, indent=2)
    print(f"Created improved resources.json at {resources_path}")

    config = {
        'processors': 'tokenize,pos,lemma,depparse',
        'lang': 'el',
        'package': 'lesbian',
        'use_gpu': False,
        'verbose': False,
        'download_method': None,
        'dir': str(stanza_dir)
    }
    return stanza.Pipeline(**config)


def _load_direct_paths(models_dir):
    """Attempt 3: explicit model paths with STANZA_RESOURCES_DIR cleared."""
    print("Trying direct paths without any package system...")

    # Drop the override set by the earlier attempts to avoid conflicts.
    if 'STANZA_RESOURCES_DIR' in os.environ:
        del os.environ['STANZA_RESOURCES_DIR']

    config = {
        'processors': 'tokenize,pos,lemma,depparse',
        'lang': 'el',
        'use_gpu': False,
        'verbose': False,
        **_custom_model_paths(models_dir)
    }
    return stanza.Pipeline(**config)


def _load_default_greek(models_dir):
    """Attempt 4: stock Greek pipeline; ``models_dir`` is ignored."""
    print("Falling back to default Greek models...")
    return stanza.Pipeline(
        lang='el',
        processors='tokenize,pos,lemma,depparse',
        use_gpu=False,
        verbose=False
    )


def initialize_lesbian_greek_model():
    """Initialize the Stanza pipeline for Lesbian Greek and store it globally.

    After the model files are in place, four loading attempts are tried in
    order; the first one that builds a pipeline wins and is assigned to
    ``LESBIAN_GREEK_MODEL``:

    1. explicit model paths with a stub resources.json,
    2. the models installed as a "lesbian" package in a local resources dir,
    3. explicit model paths with ``STANZA_RESOURCES_DIR`` unset,
    4. the default Greek pipeline.

    Returns:
        ``(True, status_message)`` on success, ``(False, error_message)`` if
        the model files could not be prepared or every attempt failed.
    """
    global LESBIAN_GREEK_MODEL
    try:
        print("Setting up Lesbian Greek models...")

        success, models_dir = setup_lesbian_greek_models()
        if not success:
            return False, models_dir  # models_dir contains error message in this case

        print("Initializing Lesbian Greek pipeline with custom models...")
        print("Attempting bypass of Stanza resource system...")

        custom_status = "✅ Custom Lesbian Greek models loaded successfully"
        # (loader, failure label, success log line or None, returned status)
        attempts = (
            (_load_with_temp_resources,
             "Direct approach with temp resources failed",
             "SUCCESS: Direct model loading with temp resources worked!",
             custom_status),
            (_load_as_package,
             "Package approach failed",
             "SUCCESS: Package approach worked!",
             custom_status),
            (_load_direct_paths,
             "Direct paths approach failed",
             "SUCCESS: Direct paths without package system worked!",
             custom_status),
            (_load_default_greek,
             "Even fallback failed",
             None,
             "⚠️ Using default Greek models (not Lesbian dialect specific)"),
        )

        last_error = None
        for loader, failure_label, success_log, status in attempts:
            try:
                LESBIAN_GREEK_MODEL = loader(models_dir)
            except Exception as attempt_error:
                # Any failure just moves on to the next, less specific attempt.
                print(f"{failure_label}: {attempt_error}")
                last_error = attempt_error
                continue
            if success_log is not None:
                print(success_log)
            return True, status

        return False, f"All approaches failed. Last error: {str(last_error)}"

    except Exception as e:
        error_msg = f"Failed to initialize: {e}"
        print(error_msg)
        traceback.print_exc()
        return False, error_msg
|
| 264 |
|
| 265 |
+
def stanza_doc_to_conllu(doc) -> str:
    """Convert a Stanza Document to CoNLL-U text.

    Each sentence yields a ``# sent_id`` comment (numbered from 1), a
    ``# text`` comment, one tab-separated ten-column line per word, and an
    empty line. Missing LEMMA/UPOS/XPOS/FEATS/DEPREL values become ``_``, a
    missing HEAD becomes ``0``, and DEPS and MISC are always ``_``.

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``text``
            and ``words``, and each word the attributes read below.

    Returns:
        The CoNLL-U lines joined with newlines.
    """
    conllu_lines = []

    for sent_number, sentence in enumerate(doc.sentences, start=1):
        conllu_lines.append(f"# sent_id = {sent_number}")
        # A comment must stay on one line; a line break inside the sentence
        # text would otherwise spill into the token rows.
        single_line_text = " ".join(sentence.text.splitlines())
        conllu_lines.append(f"# text = {single_line_text}")

        for word in sentence.words:
            # CoNLL-U format: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
            fields = [
                str(word.id),
                word.text,
                word.lemma if word.lemma else "_",
                word.upos if word.upos else "_",
                word.xpos if word.xpos else "_",
                word.feats if word.feats else "_",
                str(word.head) if word.head else "0",
                word.deprel if word.deprel else "_",
                "_",  # DEPS (enhanced dependencies)
                "_",  # MISC
            ]
            conllu_lines.append("\t".join(fields))

        # Empty line between sentences.
        conllu_lines.append("")

    return "\n".join(conllu_lines)
|
| 294 |
|
| 295 |
+
def parse_text_with_lesbian_greek(text: str) -> str:
    """Run the loaded pipeline on ``text`` and return the parse as CoNLL-U.

    Returns a string beginning with ``Error`` instead of raising when the
    pipeline has not been loaded, when ``text`` is blank, or when processing
    fails.
    """
    pipeline = LESBIAN_GREEK_MODEL
    if pipeline is None:
        return "Error: Lesbian Greek model not loaded. Please try refreshing the page."

    if text.strip() == "":
        return "Error: Please enter some text to parse."

    try:
        print(f"Processing Lesbian Greek text: {text[:50]}...")
        parsed_doc = pipeline(text)
        conllu_output = stanza_doc_to_conllu(parsed_doc)
        print("CoNLL-U conversion successful!")
    except Exception as e:
        # Report the failure to the caller as text and log the traceback.
        error_msg = f"Error processing text: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg

    return conllu_output
|
| 320 |
+
|
| 321 |
+
def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
    """Turn CoNLL-U text into a DataFrame with one row per token line.

    Comment lines, blank lines and lines with fewer than ten tab-separated
    fields are ignored. Text starting with ``Error`` (the parser's failure
    marker) and any processing failure yield an empty DataFrame.
    """
    if conllu_text.startswith("Error"):
        return pd.DataFrame()

    column_names = ('ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                    'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC')
    try:
        records = []
        for raw_line in conllu_text.strip().split('\n'):
            if raw_line.startswith('#') or not raw_line.strip():
                continue
            cells = raw_line.split('\t')
            if len(cells) < 10:
                continue
            # zip stops after the ten named columns; extra cells are dropped.
            records.append(dict(zip(column_names, cells)))
        return pd.DataFrame(records)
    except Exception as e:
        print(f"Error creating dataframe: {e}")
        return pd.DataFrame()
|
| 356 |
+
|
| 357 |
+
def create_dependency_tree_svg(df: pd.DataFrame) -> str:
    """Render the parsed tokens in ``df`` as HTML/SVG dependency trees.

    Rows are grouped into sentences by starting a new group whenever the ID
    column returns to 1. One sentence is drawn directly; several sentences
    are wrapped in the navigable multi-sentence view.

    Args:
        df: Token table with at least an ``ID`` column; the remaining
            columns are consumed by the sentence renderers.

    Returns:
        HTML markup; a ``<p>`` message when ``df`` is empty or rendering
        fails.
    """
    if df.empty:
        return "<p>No data to visualize</p>"

    try:
        sentences = []
        current_sentence = []

        for _, row in df.iterrows():
            if int(row['ID']) == 1 and current_sentence:  # New sentence starting
                sentences.append(current_sentence)
                current_sentence = []
            current_sentence.append(row)

        # df is not empty, so the loop ran and the last group holds >= 1 row.
        sentences.append(current_sentence)

        if len(sentences) > 1:
            return create_multi_sentence_svg(sentences)
        return create_single_sentence_svg(sentences[0])

    except Exception as e:
        return f"<p>Error creating visualization: {str(e)}</p>"
|
| 389 |
+
|
| 390 |
+
def create_multi_sentence_svg(sentences):
    """Create an HTML widget that pages through one SVG tree per sentence.

    Every sentence is rendered with ``create_single_sentence_svg``. The first
    SVG is embedded directly; all of them are also stored as JavaScript
    string literals so the Previous/Next buttons can swap the displayed tree.

    Args:
        sentences: List of per-sentence row collections accepted by
            ``create_single_sentence_svg``.

    Returns:
        HTML markup with navigation controls and an inline script, or a
        ``<p>`` placeholder when ``sentences`` is empty.
    """
    if not sentences:
        return "<p>No data to visualize</p>"

    total = len(sentences)
    sentence_svgs = [
        create_single_sentence_svg(sentence_data, sentence_num=number, total_sentences=total)
        for number, sentence_data in enumerate(sentences, start=1)
    ]

    # json.dumps yields a valid JavaScript string literal (quotes, backslashes,
    # newlines and control characters escaped). "</" is rewritten to "<\/" so
    # a "</script>" inside the markup cannot end the script element early.
    js_literals = [
        json.dumps(svg, ensure_ascii=False).replace("</", "<\\/")
        for svg in sentence_svgs
    ]

    # NOTE(review): whether the inline <script> runs depends on how the host
    # page inserts this HTML; confirm in the target UI.
    nav_html = f"""
    <div id="sentence_container" style="border: 1px solid #ddd; padding: 10px; background: white;">
        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; padding: 5px; background: #f8f9fa; border-radius: 5px;">
            <button onclick="previousSentence()" id="prevBtn" style="padding: 5px 10px; background: #007bff; color: white; border: none; border-radius: 3px; cursor: pointer;">← Previous</button>
            <span id="sentenceCounter" style="font-weight: bold; color: #333;">Sentence 1 of {total}</span>
            <button onclick="nextSentence()" id="nextBtn" style="padding: 5px 10px; background: #007bff; color: white; border: none; border-radius: 3px; cursor: pointer;">Next →</button>
        </div>
        <div id="sentenceDisplay">
            {sentence_svgs[0]}
        </div>
    </div>

    <script>
    (function() {{
        let currentSentence = 0;
        const sentences = [{', '.join(js_literals)}];
        const totalSentences = {total};

        function updateDisplay() {{
            document.getElementById('sentenceDisplay').innerHTML = sentences[currentSentence];
            document.getElementById('sentenceCounter').textContent = 'Sentence ' + (currentSentence + 1) + ' of ' + totalSentences;

            const prevBtn = document.getElementById('prevBtn');
            const nextBtn = document.getElementById('nextBtn');

            if (prevBtn) {{
                prevBtn.disabled = currentSentence === 0;
                prevBtn.style.opacity = currentSentence === 0 ? '0.5' : '1';
            }}
            if (nextBtn) {{
                nextBtn.disabled = currentSentence === totalSentences - 1;
                nextBtn.style.opacity = currentSentence === totalSentences - 1 ? '0.5' : '1';
            }}
        }}

        window.nextSentence = function() {{
            if (currentSentence < totalSentences - 1) {{
                currentSentence++;
                updateDisplay();
            }}
        }};

        window.previousSentence = function() {{
            if (currentSentence > 0) {{
                currentSentence--;
                updateDisplay();
            }}
        }};

        // Initialize
        updateDisplay();
    }})();
    </script>
    """

    return nav_html
|
| 463 |
+
|
| 464 |
+
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Create SVG for a single sentence with full morphological annotations.

    Words are laid out left to right by their ID. Each non-root word gets a
    curved, labelled arc to its head, stacked on higher levels when spans
    overlap; the root gets a vertical line labelled ROOT. UPOS, lemma (when
    it differs from the form), XPOS and every ``key=value`` feature are
    listed under the word. All text taken from the data is XML-escaped so
    tokens such as ``&`` or ``<`` cannot break the markup.

    Args:
        sentence_data: DataFrame, or list of row records, with the columns
            ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD and DEPREL.
        sentence_num: Accepted for callers; not used by the drawing code.
        total_sentences: Accepted for callers; not used by the drawing code.

    Returns:
        The SVG markup, or a ``<p>`` message when there is nothing to draw
        or drawing fails.
    """
    from html import escape

    try:
        # Convert to DataFrame if it's a list of records
        if isinstance(sentence_data, list):
            df = pd.DataFrame(sentence_data)
        else:
            df = sentence_data

        word_count = len(df)
        if word_count == 0:
            # Avoids the division by zero in the spacing formula below.
            return "<p>No data to visualize</p>"

        # Calculate spacing and dimensions
        base_word_width = 100
        min_spacing = 30
        word_spacing = max(
            base_word_width,
            (word_count * base_word_width + min_spacing * (word_count - 1)) / word_count,
        )

        width = max(800, word_count * word_spacing + 100)
        height = 500

        # Text positioning
        word_y = height - 120
        pos_y = word_y + 20
        features_start_y = pos_y + 15
        arc_base_y = word_y - 15  # arcs and the root line start just above the word

        deprel_colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688'
        }

        svg_parts = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" style="background: white; border: 1px solid #eee;">',
            '<defs>'
        ]

        # One arrowhead marker per known relation so it matches the arc color.
        for known_deprel, color in deprel_colors.items():
            marker_id = f"arrow_{known_deprel}"
            svg_parts.append(
                f'<marker id="{marker_id}" markerWidth="4" markerHeight="4" '
                f'markerUnits="userSpaceOnUse" orient="auto" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4 Z" fill="{color}"/>'
                f'</marker>'
            )

        svg_parts.append('</defs>')
        svg_parts.append('<g>')

        # Calculate word positions
        word_positions = {}
        for _, row in df.iterrows():
            word_id = int(row['ID'])
            word_positions[word_id] = 50 + (word_id - 1) * word_spacing

        # Draw dependency arcs; used_spans records ((start, end), level) of
        # arcs already placed so overlapping spans move to a higher level.
        used_spans = []

        for _, row in df.iterrows():
            word_id = int(row['ID'])
            head_id = int(row['HEAD']) if row['HEAD'] != '0' else 0
            deprel = row['DEPREL']
            color = deprel_colors.get(deprel, '#000000')

            if head_id == 0:  # Root dependency
                word_x = word_positions[word_id]

                svg_parts.append(
                    f'<line x1="{word_x}" y1="{arc_base_y}" x2="{word_x}" y2="50" '
                    f'stroke="{color}" stroke-width="1.5"/>'
                )

                root_label_y = (arc_base_y + 50) / 2
                # White background for ROOT label
                svg_parts.append(
                    f'<rect x="{word_x - 15}" y="{root_label_y - 8}" '
                    f'width="30" height="14" fill="white" stroke="{color}" stroke-width="1" rx="2"/>'
                )
                svg_parts.append(
                    f'<text x="{word_x}" y="{root_label_y + 2}" text-anchor="middle" fill="{color}" '
                    f'font-family="Arial, sans-serif" font-size="8" font-weight="bold">ROOT</text>'
                )
                continue

            if head_id not in word_positions:
                # Head is not part of this sentence: nothing to connect to.
                continue

            word_x = word_positions[word_id]
            head_x = word_positions[head_id]

            span_start = min(word_id, head_id)
            span_end = max(word_id, head_id)

            # Raise the level until no arc already on it overlaps this span.
            level = 0
            conflict_found = True
            while conflict_found:
                conflict_found = False
                for existing_span, existing_level in used_spans:
                    if existing_level != level:
                        continue
                    if not (span_end < existing_span[0] or span_start > existing_span[1]):
                        conflict_found = True
                        level += 1
                        break

            used_spans.append(((span_start, span_end), level))

            # Arc height grows with horizontal distance (capped) and level.
            span_distance = abs(head_x - word_x)
            base_height = min(40 + span_distance * 0.15, 100)
            arc_height = base_height + level * 35

            # Relations without their own marker reuse the root arrowhead.
            marker_id = f"arrow_{deprel}" if deprel in deprel_colors else "arrow_root"

            mid_x = (word_x + head_x) / 2
            control_y = word_y - arc_height

            path = f'M {word_x} {arc_base_y} Q {mid_x} {control_y} {head_x} {arc_base_y}'

            svg_parts.append(
                f'<path d="{path}" stroke="{color}" stroke-width="1.5" '
                f'fill="none" marker-end="url(#{marker_id})"/>'
            )

            # Midpoint (t = 0.5) of the quadratic curve, where the label sits.
            arc_mid_x = 0.25 * word_x + 0.5 * mid_x + 0.25 * head_x
            arc_mid_y = 0.25 * arc_base_y + 0.5 * control_y + 0.25 * arc_base_y

            label_width = len(deprel) * 6 + 8
            svg_parts.append(
                f'<rect x="{arc_mid_x - label_width/2}" y="{arc_mid_y - 8}" '
                f'width="{label_width}" height="14" fill="white" stroke="{color}" stroke-width="1" rx="2"/>'
            )

            svg_parts.append(
                f'<text x="{arc_mid_x}" y="{arc_mid_y + 2}" text-anchor="middle" fill="{color}" '
                f'font-family="Arial, sans-serif" font-size="8" font-weight="bold">'
                f'{escape(str(deprel), quote=False)}</text>'
            )

        # Draw words and complete morphological annotations
        for _, row in df.iterrows():
            word_id = int(row['ID'])
            word = row['FORM']
            pos = row['UPOS']
            lemma = row['LEMMA']
            feats = row['FEATS']
            xpos = row['XPOS']

            word_x = word_positions[word_id]

            # Main word text
            svg_parts.append(
                f'<text x="{word_x}" y="{word_y}" text-anchor="middle" fill="#000000" '
                f'font-family="Arial, sans-serif" font-size="13" font-weight="bold">'
                f'{escape(str(word), quote=False)}</text>'
            )

            annotations = []

            # Universal POS
            if pos and pos != '_':
                annotations.append(f"upos={pos}")

            # Lemma, only when it adds information beyond the surface form
            if lemma and lemma != '_' and lemma != word:
                annotations.append(f"lemma={lemma}")

            # Language-specific POS
            if xpos and xpos != '_':
                annotations.append(f"xpos={xpos}")

            # All morphological features of the form key=value
            if feats and feats != '_':
                annotations.extend(feat for feat in feats.split('|') if '=' in feat)

            for i, annotation in enumerate(annotations):
                y_pos = features_start_y + i * 12
                svg_parts.append(
                    f'<text x="{word_x}" y="{y_pos}" text-anchor="middle" '
                    f'fill="#666666" font-family="Arial, sans-serif" font-size="7">'
                    f'{escape(str(annotation), quote=False)}</text>'
                )

        svg_parts.append('</g>')
        svg_parts.append('</svg>')

        return ''.join(svg_parts)

    except Exception as e:
        return f"<p>Error creating single sentence visualization: {str(e)}</p>"
|
| 660 |
+
|
| 661 |
+
def create_dependency_visualization(df: pd.DataFrame) -> str:
    """Create a simple text-based dependency visualization.

    Each row of ``df`` becomes one line of the form
    ``FORM (UPOS) --DEPREL--> target`` where ``target`` is:

    * the FORM of the head token,
    * ``ROOT`` when HEAD is 0 (given either as a string or as a number),
    * ``[OUT_OF_RANGE]`` when HEAD points outside the table,
    * ``[ERROR]`` when HEAD cannot be interpreted as an integer.

    Args:
        df: Token table with FORM, UPOS, DEPREL and HEAD columns. HEAD is
            treated as a 1-based row position within ``df``.

    Returns:
        The multi-line report, ``"No data to visualize"`` for an empty
        frame, or an ``"Error creating visualization: ..."`` message when
        something unexpected fails (for example a missing column).
    """
    if df.empty:
        return "No data to visualize"

    try:
        viz_lines = ["Dependency Parse Visualization:", "=" * 50]

        # Heads are resolved by row position, so the forms are pulled out
        # once instead of calling df.iloc for every token.
        # NOTE(review): positional lookup presumably assumes the frame holds
        # a single sentence with consecutive IDs; confirm against the caller.
        forms = df['FORM'].tolist()

        for word, pos, deprel, head_id in zip(
            forms, df['UPOS'], df['DEPREL'], df['HEAD']
        ):
            try:
                # Normalise once so '0', 0 and numpy integers all mean root.
                head_num = int(head_id)
            except (TypeError, ValueError):
                target = "[ERROR]"
            else:
                if head_num == 0:
                    target = "ROOT"
                elif 1 <= head_num <= len(forms):
                    target = forms[head_num - 1]
                else:
                    target = "[OUT_OF_RANGE]"

            viz_lines.append(f"{word} ({pos}) --{deprel}--> {target}")

        return "\n".join(viz_lines)

    except Exception as e:
        return f"Error creating visualization: {str(e)}"
|
| 695 |
+
|
| 696 |
+
def process_text(text: str):
    """Parse ``text`` and build every output shown in the UI.

    Args:
        text: Raw user input. ``None``, an empty string and whitespace-only
            input are all treated as "nothing to parse".

    Returns:
        A 4-tuple ``(conllu, table, text_viz, svg_html)``:

        * ``conllu`` - the parser output, or a prompt/error message,
        * ``table`` - a ``pandas.DataFrame`` of tokens (empty on failure),
        * ``text_viz`` - the plain-text dependency listing or a status note,
        * ``svg_html`` - HTML for the dependency tree or a status note.
    """
    # Guard against None as well as blank input; calling .strip() on None
    # would raise AttributeError before any message could be returned.
    if not text or not text.strip():
        empty_df = pd.DataFrame()
        return "Please enter some Lesbian Greek text to parse.", empty_df, "No data to display", "<p>No data to visualize</p>"

    # Parse with custom Lesbian Greek model
    print(f"Starting to process: {text[:30]}...")
    conllu_output = parse_text_with_lesbian_greek(text)

    # The parser reports failure through a string that starts with "Error".
    if conllu_output.startswith("Error"):
        empty_df = pd.DataFrame()
        return conllu_output, empty_df, "Error in parsing", "<p>Error in parsing</p>"

    # Convert to DataFrame
    try:
        df = conllu_to_dataframe(conllu_output)

        if df.empty:
            return conllu_output, df, "No tokens found", "<p>No tokens found</p>"

        # Create visualizations
        text_visualization = create_dependency_visualization(df)
        svg_visualization = create_dependency_tree_svg(df)

        return conllu_output, df, text_visualization, svg_visualization

    except Exception as e:
        # Keep the raw CoNLL-U visible even if a later stage failed.
        error_msg = f"Error creating outputs: {str(e)}"
        print(error_msg)
        empty_df = pd.DataFrame()
        return conllu_output, empty_df, error_msg, f"<p>{error_msg}</p>"
|
| 728 |
|
| 729 |
+
# Load the Lesbian Greek pipeline once, at import time. The two results are
# kept as module-level names because the UI builder reads them.
print("Initializing Lesbian Greek Stanza model...")
model_loaded, status_message = initialize_lesbian_greek_model()

print(
    f"Model initialization result: {model_loaded}",
    f"Status: {status_message}",
    sep="\n",
)
|
| 735 |
+
|
| 736 |
+
# Create Gradio interface
|
| 737 |
def create_gradio_app():
    """Assemble the Gradio Blocks interface for the parser.

    The module-level ``model_loaded`` and ``status_message`` values decide
    which status banner is shown, whether the input box and button are
    interactive, and whether the parse callbacks and example texts are
    registered at all.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    with gr.Blocks(title="Lesbian Greek Morphosyntactic Parser", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Lesbian Greek Morphosyntactic Parser

        This tool uses custom Stanza models trained specifically for the **Lesbian dialect of Greek**
        (spoken on the island of Lesbos). The models provide:

        - **Tokenization**: Splits text into tokens
        - **POS Tagging**: Part-of-speech classification
        - **Lemmatization**: Base form identification
        - **Dependency Parsing**: Syntactic relationship analysis
        - **CoNLL-U Output**: Standard linguistic annotation format

        ## About the Models

        These models were trained on a curated treebank of 540 sentences from both oral and written
        sources collected from various villages of Lesbos, including Agra, Chidira, Eressos,
        Pterounta, Mesotopos, and Parakoila.

        **Citation**: Bompolas, S., Markantonatou, S., Ralli, A., & Anastasopoulos, A. (2025).
        Crossing Dialectal Boundaries: Building a Treebank for the Dialect of Lesbos through
        Knowledge Transfer from Standard Modern Greek.

        Enter your Lesbian Greek text below to get started!
        """)

        # Pick the status banner first, then render it with a single call.
        if not model_loaded:
            status_banner = f"""
            ❌ **Model Loading Error**: {status_message}

            The models could not be loaded. This may be due to:
            - Network issues downloading the models
            - Missing dependencies (transformers library)
            - Insufficient memory or storage
            - Model compatibility issues

            Please try refreshing the page or contact the developers.
            """
        elif "default Greek" in status_message:
            status_banner = f"""
            ⚠️ **Model Status**: {status_message}

            The custom Lesbian Greek models could not be loaded, so default Greek models are being used.
            Results may not be optimized for the Lesbian dialect.
            """
        else:
            status_banner = f"""
            ✅ **Model Status**: {status_message}

            Custom Lesbian Greek models loaded successfully!
            """
        gr.Markdown(status_banner)

        # Input area: locked and emptied when no model is available.
        if model_loaded:
            input_hint = "Εισάγετε το κείμενο στη Λεσβιακή διάλεκτο..."
            input_default = "Τα παιδιά πάντ στο κήπ."
        else:
            input_hint = "Models not loaded - please refresh page"
            input_default = ""

        with gr.Row():
            with gr.Column():
                input_box = gr.Textbox(
                    label="Lesbian Greek Text Input",
                    placeholder=input_hint,
                    lines=4,
                    value=input_default,
                    interactive=model_loaded
                )

                run_button = gr.Button(
                    "Parse Lesbian Greek Text",
                    variant="primary",
                    size="lg",
                    interactive=model_loaded
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Interactive Dependency Tree")
                tree_html = gr.HTML(
                    label="Visual Dependency Tree",
                    value="<p>Enter text and click parse to see the dependency tree visualization</p>"
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### CoNLL-U Output")
                conllu_box = gr.Textbox(
                    label="CoNLL-U Format",
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    info="Raw CoNLL-U format output optimized for Lesbian Greek dialect"
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Parsed Data Table")
                token_table = gr.Dataframe(
                    label="Token Analysis",
                    interactive=False,
                    wrap=True
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Text-based Dependency Structure")
                relations_box = gr.Textbox(
                    label="Dependency Relationships",
                    lines=8,
                    max_lines=15,
                    show_copy_button=True,
                    info="Text-based visualization of syntactic dependencies"
                )

        if model_loaded:
            # Same component order as the tuple returned by process_text.
            result_targets = [conllu_box, token_table, relations_box, tree_html]

            # The button click and Enter in the text box run the same callback.
            for bind in (run_button.click, input_box.submit):
                bind(
                    fn=process_text,
                    inputs=[input_box],
                    outputs=result_targets
                )

            # Ready-made sample sentences.
            gr.Markdown("### Example Lesbian Greek Texts")
            sample_texts = [
                ["Τα παιδιά πάντ στο κήπ."],
                ["Η γάτα κάθεται στο τραπέζ."],
                ["Ο ήλιος λάμπει στον ουρανό."],
                ["Η θάλασσα είναι γαλάζια και όμορφη."],
            ]

            gr.Examples(
                examples=sample_texts,
                inputs=[input_box],
                outputs=result_targets,
                fn=process_text,
                cache_examples=False
            )

        gr.Markdown("""
        ### Visualization Legend

        The **Interactive Dependency Tree** shows:
        - **Words** in bold at the bottom with their position numbers
        - **POS tags** in gray below each word
        - **Dependency arcs** as curved lines with arrows pointing to heads
        - **Dependency relations** labeled on the arcs
        - **Color coding** for different dependency types:
          - Red: ROOT relations
          - Blue: Subject relations (nsubj)
          - Green: Object relations (obj)
          - Orange: Determiners (det)
          - Purple: Adjective modifiers (amod)
          - And more...

        ### About CoNLL-U Format

        The CoNLL-U format includes these fields for each token:
        - **ID**: Token index
        - **FORM**: Word form or punctuation symbol
        - **LEMMA**: Lemma or stem of word form
        - **UPOS**: Universal part-of-speech tag
        - **XPOS**: Language-specific part-of-speech tag
        - **FEATS**: Morphological features
        - **HEAD**: Head of the current word
        - **DEPREL**: Dependency relation to the head
        - **DEPS**: Enhanced dependency graph
        - **MISC**: Miscellaneous annotations

        ### Resources
        - [Lesbian Greek Models on Hugging Face](https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model)
        - [UD_Greek-Lesbian Treebank](https://github.com/UniversalDependencies/UD_Greek-Lesbian)
        - [Stanza Documentation](https://stanfordnlp.github.io/stanza/)
        """)

    return demo
|
| 918 |
+
|
| 919 |
+
# Script entry point: build the interface first, then start serving it.
if __name__ == "__main__":
    print("Creating Gradio app...")
    app = create_gradio_app()

    print("Launching app...")
    app.launch()
|