sbompolas commited on
Commit
fb5b190
·
verified ·
1 Parent(s): fe64a8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -217
app.py CHANGED
@@ -1,321 +1,237 @@
1
  import gradio as gr
2
  import stanza
3
  import pandas as pd
4
- import sys
5
  import traceback
6
- import os
7
- import tempfile
8
  import requests
9
  from pathlib import Path
10
  import json
 
11
 
12
  # Global variables to store the pipelines
13
  LESBIAN_MODELS = {}
14
  MODEL_VARIANTS = {
15
- "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
16
- "Lesbian-synthetic-data": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model"
17
  }
18
 
19
-
20
  def download_model_file(url, filename):
21
  """Download a model file from Hugging Face"""
22
  try:
23
- print(f"Downloading {filename}...")
24
  response = requests.get(url, stream=True)
25
  response.raise_for_status()
26
  with open(filename, 'wb') as f:
27
  for chunk in response.iter_content(chunk_size=8192):
28
  f.write(chunk)
29
- print(f"Successfully downloaded {filename}")
30
  return True
31
  except Exception as e:
32
- print(f"Failed to download {filename}: {e}")
33
  return False
34
 
35
-
36
  def initialize_lesbian_greek_model():
37
- """Download and initialize both Lesbian-only and Lesbian-synthetic-data models"""
38
  try:
39
- print("Initializing both Lesbian-only and Lesbian-synthetic-data pipelines...")
40
  base_dir = Path("./models")
41
  base_dir.mkdir(exist_ok=True)
42
  for variant_name, repo in MODEL_VARIANTS.items():
43
- print(f"\n→ Setting up '{variant_name}' from repo {repo}...")
44
- out_dir = base_dir / variant_name
45
  out_dir.mkdir(parents=True, exist_ok=True)
46
 
47
- # Model files to fetch
48
- model_files = {
49
- "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
50
- "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
51
- "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
52
- "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
53
  }
54
-
55
- # Download files if missing
56
- for fname, url in model_files.items():
57
- tgt = out_dir / fname
58
  if not tgt.exists():
59
  if not download_model_file(url, str(tgt)):
60
- return False, f"Failed to download {fname} for {variant_name}"
61
 
62
- # Build the stanza pipeline directly
63
  config = {
64
  'processors': 'tokenize,pos,lemma,depparse',
65
  'lang': 'el',
66
  'use_gpu': False,
67
  'verbose': False,
68
- 'tokenize_model_path': str(out_dir / "tokenizer.pt"),
69
- 'pos_model_path': str(out_dir / "pos.pt"),
70
- 'lemma_model_path': str(out_dir / "lemmatizer.pt"),
71
- 'depparse_model_path': str(out_dir / "depparse.pt")
72
  }
73
-
74
  try:
75
- MODEL = stanza.Pipeline(**config)
76
- LESBIAN_MODELS[variant_name] = MODEL
77
- print(f"Loaded pipeline for {variant_name}")
78
  except Exception as e:
79
- print(f"⚠️ Could not load {variant_name}: {e}")
80
  return False, f"Pipeline init error for {variant_name}: {e}"
81
 
82
- return True, "Both variants loaded successfully"
83
  except Exception as e:
84
- print(f"Initialization failure: {e}")
85
  traceback.print_exc()
86
  return False, str(e)
87
 
88
-
89
  def stanza_doc_to_conllu(doc) -> str:
90
  """Convert Stanza Document to CoNLL-U format"""
91
- conllu_lines = []
92
- for sent_idx, sentence in enumerate(doc.sentences):
93
- conllu_lines.append(f"# sent_id = {sent_idx+1}")
94
- conllu_lines.append(f"# text = {sentence.text}")
95
- for word in sentence.words:
96
  fields = [
97
- str(word.id),
98
- word.text,
99
- word.lemma or "_",
100
- word.upos or "_",
101
- word.xpos or "_",
102
- word.feats or "_",
103
- str(word.head) if word.head is not None else "0",
104
- word.deprel or "_",
105
  "_",
106
  "_"
107
  ]
108
- conllu_lines.append("\t".join(fields))
109
- conllu_lines.append("")
110
- return "\n".join(conllu_lines)
111
-
112
-
113
- def parse_and_conllu(text: str, variant: str) -> str:
114
- """Parse text with chosen variant and return CoNLL-U"""
115
- pipeline = LESBIAN_MODELS.get(variant)
116
- if not pipeline:
117
- return f"Error: model {variant} not loaded."
118
-
119
- if not text.strip():
120
- return "Error: Please enter some text to parse."
121
-
122
- try:
123
- doc = pipeline(text)
124
- return stanza_doc_to_conllu(doc)
125
- except Exception as e:
126
- return f"Error processing text: {e}"
127
-
128
 
129
  def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
130
  """Convert CoNLL-U text to pandas DataFrame"""
131
- if conllu_text.startswith("Error"):
132
  return pd.DataFrame()
133
-
134
- data = []
135
  for line in conllu_text.splitlines():
136
  if not line or line.startswith("#"):
137
  continue
138
  parts = line.split("\t")
139
  if len(parts) >= 10:
140
- data.append({
141
  'ID': parts[0], 'FORM': parts[1], 'LEMMA': parts[2],
142
  'UPOS': parts[3], 'XPOS': parts[4], 'FEATS': parts[5],
143
  'HEAD': parts[6], 'DEPREL': parts[7], 'DEPS': parts[8], 'MISC': parts[9]
144
  })
145
- return pd.DataFrame(data)
146
-
147
 
148
  def create_dependency_visualization(df: pd.DataFrame) -> str:
149
  """Simple text-based dependency display"""
150
  if df.empty:
151
  return "No data to visualize"
152
-
153
- lines = ["Dependency Parse Visualization:", "-"*40]
154
- for _, row in df.iterrows():
155
- word, pos, deprel, head = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
156
- if head != '0':
157
  try:
158
- head_idx = int(head)-1
159
- head_word = df.iloc[head_idx]['FORM']
160
- lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
161
  except:
162
- lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
 
163
  else:
164
- lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
165
- return "\n".join(lines)
166
-
167
-
168
- def create_dependency_tree_svg(df: pd.DataFrame) -> str:
169
- """SVG-based dependency tree with simple slider navigation"""
170
- if df.empty:
171
- return "<p>No data to visualize</p>"
172
-
173
- # group into sentences
174
- sentences, current = [], []
175
- for idx, row in df.iterrows():
176
- wid = int(row['ID'])
177
- if wid == 1 and current:
178
- sentences.append(current)
179
- current = []
180
- current.append(row.to_dict())
181
- if current:
182
- sentences.append(current)
183
-
184
- # render each as an SVG slide
185
- slides = []
186
- for sent in sentences:
187
- svg = create_single_sentence_svg(sent)
188
- slides.append(svg)
189
-
190
- # wrap slides in divs with show/hide logic
191
- slide_divs = "\n".join(
192
- f'<div class="slide" style="display:{"block" if i==0 else "none"}">{svg}</div>'
193
- for i, svg in enumerate(slides)
194
- )
195
-
196
- return f"""
197
- <div id="slider">
198
- {slide_divs}
199
- <button id="prevBtn">← Prev</button>
200
- <button id="nextBtn">Next →</button>
201
- </div>
202
- <script>
203
- (function() {{
204
- const slides = document.querySelectorAll('#slider .slide');
205
- let idx = 0;
206
- document.getElementById('prevBtn').onclick = () => {{
207
- slides[idx].style.display = 'none';
208
- idx = (idx - 1 + slides.length) % slides.length;
209
- slides[idx].style.display = 'block';
210
- }};
211
- document.getElementById('nextBtn').onclick = () => {{
212
- slides[idx].style.display = 'none';
213
- idx = (idx + 1) % slides.length;
214
- slides[idx].style.display = 'block';
215
- }};
216
- }})();
217
- </script>
218
- """
219
-
220
 
221
  def create_single_sentence_svg(sentence_data):
222
- """Generate detailed SVG for one sentence (kept as in original code)"""
223
- # For brevity, I'm reusing your original single-sentence SVG builder.
224
- # Paste the full create_single_sentence_svg implementation here unchanged.
225
- # ...
226
- return "<svg><!-- your detailed SVG here --></svg>"
227
-
228
 
229
  def process_text(text, variant):
230
- """Main entry: returns four outputs for Gradio"""
231
  if not text.strip():
232
- empty_df = pd.DataFrame()
233
- return "Please enter some Lesbian Greek text to parse.", empty_df, "", "<p>No data</p>"
234
 
235
- conllu = parse_and_conllu(text, variant)
236
- if conllu.startswith("Error"):
237
- return conllu, pd.DataFrame(), "", "<p>Error</p>"
238
 
239
- df = conllu_to_dataframe(conllu)
240
- if df.empty:
241
- return conllu, df, "", "<p>No tokens found</p>"
 
242
 
 
 
243
  text_viz = create_dependency_visualization(df)
244
- svg_viz = create_dependency_tree_svg(df)
245
- return conllu, df, text_viz, svg_viz
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- # Initialize models at startup
249
- print("Initializing pipelines...")
250
- loaded, status_msg = initialize_lesbian_greek_model()
251
- print(f"Loaded={loaded}, Status={status_msg}")
 
 
 
 
 
252
 
 
 
 
253
 
254
  def create_gradio_app():
255
- with gr.Blocks(title="Lesbian Greek Morphosyntactic Parser", theme=gr.themes.Soft()) as app:
256
- gr.Markdown("""
257
- # Lesbian Greek Morphosyntactic Parser
258
-
259
- Pick a model variant, enter your text, and see tokenization, POS, lemmas, and dependencies.
260
- """)
261
 
262
- # show status
263
  if loaded:
264
- gr.Markdown(f"✅ Models loaded: {', '.join(MODEL_VARIANTS.keys())}")
265
  else:
266
- gr.Markdown(f"❌ Loading error: {status_msg}")
267
-
268
- with gr.Row():
269
- with gr.Column():
270
- text_input = gr.Textbox(
271
- label="Lesbian Greek Text Input",
272
- placeholder="Enter Lesbian Greek here...",
273
- lines=4
274
- )
275
- parse_button = gr.Button("Parse Text", variant="primary")
276
-
277
- with gr.Column():
278
- model_selector = gr.Radio(
279
- choices=list(MODEL_VARIANTS.keys()),
280
- value="Lesbian-only",
281
- label="Choose Variant"
282
- )
283
-
284
- with gr.Row():
285
- with gr.Column():
286
- gr.Markdown("### Dependency Tree")
287
- dependency_tree_viz = gr.HTML("<p>Parse to see tree</p>")
288
-
289
- with gr.Row():
290
- with gr.Column():
291
- gr.Markdown("### CoNLL-U Output")
292
- conllu_output = gr.Textbox(lines=10, show_copy_button=True)
293
 
294
  with gr.Row():
295
  with gr.Column():
296
- gr.Markdown("### Token Table")
297
- data_table = gr.Dataframe(interactive=False)
298
-
299
- with gr.Row():
300
  with gr.Column():
301
- gr.Markdown("### Text-based Dependencies")
302
- dependency_viz = gr.Textbox(lines=8, show_copy_button=True)
303
-
304
- if loaded:
305
- parse_button.click(
306
- fn=process_text,
307
- inputs=[text_input, model_selector],
308
- outputs=[conllu_output, data_table, dependency_viz, dependency_tree_viz]
309
- )
310
- text_input.submit(
311
- fn=process_text,
312
- inputs=[text_input, model_selector],
313
- outputs=[conllu_output, data_table, dependency_viz, dependency_tree_viz]
314
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  return app
317
 
318
-
319
  if __name__ == "__main__":
320
  app = create_gradio_app()
321
  app.launch()
 
1
  import gradio as gr
2
  import stanza
3
  import pandas as pd
 
4
  import traceback
 
 
5
  import requests
6
  from pathlib import Path
7
  import json
8
+ import os
9
 
10
# Global variables to store the pipelines.
# LESBIAN_MODELS maps a variant display name -> its loaded stanza.Pipeline;
# it is filled in by initialize_lesbian_greek_model() at startup.
LESBIAN_MODELS = {}
# Variant display name -> Hugging Face repo id the .pt model files live in.
MODEL_VARIANTS = {
    "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-synthetic-data": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model"
}
16
 
 
17
def download_model_file(url, filename):
    """Download a single model file from Hugging Face to *filename*.

    Streams the response in 8 KiB chunks so multi-hundred-MB .pt files are
    never held in memory at once.

    Args:
        url: direct ``resolve/main`` URL of the file on Hugging Face.
        filename: local path to write the file to.

    Returns:
        True on success, False on any failure (network error, HTTP error,
        or write error) — callers treat a False as "variant unavailable".
    """
    try:
        # timeout guards startup against a hung connection blocking forever;
        # without it requests.get can wait indefinitely.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        # include the target filename so failures are diagnosable
        # (the previous message printed a scrubbed "(unknown)" placeholder)
        print(f"Download failed {filename}: {e}")
        return False
29
 
 
30
def initialize_lesbian_greek_model():
    """Download and initialize both model variants.

    For each entry in MODEL_VARIANTS: ensure a local cache directory under
    ./models/<variant>, download the four processor .pt files if missing,
    then build a stanza Pipeline from those local paths and register it in
    the global LESBIAN_MODELS dict.

    Returns:
        (True, message) when every variant loaded,
        (False, message) on the first download or pipeline failure.
    """
    try:
        base_dir = Path("./models")
        base_dir.mkdir(exist_ok=True)
        for variant_name, repo in MODEL_VARIANTS.items():
            # one cache directory per variant, e.g. ./models/Lesbian-only
            out_dir = base_dir/variant_name
            out_dir.mkdir(parents=True, exist_ok=True)

            # four model files, one per stanza processor
            files = {
                "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
                "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
                "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
            }
            # download — files cached by a previous run are skipped
            for fn, url in files.items():
                tgt = out_dir/fn
                if not tgt.exists():
                    if not download_model_file(url, str(tgt)):
                        # abort on first failure; partial variants are not usable
                        return False, f"Failed download {fn} for {variant_name}"

            # build stanza pipeline from the explicit local model paths
            # (bypasses stanza's own model manager/download logic)
            config = {
                'processors': 'tokenize,pos,lemma,depparse',
                'lang': 'el',
                'use_gpu': False,
                'verbose': False,
                'tokenize_model_path': str(out_dir/"tokenizer.pt"),
                'pos_model_path': str(out_dir/"pos.pt"),
                'lemma_model_path': str(out_dir/"lemmatizer.pt"),
                'depparse_model_path': str(out_dir/"depparse.pt")
            }
            try:
                pipe = stanza.Pipeline(**config)
                # register the pipeline for lookup by process_text()
                LESBIAN_MODELS[variant_name] = pipe
                print(f"Loaded {variant_name}")
            except Exception as e:
                return False, f"Pipeline init error for {variant_name}: {e}"

        return True, "Models loaded"
    except Exception as e:
        # unexpected failure (e.g. filesystem): log traceback, report message
        traceback.print_exc()
        return False, str(e)
75
 
 
76
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a Stanza Document into CoNLL-U text.

    Each sentence is emitted as a ``# sent_id`` / ``# text`` comment pair
    followed by one 10-column tab-separated row per word; sentences are
    separated by a blank line. Missing annotations become ``_`` and a
    missing head becomes ``0`` (root convention).
    """
    out = []
    sid = 0
    for sentence in doc.sentences:
        sid += 1
        out.append(f"# sent_id = {sid}")
        out.append(f"# text = {sentence.text}")
        for word in sentence.words:
            # CoNLL-U encodes the root's head as 0
            head = "0" if word.head is None else str(word.head)
            row = "\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",   # DEPS: enhanced dependencies not produced
                "_",   # MISC
            ])
            out.append(row)
        out.append("")  # blank separator line after each sentence
    return "\n".join(out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
    """Tabulate CoNLL-U token lines into a pandas DataFrame.

    Comment lines (``#``), blank lines, and malformed rows with fewer than
    ten tab-separated fields are skipped. Error strings produced upstream
    (anything starting with "Error") and empty input yield an empty frame.
    """
    if not conllu_text or conllu_text.startswith("Error"):
        return pd.DataFrame()
    columns = ('ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
               'HEAD', 'DEPREL', 'DEPS', 'MISC')
    records = []
    for raw in conllu_text.splitlines():
        if not raw or raw.startswith("#"):
            continue
        cells = raw.split("\t")
        if len(cells) >= 10:
            # zip truncates any surplus cells beyond the ten named columns
            records.append(dict(zip(columns, cells)))
    return pd.DataFrame(records)
 
115
 
116
def create_dependency_visualization(df: pd.DataFrame) -> str:
    """Render a plain-text arc list: ``word (POS) --deprel--> head``.

    Args:
        df: token table as produced by conllu_to_dataframe (ID restarts at
            '1' for each sentence, HEAD is a 1-based index *within* the
            sentence, '0' for the root).

    Returns:
        One line per token under a fixed header, or "No data to visualize"
        for an empty frame.
    """
    if df.empty:
        return "No data to visualize"
    lines = ["Dependency Parse Visualization:", "-" * 40]
    sent_start = 0  # positional index of the current sentence's first token
    for pos in range(len(df)):
        row = df.iloc[pos]
        # BUG FIX: heads are sentence-local 1-based indices, but the old code
        # indexed the whole frame, so every sentence after the first resolved
        # its heads against sentence one. Track each sentence's start offset.
        if str(row['ID']) == '1':
            sent_start = pos
        word, upos, deprel, head = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
        if str(head) == '0':
            lines.append(f"{word} ({upos}) --{deprel}--> ROOT")
            continue
        try:
            head_word = df.iloc[sent_start + int(head) - 1]['FORM']
        except (ValueError, IndexError, KeyError):
            # malformed or out-of-range head — keep rendering the rest
            head_word = "[ERROR]"
        lines.append(f"{word} ({upos}) --{deprel}--> {head_word}")
    return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
def create_single_sentence_svg(sentence_data):
    """Detailed SVG builder unchanged—paste your original implementation here."""
    # NOTE(review): this is a placeholder stub — the real SVG renderer was
    # elided in this revision. Until it is restored, every sentence renders
    # as the comment-only <svg> below, so the tree panel shows nothing useful.
    # `sentence_data` is a list of per-word dicts (ID/FORM/LEMMA/UPOS/XPOS/
    # FEATS/HEAD/DEPREL) as built by process_text().
    # ... your create_single_sentence_svg code ...
    return "<svg><!-- your SVG here --></svg>"
 
 
 
137
 
138
def process_text(text, variant):
    """Parse *text* with the chosen model variant and build all UI outputs.

    Args:
        text: raw input text.
        variant: key into LESBIAN_MODELS / MODEL_VARIANTS.

    Returns six values, matching the Gradio outputs wired in
    create_gradio_app():
        CoNLL-U string, token DataFrame, text dependency visualization,
        dropdown update (sentence choices), per-sentence word dicts (state),
        and the SVG for the first sentence.
    """
    # BUG FIX: a bare list returned to a gr.Dropdown output sets its *value*,
    # not its option list, so the sentence selector never got populated.
    # gr.update(choices=...) is required to (re)populate the options.
    empty_dd = gr.update(choices=[], value=None)

    if not text.strip():
        return "Please enter text.", pd.DataFrame(), "", empty_dd, [], "<p>No data</p>"

    pipe = LESBIAN_MODELS.get(variant)
    if not pipe:
        return f"Error: model {variant} not loaded.", pd.DataFrame(), "", empty_dd, [], "<p>Error</p>"

    try:
        doc = pipe(text)
    except Exception as e:
        return f"Error parsing: {e}", pd.DataFrame(), "", empty_dd, [], "<p>Error</p>"

    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)
    text_viz = create_dependency_visualization(df)

    # build per-sentence word data for the SVG viewer (kept in gr.State)
    sentences = []
    for sent in doc.sentences:
        sentences.append([
            {
                'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
                'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
                'FEATS': w.feats or "_", 'HEAD': w.head or 0, 'DEPREL': w.deprel or "_"
            }
            for w in sent.words
        ])

    # dropdown choices ("1", "2", ...) and the initial SVG (first sentence)
    choices = [str(i + 1) for i in range(len(sentences))]
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No sentences</p>"
    dd = gr.update(choices=choices, value=choices[0] if choices else None)

    return conllu, df, text_viz, dd, sentences, init_svg
174
 
175
def update_svg(sel, sentences):
    """Return the SVG for the sentence selected in the dropdown.

    Args:
        sel: dropdown string, 1-based sentence number (may be None before
            any parse has populated the dropdown).
        sentences: gr.State list of per-sentence word dicts.

    Returns:
        SVG markup for the chosen sentence, or an error paragraph for any
        invalid selection.
    """
    # Only guard the int() conversion: the old bare `except: pass` also
    # swallowed every bug inside the SVG renderer, making them invisible.
    try:
        idx = int(sel) - 1
    except (TypeError, ValueError):
        return "<p>Invalid selection</p>"
    if 0 <= idx < len(sentences):
        return create_single_sentence_svg(sentences[idx])
    return "<p>Invalid selection</p>"
184
 
185
# Initialize both pipelines once at import time so the app can report
# status immediately; `loaded`/`status` are read by create_gradio_app()
# to decide whether to wire the parse events or show the error banner.
loaded, status = initialize_lesbian_greek_model()
print(f"Models loaded={loaded}, status={status}")
188
 
189
def create_gradio_app():
    """Build and return the Gradio Blocks UI.

    Reads the module globals `loaded`/`status` (set at import time) to show
    a status banner. Component creation order below determines the layout.
    """
    with gr.Blocks(title="Lesbian Greek Parser") as app:
        gr.Markdown("# Lesbian Greek Morphosyntactic Parser")

        # status banner reflecting startup model loading
        if loaded:
            gr.Markdown(f"✅ Loaded variants: {', '.join(MODEL_VARIANTS.keys())}")
        else:
            gr.Markdown(f"❌ Loading error: {status}")

        # input row: text + parse button on the left, variant picker on the right
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Text", lines=4,
                                        placeholder="Εισάγετε κείμενο...")
                parse_btn = gr.Button("Parse")
            with gr.Column():
                model_sel = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
                                     value="Lesbian-only", label="Model Variant")

        # sentence selector & state (state holds per-sentence word dicts)
        sentence_dropdown = gr.Dropdown(label="Sentence", choices=[])
        sentences_state = gr.State([])

        # outputs
        conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
        table_out = gr.Dataframe(label="Tokens")
        text_viz_out = gr.Textbox(label="Dependencies", lines=8, show_copy_button=True)
        svg_out = gr.HTML("<p>No data</p>")

        # wire up parse event — output order must match process_text's return
        parse_btn.click(
            fn=process_text,
            inputs=[text_input, model_sel],
            outputs=[
                conllu_out, table_out, text_viz_out,
                sentence_dropdown, sentences_state, svg_out
            ]
        )
        # on sentence change, re-render only the SVG panel
        sentence_dropdown.change(
            fn=update_svg,
            inputs=[sentence_dropdown, sentences_state],
            outputs=svg_out
        )

    return app
234
 
 
235
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()