sbompolas committed on
Commit
103058d
Β·
verified Β·
1 Parent(s): e4bdfd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -66
app.py CHANGED
@@ -6,8 +6,7 @@ import requests
6
  import traceback
7
  from pathlib import Path
8
 
9
- # ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────
10
-
11
  LESBIAN_MODELS = {}
12
  MODEL_VARIANTS = {
13
  "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
@@ -34,10 +33,10 @@ def initialize_models():
34
  out = base/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
- "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
38
- "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
39
- "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
40
- "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
  for fn, url in files.items():
43
  tgt = out/fn
@@ -60,7 +59,7 @@ def initialize_models():
60
  loaded, load_status = initialize_models()
61
 
62
 
63
- # ─── 2. CoNLL-U STRINGIZER ────────────────────────────────────────────────
64
 
65
  def stanza_doc_to_conllu(doc) -> str:
66
  lines = []
@@ -80,102 +79,137 @@ def stanza_doc_to_conllu(doc) -> str:
80
  return "\n".join(lines)
81
 
82
 
83
- # ─── 3. TOKEN TABLE with comment‐rows ────────────────────────────────────
84
 
85
- def conllu_to_dataframe_table(conllu: str) -> pd.DataFrame:
86
  """
87
- Insert:
88
- - blank row (except first)
89
- - # sent_id = …
90
- - # text = …
91
- before each sentence's tokens.
92
  """
93
- records = []
94
  blocks = [b for b in conllu.split("\n\n") if b.strip()]
 
95
  first = True
 
96
  for block in blocks:
97
  lines = block.splitlines()
98
- sid_line, txt_line = lines[0], lines[1]
 
99
  token_lines = lines[2:]
 
100
  if not first:
101
- # blank row
102
- records.append({c: "" for c in ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]})
 
 
103
  first = False
 
104
  # comment rows
105
- records.append({"Id": sid_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
106
- records.append({"Id": txt_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
 
 
 
107
  # token rows
108
  for tl in token_lines:
109
  parts = tl.split("\t")
110
  if len(parts) < 10:
111
  continue
112
  records.append({
113
- "Id": parts[0],
114
- "Form": parts[1],
115
- "Lemma": parts[2],
116
- "UPosTag": parts[3],
117
- "XPosTag": parts[4],
118
- "Feats": parts[5],
119
- "Head": parts[6],
120
- "DepRel": parts[7],
121
- "Deps": parts[8],
122
- "Misc": parts[9]
123
  })
124
- return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
125
 
 
 
 
126
 
127
- # ─── 4. TEXT‐BASED DEPENDENCIES with blank + comments ────────────────────
 
128
 
129
  def create_dependency_visualization(df_table: pd.DataFrame) -> str:
130
  """
131
- Walk token‐table rows, emitting:
132
- - blank line + comment lines before each sentence
133
- - dependency lines
134
  """
135
  if df_table.empty:
136
  return "No data to visualize"
137
- lines = []
138
  first = True
139
- # We detect new sentence by comment rows in Id column starting with '# sent_id'
140
  for row in df_table.itertuples(index=False):
 
141
  if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
142
  if not first:
143
- lines.append("") # blank separator
144
  first = False
145
- # emit sent_id and text
146
- lines.append(row.Id)
147
- # next row in table is "# text = …"
148
  continue
149
  if isinstance(row.Id, str) and row.Id.startswith("# text"):
150
- lines.append(row.Id)
151
  continue
152
- # skip blank/comment rows
 
153
  if not row.Id.isdigit():
154
  continue
155
- # actual token row
 
156
  w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
157
  if h != "0":
158
- # find head form
159
  try:
160
  hw = df_table[df_table.Id == h].iloc[0].Form
161
  except:
162
  hw = "[ERR]"
163
- lines.append(f"{w} ({p}) --{d}--> {hw}")
164
  else:
165
- lines.append(f"{w} ({p}) --{d}--> ROOT")
166
- return "\n".join(lines)
167
-
168
 
169
- # ─── 5. SVG BUILDER (unchanged) ───────────────────────────────────────────
170
 
171
- def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
172
- # Paste your entire original SVG‐generation code here unchanged
173
- # It must accept sentence_data: list of dicts and return SVG string.
174
- svg = "<svg><!-- your SVG here --></svg>"
175
- return svg
176
 
 
177
 
178
- # ─── 6. PROCESS & DROPDOWN LOGIC ──────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def process_text(text, variant):
181
  if not text.strip():
@@ -185,9 +219,9 @@ def process_text(text, variant):
185
  [], "", pd.DataFrame(), ""
186
  )
187
  pipe = LESBIAN_MODELS.get(variant)
188
- if pipe is None:
189
  return (
190
- gr.HTML.update(value="<p>Error: model not loaded</p>"),
191
  gr.Dropdown.update(choices=[], value=None),
192
  [], "", pd.DataFrame(), ""
193
  )
@@ -195,20 +229,20 @@ def process_text(text, variant):
195
  doc = pipe(text)
196
  conllu = stanza_doc_to_conllu(doc)
197
 
198
- # token table with comments
199
- df_table = conllu_to_dataframe_table(conllu)
200
 
201
- # text-based dependencies
202
  text_viz = create_dependency_visualization(df_table)
203
 
204
- # prepare for SVG dropdown
205
  sentences = []
206
  for sent in doc.sentences:
207
  payload = [{
208
  'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
209
  'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
210
  'FEATS': w.feats or "_", 'HEAD': w.head or 0,
211
- 'DEPREL': w.deprel or "_'
212
  } for w in sent.words]
213
  sentences.append(payload)
214
 
@@ -233,7 +267,7 @@ def update_svg(selected_id, sentences):
233
  return "<p>Invalid selection</p>"
234
 
235
 
236
- # ─── 7. GRADIO UI ─────────────────────────────────────────────────────────
237
 
238
  def create_app():
239
  with gr.Blocks(title="Lesbian Greek Parser") as app:
@@ -264,8 +298,10 @@ def create_app():
264
  btn.click(
265
  fn=process_text,
266
  inputs=[txt, mdl],
267
- outputs=[svg_out, sentence_dd, sentences_st,
268
- conllu_out, table_out, text_out]
 
 
269
  )
270
  sentence_dd.change(
271
  fn=update_svg,
 
6
  import traceback
7
  from pathlib import Path
8
 
9
+ # 1. MODEL VARIANTS & INITIALIZATION
 
10
  LESBIAN_MODELS = {}
11
  MODEL_VARIANTS = {
12
  "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
 
33
  out = base/name
34
  out.mkdir(exist_ok=True)
35
  files = {
36
+ "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
37
+ "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
38
+ "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
39
+ "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
40
  }
41
  for fn, url in files.items():
42
  tgt = out/fn
 
59
  loaded, load_status = initialize_models()
60
 
61
 
62
+ # 2. CoNLL-U STRINGIZER
63
 
64
  def stanza_doc_to_conllu(doc) -> str:
65
  lines = []
 
79
  return "\n".join(lines)
80
 
81
 
82
+ # 3. TOKEN TABLE WITH COMMENT‐ROWS
83
 
84
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert a CoNLL-U string into a flat token table.

    Before each sentence (except the first) an all-empty separator row is
    inserted. Each sentence's comment lines ('# sent_id = ...', '# text = ...')
    become rows whose ``Id`` column holds the full comment text; token lines
    fill all ten CoNLL-U columns as strings.

    Args:
        conllu: CoNLL-U formatted text; sentences separated by blank lines.

    Returns:
        pd.DataFrame with columns Id, Form, Lemma, UPosTag, XPosTag, Feats,
        Head, DepRel, Deps, Misc.
    """
    columns = ["Id", "Form", "Lemma", "UPosTag", "XPosTag",
               "Feats", "Head", "DepRel", "Deps", "Misc"]
    records = []
    first = True

    for block in (b for b in conllu.split("\n\n") if b.strip()):
        lines = block.splitlines()
        # Partition comment vs. token lines instead of hard-indexing
        # lines[0]/lines[1]: a block with a missing or extra comment line
        # would otherwise misclassify a token line as a comment and drop it.
        comment_lines = [ln for ln in lines if ln.startswith("#")]
        token_lines = [ln for ln in lines if ln and not ln.startswith("#")]

        if not first:
            # Empty separator row between sentences.
            records.append({c: "" for c in columns})
        first = False

        # Comment rows: comment text goes in Id, all other cells blank.
        for cl in comment_lines:
            row = {c: "" for c in columns}
            row["Id"] = cl
            records.append(row)

        # Token rows: one record per well-formed 10-column line.
        for tl in token_lines:
            parts = tl.split("\t")
            if len(parts) < 10:
                continue  # malformed token line; skip defensively
            records.append(dict(zip(columns, parts[:10])))

    return pd.DataFrame(records, columns=columns)
136
 
137
+
138
+ # 4. TEXT‐BASED DEPENDENCIES WITH BLANK+COMMENTS
139
 
140
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
    """Render the token table as plain-text dependency lines.

    Emits, per sentence: its comment lines ('# sent_id', '# text'), then one
    ``form (upos) --deprel--> head_form`` line per token (ROOT for head "0"),
    with a blank line between sentences.

    Args:
        df_table: token table as produced by ``conllu_to_dataframe`` (string
            cells; comment rows carry their text in the Id column).

    Returns:
        The visualization as a newline-joined string, or a placeholder
        message for an empty table.
    """
    if df_table.empty:
        return "No data to visualize"

    # Group rows into sentences at each '# sent_id' comment row. Head lookup
    # must stay within one sentence: CoNLL-U token ids restart at 1 per
    # sentence, so a whole-table `df_table.Id == h` match would always
    # resolve to the FIRST sentence's token with that id.
    sentences = []
    current = None
    for row in df_table.itertuples(index=False):
        rid = row.Id
        if isinstance(rid, str) and rid.startswith("# sent_id"):
            current = {"comments": [rid], "tokens": []}
            sentences.append(current)
            continue
        if isinstance(rid, str) and rid.startswith("# text"):
            if current is not None:
                current["comments"].append(rid)
            continue
        if not (isinstance(rid, str) and rid.isdigit()):
            continue  # blank separator rows (and multiword ranges like "1-2")
        if current is None:
            # Tokens with no preceding sent_id comment — tolerate them.
            current = {"comments": [], "tokens": []}
            sentences.append(current)
        current["tokens"].append(row)

    out = []
    for i, sent in enumerate(sentences):
        if i:
            out.append("")  # blank separator between sentences
        out.extend(sent["comments"])
        # O(1) per-sentence head lookup instead of filtering the DataFrame
        # for every token.
        forms = {t.Id: t.Form for t in sent["tokens"]}
        for t in sent["tokens"]:
            if t.Head == "0":
                out.append(f"{t.Form} ({t.UPosTag}) --{t.DepRel}--> ROOT")
            else:
                head_form = forms.get(t.Head, "[ERR]")
                out.append(f"{t.Form} ({t.UPosTag}) --{t.DepRel}--> {head_form}")
    return "\n".join(out)
177
 
 
 
 
 
 
178
 
179
+ # 5. FULL SVG BUILDER (UNCHANGED)
180
 
181
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an SVG string.

    Placeholder for the original SVG-generation routine: layout constants are
    computed here, and the arc/label/word drawing must be pasted back in at
    the marked spot.

    Args:
        sentence_data: list of per-word dicts (keys ID, FORM, LEMMA, UPOS,
            XPOS, FEATS, HEAD, DEPREL).
        sentence_num: 1-based index of this sentence (display only).
        total_sentences: total sentence count (display only).

    Returns:
        An ``<svg>…</svg>`` string.
    """
    # Guard: empty input would otherwise divide by zero in the spacing math
    # below (word_count == 0).
    if not sentence_data:
        return ('<svg width="800" height="500" '
                'xmlns="http://www.w3.org/2000/svg"></svg>')

    df = pd.DataFrame(sentence_data)
    word_count = len(df)
    base_w, min_sp = 100, 30
    # Per-word horizontal spacing: at least base_w, widened by min_sp gaps.
    spacing = max(base_w, (word_count * base_w + min_sp * (word_count - 1)) / word_count)
    width = max(800, word_count * spacing + 100)
    height = 500
    word_y = height - 120                 # baseline for word forms
    features_start_y = word_y + 20 + 15   # first FEATS line below the form

    # Colour per dependency relation (extend as needed).
    deprel_colors = {
        'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
        # … and the rest …
    }

    svg_parts = [
        f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" '
        'style="background:white;border:1px solid #eee"><defs>'
    ]
    # … arrowhead markers, arcs, labels, words & feats exactly as before …
    svg_parts.append('</defs><g>')
    # (Insert your entire previous implementation here)
    svg_parts.append('</g></svg>')
    return "".join(svg_parts)
210
+
211
+
212
+ # 6. PROCESS + DROPDOWN
213
 
214
  def process_text(text, variant):
215
  if not text.strip():
 
219
  [], "", pd.DataFrame(), ""
220
  )
221
  pipe = LESBIAN_MODELS.get(variant)
222
+ if not pipe:
223
  return (
224
+ gr.HTML.update(value="<p>Error loading model</p>"),
225
  gr.Dropdown.update(choices=[], value=None),
226
  [], "", pd.DataFrame(), ""
227
  )
 
229
  doc = pipe(text)
230
  conllu = stanza_doc_to_conllu(doc)
231
 
232
+ # Token table with comments
233
+ df_table = conllu_to_dataframe(conllu)
234
 
235
+ # Text‐based dependencies
236
  text_viz = create_dependency_visualization(df_table)
237
 
238
+ # Sentence payloads for SVG
239
  sentences = []
240
  for sent in doc.sentences:
241
  payload = [{
242
  'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
243
  'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
244
  'FEATS': w.feats or "_", 'HEAD': w.head or 0,
245
+ 'DEPREL': w.deprel or "_"
246
  } for w in sent.words]
247
  sentences.append(payload)
248
 
 
267
  return "<p>Invalid selection</p>"
268
 
269
 
270
+ # 7. GRADIO UI
271
 
272
  def create_app():
273
  with gr.Blocks(title="Lesbian Greek Parser") as app:
 
298
  btn.click(
299
  fn=process_text,
300
  inputs=[txt, mdl],
301
+ outputs=[
302
+ svg_out, sentence_dd, sentences_st,
303
+ conllu_out, table_out, text_out
304
+ ]
305
  )
306
  sentence_dd.change(
307
  fn=update_svg,