sbompolas committed on
Commit
e4bdfd5
·
verified ·
1 Parent(s): f73ba75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -105
app.py CHANGED
@@ -6,7 +6,7 @@ import requests
6
  import traceback
7
  from pathlib import Path
8
 
9
- # 1. MODEL VARIANTS & INITIALIZATION
10
 
11
  LESBIAN_MODELS = {}
12
  MODEL_VARIANTS = {
@@ -28,18 +28,18 @@ def download_model_file(url, filename):
28
 
29
  def initialize_models():
30
  try:
31
- models_dir = Path("./models")
32
- models_dir.mkdir(exist_ok=True)
33
  for name, repo in MODEL_VARIANTS.items():
34
- out = models_dir/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
- "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
38
- "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
39
- "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
40
- "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
- for fn,url in files.items():
43
  tgt = out/fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
@@ -60,11 +60,11 @@ def initialize_models():
60
  loaded, load_status = initialize_models()
61
 
62
 
63
- # 2. CONLL-U TO COARSE OUTPUT
64
 
65
  def stanza_doc_to_conllu(doc) -> str:
66
  lines = []
67
- for sid, sent in enumerate(doc.sentences,1):
68
  lines.append(f"# sent_id = {sid}")
69
  lines.append(f"# text = {sent.text}")
70
  for w in sent.words:
@@ -73,126 +73,109 @@ def stanza_doc_to_conllu(doc) -> str:
73
  w.lemma or "_", w.upos or "_",
74
  w.xpos or "_", w.feats or "_",
75
  str(w.head) if w.head is not None else "0",
76
- w.deprel or "_","_","_"
77
  ]
78
  lines.append("\t".join(fields))
79
  lines.append("") # blank line after each sentence
80
  return "\n".join(lines)
81
 
82
 
83
- # 3. TOKEN TABLE: insert comment‐rows + empty‐rows per sentence
84
 
85
- def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
86
  """
87
- Splits conllu into sentence‐blocks, then for each block:
88
- - (if not first) insert a blank row
89
- - insert '# sent_id = …' row
90
- - insert '# text = …' row
91
- - then all token rows
92
  """
93
- blocks = [b for b in conllu.split("\n\n") if b.strip()]
94
  records = []
 
95
  first = True
96
-
97
  for block in blocks:
98
  lines = block.splitlines()
99
- sid_line = lines[0]
100
- text_line = lines[1]
101
  token_lines = lines[2:]
102
-
103
  if not first:
104
  # blank row
105
- records.append({c:"" for c in
106
- ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]
107
- })
108
  first = False
109
-
110
  # comment rows
111
- records.append({"Id": sid_line, "Form":"", "Lemma":"", "UPosTag":"",
112
- "XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
113
- records.append({"Id": text_line, "Form":"", "Lemma":"", "UPosTag":"",
114
- "XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
115
-
116
  # token rows
117
  for tl in token_lines:
118
  parts = tl.split("\t")
119
  if len(parts) < 10:
120
  continue
121
  records.append({
122
- "Id": parts[0],
123
- "Form": parts[1],
124
- "Lemma": parts[2],
125
  "UPosTag": parts[3],
126
  "XPosTag": parts[4],
127
- "Feats": parts[5],
128
- "Head": parts[6],
129
- "DepRel": parts[7],
130
- "Deps": parts[8],
131
- "Misc": parts[9]
132
  })
 
133
 
134
- return pd.DataFrame(records, columns=[
135
- "Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"
136
- ])
137
 
 
138
 
139
- # 4. TEXT-BASED DEPENDENCIES: blank + comment per sentence
140
-
141
- def create_dependency_visualization(conllu: str) -> str:
142
  """
143
- Splits by blank‐line into sentence‐blocks, then for each:
144
- - blank line (if not first)
145
- - comment lines
146
- - parse lines
147
  """
148
- blocks = [b for b in conllu.split("\n\n") if b.strip()]
149
- out = []
 
150
  first = True
151
-
152
- for block in blocks:
153
- lines = block.splitlines()
154
- sid_line, txt_line = lines[0], lines[1]
155
- token_lines = lines[2:]
156
-
157
- if not first:
158
- out.append("") # blank line separator
159
- first = False
160
-
161
- out.append(sid_line)
162
- out.append(txt_line)
163
-
164
- # build head→form map for this sentence
165
- id2form = {}
166
- for tl in token_lines:
167
- p = tl.split("\t")
168
- if len(p)>=2:
169
- id2form[p[0]] = p[1]
170
-
171
- for tl in token_lines:
172
- p = tl.split("\t")
173
- if len(p) < 8:
174
- continue
175
- w, upos, head, deprel = p[1], p[3], p[6], p[7]
176
- if head != "0" and head in id2form:
177
- out.append(f"{w} ({upos}) --{deprel}--> {id2form[head]}")
178
- else:
179
- out.append(f"{w} ({upos}) --{deprel}--> ROOT")
180
-
181
- return "\n".join(out)
182
 
183
 
184
- # 5. SVG BUILDER (unchanged)
185
 
186
  def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
187
- """
188
- Paste your entire original SVG‐generation code here unchanged.
189
- It takes sentence_data: List[dict] and returns an <svg>…</svg> string.
190
- """
191
- # … your SVG builder from the attached file …
192
- return "<svg><!-- your SVG here --></svg>"
193
 
194
 
195
- # 6. PROCESS + DROPDOWN
196
 
197
  def process_text(text, variant):
198
  if not text.strip():
@@ -209,24 +192,23 @@ def process_text(text, variant):
209
  [], "", pd.DataFrame(), ""
210
  )
211
 
212
- # parse
213
  doc = pipe(text)
214
  conllu = stanza_doc_to_conllu(doc)
215
 
216
- # build token table
217
- df = conllu_to_dataframe(conllu)
218
 
219
- # text-based deps
220
- text_v = create_dependency_visualization(conllu)
221
 
222
- # prepare sentence payloads for SVG
223
  sentences = []
224
  for sent in doc.sentences:
225
  payload = [{
226
  'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
227
  'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
228
  'FEATS': w.feats or "_", 'HEAD': w.head or 0,
229
- 'DEPREL': w.deprel or "_"
230
  } for w in sent.words]
231
  sentences.append(payload)
232
 
@@ -239,24 +221,23 @@ def process_text(text, variant):
239
  dd_upd,
240
  sentences,
241
  conllu,
242
- df,
243
- text_v
244
  )
245
 
246
  def update_svg(selected_id, sentences):
247
  try:
248
- idx = int(selected_id)-1
249
  return create_single_sentence_svg(sentences[idx])
250
  except:
251
  return "<p>Invalid selection</p>"
252
 
253
 
254
- # 7. GRADIO UI
255
 
256
  def create_app():
257
  with gr.Blocks(title="Lesbian Greek Parser") as app:
258
  gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
259
-
260
  if not loaded:
261
  gr.Markdown(f"❌ Load error: {load_status}")
262
 
@@ -283,7 +264,8 @@ def create_app():
283
  btn.click(
284
  fn=process_text,
285
  inputs=[txt, mdl],
286
- outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out, text_out]
 
287
  )
288
  sentence_dd.change(
289
  fn=update_svg,
 
6
  import traceback
7
  from pathlib import Path
8
 
9
+ # ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────
10
 
11
  LESBIAN_MODELS = {}
12
  MODEL_VARIANTS = {
 
28
 
29
  def initialize_models():
30
  try:
31
+ base = Path("./models")
32
+ base.mkdir(exist_ok=True)
33
  for name, repo in MODEL_VARIANTS.items():
34
+ out = base/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
+ "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
38
+ "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
39
+ "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
40
+ "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
+ for fn, url in files.items():
43
  tgt = out/fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
 
60
  loaded, load_status = initialize_models()
61
 
62
 
63
+ # ─── 2. CoNLL-U STRINGIZER ────────────────────────────────────────────────
64
 
65
  def stanza_doc_to_conllu(doc) -> str:
66
  lines = []
67
+ for sid, sent in enumerate(doc.sentences, 1):
68
  lines.append(f"# sent_id = {sid}")
69
  lines.append(f"# text = {sent.text}")
70
  for w in sent.words:
 
73
  w.lemma or "_", w.upos or "_",
74
  w.xpos or "_", w.feats or "_",
75
  str(w.head) if w.head is not None else "0",
76
+ w.deprel or "_", "_", "_"
77
  ]
78
  lines.append("\t".join(fields))
79
  lines.append("") # blank line after each sentence
80
  return "\n".join(lines)
81
 
82
 
83
+ # ─── 3. TOKEN TABLE with comment‐rows ────────────────────────────────────
84
 
85
+ def conllu_to_dataframe_table(conllu: str) -> pd.DataFrame:
86
  """
87
+ Insert:
88
+ - blank row (except first)
89
+ - # sent_id = …
90
+ - # text = …
91
+ before each sentence's tokens.
92
  """
 
93
  records = []
94
+ blocks = [b for b in conllu.split("\n\n") if b.strip()]
95
  first = True
 
96
  for block in blocks:
97
  lines = block.splitlines()
98
+ sid_line, txt_line = lines[0], lines[1]
 
99
  token_lines = lines[2:]
 
100
  if not first:
101
  # blank row
102
+ records.append({c: "" for c in ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]})
 
 
103
  first = False
 
104
  # comment rows
105
+ records.append({"Id": sid_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
106
+ records.append({"Id": txt_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
 
 
 
107
  # token rows
108
  for tl in token_lines:
109
  parts = tl.split("\t")
110
  if len(parts) < 10:
111
  continue
112
  records.append({
113
+ "Id": parts[0],
114
+ "Form": parts[1],
115
+ "Lemma": parts[2],
116
  "UPosTag": parts[3],
117
  "XPosTag": parts[4],
118
+ "Feats": parts[5],
119
+ "Head": parts[6],
120
+ "DepRel": parts[7],
121
+ "Deps": parts[8],
122
+ "Misc": parts[9]
123
  })
124
+ return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
125
 
 
 
 
126
 
127
+ # ─── 4. TEXT‐BASED DEPENDENCIES with blank + comments ─────────────────────
128
 
129
+ def create_dependency_visualization(df_table: pd.DataFrame) -> str:
 
 
130
  """
131
+ Walk token‐table rows, emitting:
132
+ - blank line + comment lines before each sentence
133
+ - dependency lines
 
134
  """
135
+ if df_table.empty:
136
+ return "No data to visualize"
137
+ lines = []
138
  first = True
139
+ # We detect new sentence by comment rows in Id column starting with '# sent_id'
140
+ for row in df_table.itertuples(index=False):
141
+ if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
142
+ if not first:
143
+ lines.append("") # blank separator
144
+ first = False
145
+ # emit sent_id and text
146
+ lines.append(row.Id)
147
+ # next row in table is "# text = …"
148
+ continue
149
+ if isinstance(row.Id, str) and row.Id.startswith("# text"):
150
+ lines.append(row.Id)
151
+ continue
152
+ # skip blank/comment rows
153
+ if not row.Id.isdigit():
154
+ continue
155
+ # actual token row
156
+ w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
157
+ if h != "0":
158
+ # find head form
159
+ try:
160
+ hw = df_table[df_table.Id == h].iloc[0].Form
161
+ except:
162
+ hw = "[ERR]"
163
+ lines.append(f"{w} ({p}) --{d}--> {hw}")
164
+ else:
165
+ lines.append(f"{w} ({p}) --{d}--> ROOT")
166
+ return "\n".join(lines)
 
 
 
167
 
168
 
169
+ # ─── 5. SVG BUILDER (unchanged) ───────────────────────────────────────────
170
 
171
  def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
172
+ # Paste your entire original SVG‐generation code here unchanged
173
+ # It must accept sentence_data: list of dicts and return SVG string.
174
+ svg = "<svg><!-- your SVG here --></svg>"
175
+ return svg
 
 
176
 
177
 
178
+ # ─── 6. PROCESS & DROPDOWN LOGIC ──────────────────────────────────────────
179
 
180
  def process_text(text, variant):
181
  if not text.strip():
 
192
  [], "", pd.DataFrame(), ""
193
  )
194
 
 
195
  doc = pipe(text)
196
  conllu = stanza_doc_to_conllu(doc)
197
 
198
+ # token table with comments
199
+ df_table = conllu_to_dataframe_table(conllu)
200
 
201
+ # text-based dependencies
202
+ text_viz = create_dependency_visualization(df_table)
203
 
204
+ # prepare for SVG dropdown
205
  sentences = []
206
  for sent in doc.sentences:
207
  payload = [{
208
  'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
209
  'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
210
  'FEATS': w.feats or "_", 'HEAD': w.head or 0,
211
+ 'DEPREL': w.deprel or "_"
212
  } for w in sent.words]
213
  sentences.append(payload)
214
 
 
221
  dd_upd,
222
  sentences,
223
  conllu,
224
+ df_table,
225
+ text_viz
226
  )
227
 
228
  def update_svg(selected_id, sentences):
229
  try:
230
+ idx = int(selected_id) - 1
231
  return create_single_sentence_svg(sentences[idx])
232
  except:
233
  return "<p>Invalid selection</p>"
234
 
235
 
236
+ # ─── 7. GRADIO UI ─────────────────────────────────────────────────────────
237
 
238
  def create_app():
239
  with gr.Blocks(title="Lesbian Greek Parser") as app:
240
  gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
 
241
  if not loaded:
242
  gr.Markdown(f"❌ Load error: {load_status}")
243
 
 
264
  btn.click(
265
  fn=process_text,
266
  inputs=[txt, mdl],
267
+ outputs=[svg_out, sentence_dd, sentences_st,
268
+ conllu_out, table_out, text_out]
269
  )
270
  sentence_dd.change(
271
  fn=update_svg,