sbompolas committed on
Commit
f73ba75
Β·
verified Β·
1 Parent(s): 78b8555

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -104
app.py CHANGED
@@ -6,7 +6,7 @@ import requests
6
  import traceback
7
  from pathlib import Path
8
 
9
- # ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────────
10
 
11
  LESBIAN_MODELS = {}
12
  MODEL_VARIANTS = {
@@ -34,20 +34,18 @@ def initialize_models():
34
  out = models_dir/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
- "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
38
- "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
39
- "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
40
- "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
- for fn, url in files.items():
43
  tgt = out/fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
46
  cfg = {
47
  'processors': 'tokenize,pos,lemma,depparse',
48
- 'lang': 'el',
49
- 'use_gpu': False,
50
- 'verbose': False,
51
  'tokenize_model_path': str(out/"tokenizer.pt"),
52
  'pos_model_path': str(out/"pos.pt"),
53
  'lemma_model_path': str(out/"lemmatizer.pt"),
@@ -62,95 +60,139 @@ def initialize_models():
62
  loaded, load_status = initialize_models()
63
 
64
 
65
- # ─── 2. CONLL-U / TOKEN TABLE / TEXT-BASED VIZ ────────────────────────────────
66
 
67
  def stanza_doc_to_conllu(doc) -> str:
68
  lines = []
69
- for sid, sent in enumerate(doc.sentences, 1):
70
  lines.append(f"# sent_id = {sid}")
71
  lines.append(f"# text = {sent.text}")
72
  for w in sent.words:
73
  fields = [
74
- str(w.id),
75
- w.text,
76
- w.lemma or "_",
77
- w.upos or "_",
78
- w.xpos or "_",
79
- w.feats or "_",
80
  str(w.head) if w.head is not None else "0",
81
- w.deprel or "_",
82
- "_",
83
- "_"
84
  ]
85
  lines.append("\t".join(fields))
86
- lines.append("")
87
  return "\n".join(lines)
88
 
 
 
 
89
  def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
90
- """Return only the 10 token columns, renamed per spec."""
91
- rows = []
92
- for line in conllu.splitlines():
93
- if not line or line.startswith("#"):
94
- continue
95
- parts = line.split("\t")
96
- if len(parts) < 10:
97
- continue
98
- rows.append({
99
- "Id": parts[0],
100
- "Form": parts[1],
101
- "Lemma": parts[2],
102
- "UPosTag": parts[3],
103
- "XPosTag": parts[4],
104
- "Feats": parts[5],
105
- "Head": parts[6],
106
- "DepRel": parts[7],
107
- "Deps": parts[8],
108
- "Misc": parts[9],
109
- })
110
- return pd.DataFrame(rows, columns=[
111
- "Id","Form","Lemma","UPosTag","XPosTag",
112
- "Feats","Head","DepRel","Deps","Misc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  ])
114
 
115
- def create_dependency_visualization(df: pd.DataFrame) -> str:
 
 
 
116
  """
117
- Blank line before every new sentence (ID == '1'), except the first.
 
 
 
118
  """
119
- if df.empty:
120
- return "No data to visualize"
121
-
122
- lines = []
123
  first = True
124
- for row in df.itertuples(index=False):
125
- if row.Id == "1":
126
- if not first:
127
- lines.append("") # blank line between sentences
128
- first = False
129
-
130
- w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
131
- if h != "0":
132
- try:
133
- hw = df[df.Id == h].iloc[0].Form
134
- except:
135
- hw = "[ERR]"
136
- lines.append(f"{w} ({p}) --{d}--> {hw}")
137
- else:
138
- lines.append(f"{w} ({p}) --{d}--> ROOT")
139
 
140
- return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
 
 
 
 
142
 
143
- # ─── 3. FULL SVG BUILDER ─────────────────────────────────────────────────────
144
  def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
145
  """
146
- Paste your original create_single_sentence_svg code here unchanged.
147
- It expects sentence_data: List[Dict] and returns <svg>…</svg>.
148
  """
149
- # … your full SVG builder …
150
  return "<svg><!-- your SVG here --></svg>"
151
 
152
 
153
- # ─── 4. PROCESS & DROPDOWN-UPDATES ───────────────────────────────────────────
154
 
155
  def process_text(text, variant):
156
  if not text.strip():
@@ -167,12 +209,17 @@ def process_text(text, variant):
167
  [], "", pd.DataFrame(), ""
168
  )
169
 
170
- doc = pipe(text)
 
171
  conllu = stanza_doc_to_conllu(doc)
 
 
172
  df = conllu_to_dataframe(conllu)
173
- text_v = create_dependency_visualization(df)
174
 
175
- # build sentence‐level payload for SVG
 
 
 
176
  sentences = []
177
  for sent in doc.sentences:
178
  payload = [{
@@ -198,13 +245,13 @@ def process_text(text, variant):
198
 
199
  def update_svg(selected_id, sentences):
200
  try:
201
- idx = int(selected_id) - 1
202
  return create_single_sentence_svg(sentences[idx])
203
  except:
204
  return "<p>Invalid selection</p>"
205
 
206
 
207
- # ─── 5. BUILD THE GRADIO INTERFACE ──────────────────────────────────────────
208
 
209
  def create_app():
210
  with gr.Blocks(title="Lesbian Greek Parser") as app:
@@ -215,16 +262,10 @@ def create_app():
215
 
216
  with gr.Row():
217
  with gr.Column():
218
- txt = gr.Textbox(
219
- label="Input Text",
220
- lines=4,
221
- placeholder="ΕισάγΡτΡ κΡίμΡνο…"
222
- )
223
- mdl = gr.Radio(
224
- choices=list(MODEL_VARIANTS.keys()),
225
- value="Lesbian-only",
226
- label="Model Variant"
227
- )
228
  btn = gr.Button("Parse", variant="primary")
229
 
230
  with gr.Row():
@@ -235,29 +276,14 @@ def create_app():
235
 
236
  with gr.Row():
237
  with gr.Column():
238
- conllu_out = gr.Textbox(
239
- label="CoNLL-U",
240
- lines=10,
241
- show_copy_button=True
242
- )
243
  table_out = gr.Dataframe(label="Token Table")
244
- text_out = gr.Textbox(
245
- label="Text-based Dependencies",
246
- lines=8,
247
- show_copy_button=True
248
- )
249
 
250
  btn.click(
251
  fn=process_text,
252
  inputs=[txt, mdl],
253
- outputs=[
254
- svg_out,
255
- sentence_dd,
256
- sentences_st,
257
- conllu_out,
258
- table_out,
259
- text_out
260
- ]
261
  )
262
  sentence_dd.change(
263
  fn=update_svg,
 
6
  import traceback
7
  from pathlib import Path
8
 
9
+ # 1. MODEL VARIANTS & INITIALIZATION
10
 
11
  LESBIAN_MODELS = {}
12
  MODEL_VARIANTS = {
 
34
  out = models_dir/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
+ "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
38
+ "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
39
+ "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
40
+ "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
+ for fn,url in files.items():
43
  tgt = out/fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
46
  cfg = {
47
  'processors': 'tokenize,pos,lemma,depparse',
48
+ 'lang': 'el', 'use_gpu': False, 'verbose': False,
 
 
49
  'tokenize_model_path': str(out/"tokenizer.pt"),
50
  'pos_model_path': str(out/"pos.pt"),
51
  'lemma_model_path': str(out/"lemmatizer.pt"),
 
60
  loaded, load_status = initialize_models()
61
 
62
 
63
+ # 2. CONLL-U TO COARSE OUTPUT
64
 
65
  def stanza_doc_to_conllu(doc) -> str:
66
  lines = []
67
+ for sid, sent in enumerate(doc.sentences,1):
68
  lines.append(f"# sent_id = {sid}")
69
  lines.append(f"# text = {sent.text}")
70
  for w in sent.words:
71
  fields = [
72
+ str(w.id), w.text,
73
+ w.lemma or "_", w.upos or "_",
74
+ w.xpos or "_", w.feats or "_",
 
 
 
75
  str(w.head) if w.head is not None else "0",
76
+ w.deprel or "_","_","_"
 
 
77
  ]
78
  lines.append("\t".join(fields))
79
+ lines.append("") # blank line after each sentence
80
  return "\n".join(lines)
81
 
82
+
83
+ # 3. TOKEN TABLE: insert comment‐rows + empty‐rows per sentence
84
+
85
  def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
86
+ """
87
+ Splits conllu into sentence‐blocks, then for each block:
88
+ - (if not first) insert a blank row
89
+ - insert '# sent_id = …' row
90
+ - insert '# text = …' row
91
+ - then all token rows
92
+ """
93
+ blocks = [b for b in conllu.split("\n\n") if b.strip()]
94
+ records = []
95
+ first = True
96
+
97
+ for block in blocks:
98
+ lines = block.splitlines()
99
+ sid_line = lines[0]
100
+ text_line = lines[1]
101
+ token_lines = lines[2:]
102
+
103
+ if not first:
104
+ # blank row
105
+ records.append({c:"" for c in
106
+ ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]
107
+ })
108
+ first = False
109
+
110
+ # comment rows
111
+ records.append({"Id": sid_line, "Form":"", "Lemma":"", "UPosTag":"",
112
+ "XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
113
+ records.append({"Id": text_line, "Form":"", "Lemma":"", "UPosTag":"",
114
+ "XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
115
+
116
+ # token rows
117
+ for tl in token_lines:
118
+ parts = tl.split("\t")
119
+ if len(parts) < 10:
120
+ continue
121
+ records.append({
122
+ "Id": parts[0],
123
+ "Form": parts[1],
124
+ "Lemma": parts[2],
125
+ "UPosTag": parts[3],
126
+ "XPosTag": parts[4],
127
+ "Feats": parts[5],
128
+ "Head": parts[6],
129
+ "DepRel": parts[7],
130
+ "Deps": parts[8],
131
+ "Misc": parts[9]
132
+ })
133
+
134
+ return pd.DataFrame(records, columns=[
135
+ "Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"
136
  ])
137
 
138
+
139
+ # 4. TEXT-BASED DEPENDENCIES: blank + comment per sentence
140
+
141
+ def create_dependency_visualization(conllu: str) -> str:
142
  """
143
+ Splits by blank‐line into sentence‐blocks, then for each:
144
+ - blank line (if not first)
145
+ - comment lines
146
+ - parse lines
147
  """
148
+ blocks = [b for b in conllu.split("\n\n") if b.strip()]
149
+ out = []
 
 
150
  first = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
+ for block in blocks:
153
+ lines = block.splitlines()
154
+ sid_line, txt_line = lines[0], lines[1]
155
+ token_lines = lines[2:]
156
+
157
+ if not first:
158
+ out.append("") # blank line separator
159
+ first = False
160
+
161
+ out.append(sid_line)
162
+ out.append(txt_line)
163
+
164
+ # build head→form map for this sentence
165
+ id2form = {}
166
+ for tl in token_lines:
167
+ p = tl.split("\t")
168
+ if len(p)>=2:
169
+ id2form[p[0]] = p[1]
170
+
171
+ for tl in token_lines:
172
+ p = tl.split("\t")
173
+ if len(p) < 8:
174
+ continue
175
+ w, upos, head, deprel = p[1], p[3], p[6], p[7]
176
+ if head != "0" and head in id2form:
177
+ out.append(f"{w} ({upos}) --{deprel}--> {id2form[head]}")
178
+ else:
179
+ out.append(f"{w} ({upos}) --{deprel}--> ROOT")
180
 
181
+ return "\n".join(out)
182
+
183
+
184
+ # 5. SVG BUILDER (unchanged)
185
 
 
186
  def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
187
  """
188
+ Paste your entire original SVG‐generation code here unchanged.
189
+ It takes sentence_data: List[dict] and returns an <svg>…</svg> string.
190
  """
191
+ # … your SVG builder from the attached file …
192
  return "<svg><!-- your SVG here --></svg>"
193
 
194
 
195
+ # 6. PROCESS + DROPDOWN
196
 
197
  def process_text(text, variant):
198
  if not text.strip():
 
209
  [], "", pd.DataFrame(), ""
210
  )
211
 
212
+ # parse
213
+ doc = pipe(text)
214
  conllu = stanza_doc_to_conllu(doc)
215
+
216
+ # build token table
217
  df = conllu_to_dataframe(conllu)
 
218
 
219
+ # text-based deps
220
+ text_v = create_dependency_visualization(conllu)
221
+
222
+ # prepare sentence payloads for SVG
223
  sentences = []
224
  for sent in doc.sentences:
225
  payload = [{
 
245
 
246
  def update_svg(selected_id, sentences):
247
  try:
248
+ idx = int(selected_id)-1
249
  return create_single_sentence_svg(sentences[idx])
250
  except:
251
  return "<p>Invalid selection</p>"
252
 
253
 
254
+ # 7. GRADIO UI
255
 
256
  def create_app():
257
  with gr.Blocks(title="Lesbian Greek Parser") as app:
 
262
 
263
  with gr.Row():
264
  with gr.Column():
265
+ txt = gr.Textbox(label="Input Text", lines=4,
266
+ placeholder="ΕισάγΡτΡ κΡίμΡνο…")
267
+ mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
268
+ value="Lesbian-only", label="Model Variant")
 
 
 
 
 
 
269
  btn = gr.Button("Parse", variant="primary")
270
 
271
  with gr.Row():
 
276
 
277
  with gr.Row():
278
  with gr.Column():
279
+ conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
 
 
 
 
280
  table_out = gr.Dataframe(label="Token Table")
281
+ text_out = gr.Textbox(label="Text-based Dependencies", lines=8, show_copy_button=True)
 
 
 
 
282
 
283
  btn.click(
284
  fn=process_text,
285
  inputs=[txt, mdl],
286
+ outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out, text_out]
 
 
 
 
 
 
 
287
  )
288
  sentence_dd.change(
289
  fn=update_svg,