sbompolas committed on
Commit
4670ce3
·
verified ·
1 Parent(s): 08f1dd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -65
app.py CHANGED
@@ -31,7 +31,7 @@ def initialize_models():
31
  base = Path("./models")
32
  base.mkdir(exist_ok=True)
33
  for name, repo in MODEL_VARIANTS.items():
34
- out = base/name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
  "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
@@ -40,7 +40,7 @@ def initialize_models():
40
  "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
  for fn, url in files.items():
43
- tgt = out/fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
46
  cfg = {
@@ -78,29 +78,26 @@ def stanza_doc_to_conllu(doc) -> str:
78
  w.deprel or "_", "_", "_"
79
  ]
80
  lines.append("\t".join(fields))
81
- lines.append("")
82
  return "\n".join(lines)
83
 
84
  def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
 
85
  rows = []
86
  for line in conllu.splitlines():
87
- # empty line → separator between sentences
88
  if not line:
89
- if rows and rows[-1] != {}:
90
- rows.append({})
91
  continue
92
-
93
- # comment line → sentence header
94
  if line.startswith("#"):
95
- # ex: "# sent_id = 2"
96
  if "=" in line:
97
  key, val = line[2:].split("=", 1)
98
- rows.append({
99
- 'ID': f"# {key.strip()} =",
100
- 'FORM': val.strip()
101
- })
 
102
  continue
103
-
104
  parts = line.split("\t")
105
  if len(parts) >= 10:
106
  rows.append({
@@ -115,25 +112,22 @@ def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
115
  'DEPS': parts[8],
116
  'MISC': parts[9]
117
  })
118
- return pd.DataFrame(rows)
 
119
 
120
  def create_dependency_visualization(df: pd.DataFrame) -> str:
121
  if df.empty:
122
  return "No data to visualize"
123
  viz = []
124
  for _, row in df.iterrows():
125
- rid = row.get("ID")
126
- # skip outright blank-rows skeleton
127
- if pd.isna(rid):
128
- continue
129
-
130
- # sentence header
131
  if isinstance(rid, str) and rid.startswith("#"):
132
- if viz:
133
- viz.append("") # blank line before new sentence
134
- viz.append(f"{rid} {row.get('FORM')}")
 
135
  continue
136
-
137
  w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
138
  if h != '0':
139
  try:
@@ -161,10 +155,8 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
161
 
162
  width = max(800, word_count * word_spacing + 100)
163
  height = 500
164
-
165
  word_y = height - 120
166
- pos_y = word_y + 20
167
- features_start_y = pos_y + 15
168
 
169
  deprel_colors = {
170
  'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
@@ -189,24 +181,22 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
189
  )
190
  svg.append('</defs><g>')
191
 
192
- # calculate x positions
193
- word_positions = {}
194
- for idx, row in df.iterrows():
195
- wid = int(row['ID'])
196
- word_positions[wid] = 50 + (wid - 1) * word_spacing
197
 
198
  # draw arcs/lines
199
  used_spans = []
200
  for _, row in df.iterrows():
201
- wid = int(row['ID'])
202
- hid = int(row['HEAD']) if row['HEAD'] != '0' else 0
 
203
  rel = row['DEPREL']
204
-
205
  x1 = word_positions[wid]
206
  col = deprel_colors.get(rel, '#000')
207
-
208
  if hid == 0:
209
- # root arrow
210
  svg.append(
211
  f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
212
  f'stroke="{col}" stroke-width="1.5"/>'
@@ -233,17 +223,13 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
233
  conflict = True
234
  break
235
  used_spans.append((span, lvl))
236
-
237
  dist = abs(x2 - x1)
238
- base_h = min(40 + dist * 0.15, 100)
239
- arc_h = base_h + lvl * 35
240
- midx = (x1 + x2) / 2
241
- cty = word_y - arc_h
242
  svg.append(
243
- f'<path d="M {x1} {word_y-15} Q {midx} {cty} {x2} {word_y-15}" '
244
  f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
245
  )
246
- # label box
247
  amx = 0.25*x1 + 0.5*midx + 0.25*x2
248
  amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
249
  lw = len(rel)*6 + 8
@@ -258,31 +244,21 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
258
 
259
  # draw words + annotations
260
  for _, row in df.iterrows():
261
- wid = int(row['ID'])
262
- x = word_positions[wid]
263
- word = row['FORM']
264
- pos = row['UPOS']
265
- lemma = row['LEMMA']
266
- feats = row['FEATS']
267
- xpos = row['XPOS']
268
-
269
- # word text
270
  svg.append(
271
  f'<text x="{x}" y="{word_y}" text-anchor="middle" '
272
  'font-family="Arial" font-size="13" font-weight="bold">'
273
- f'{word}</text>'
274
  )
275
-
276
- # annotations underneath
277
  ann = []
278
- if pos and pos != '_': ann.append(f"upos={pos}")
279
- if lemma and lemma not in ('_', word): ann.append(f"lemma={lemma}")
280
- if xpos and xpos != '_': ann.append(f"xpos={xpos}")
281
- if feats and feats not in ('', '_'):
282
- for fpair in feats.split('|'):
283
  if '=' in fpair:
284
  ann.append(fpair)
285
-
286
  for i, a in enumerate(ann):
287
  y0 = features_start_y + i*12
288
  svg.append(
@@ -315,7 +291,7 @@ def process_text(text, variant):
315
  [], "", pd.DataFrame(), ""
316
  )
317
 
318
- doc = pipe(text)
319
  conllu = stanza_doc_to_conllu(doc)
320
  df = conllu_to_dataframe(conllu)
321
  text_v = create_dependency_visualization(df)
@@ -394,7 +370,7 @@ def create_app():
394
  table_out = gr.Dataframe(label="Token Table")
395
  text_out = gr.Textbox(
396
  label="Text-based Dependencies",
397
- lines=8,
398
  show_copy_button=True
399
  )
400
 
 
31
  base = Path("./models")
32
  base.mkdir(exist_ok=True)
33
  for name, repo in MODEL_VARIANTS.items():
34
+ out = base / name
35
  out.mkdir(exist_ok=True)
36
  files = {
37
  "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
 
40
  "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
41
  }
42
  for fn, url in files.items():
43
+ tgt = out / fn
44
  if not tgt.exists() and not download_model_file(url, str(tgt)):
45
  return False, f"Failed to download {fn} for {name}"
46
  cfg = {
 
78
  w.deprel or "_", "_", "_"
79
  ]
80
  lines.append("\t".join(fields))
81
+ lines.append("") # blank line after each sentence
82
  return "\n".join(lines)
83
 
84
  def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
85
+ cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
86
  rows = []
87
  for line in conllu.splitlines():
 
88
  if not line:
89
+ # empty row between sentences
90
+ rows.append({c: "" for c in cols})
91
  continue
 
 
92
  if line.startswith("#"):
 
93
  if "=" in line:
94
  key, val = line[2:].split("=", 1)
95
+ key, val = key.strip(), val.strip()
96
+ if key == "sent_id":
97
+ rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
98
+ elif key == "text":
99
+ rows.append({'ID': f"# text = {val}", 'FORM': ""})
100
  continue
 
101
  parts = line.split("\t")
102
  if len(parts) >= 10:
103
  rows.append({
 
112
  'DEPS': parts[8],
113
  'MISC': parts[9]
114
  })
115
+ df = pd.DataFrame(rows, columns=cols)
116
+ return df.fillna("") # replace NaN with empty strings
117
 
118
  def create_dependency_visualization(df: pd.DataFrame) -> str:
119
  if df.empty:
120
  return "No data to visualize"
121
  viz = []
122
  for _, row in df.iterrows():
123
+ rid = row['ID']
124
+ # sentence header rows
 
 
 
 
125
  if isinstance(rid, str) and rid.startswith("#"):
126
+ # only before each new sent_id do we add a blank line
127
+ if rid.startswith("# sent_id") and viz:
128
+ viz.append("")
129
+ viz.append(rid)
130
  continue
 
131
  w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
132
  if h != '0':
133
  try:
 
155
 
156
  width = max(800, word_count * word_spacing + 100)
157
  height = 500
 
158
  word_y = height - 120
159
+ features_start_y = word_y + 35 # space for UPOS + lemma
 
160
 
161
  deprel_colors = {
162
  'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
 
181
  )
182
  svg.append('</defs><g>')
183
 
184
+ # x positions
185
+ word_positions = {
186
+ int(r['ID']): 50 + (int(r['ID']) - 1) * word_spacing
187
+ for _, r in df.iterrows() if str(r['ID']).isdigit()
188
+ }
189
 
190
  # draw arcs/lines
191
  used_spans = []
192
  for _, row in df.iterrows():
193
+ if not str(row['ID']).isdigit():
194
+ continue
195
+ wid, hid = int(row['ID']), int(row['HEAD'])
196
  rel = row['DEPREL']
 
197
  x1 = word_positions[wid]
198
  col = deprel_colors.get(rel, '#000')
 
199
  if hid == 0:
 
200
  svg.append(
201
  f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
202
  f'stroke="{col}" stroke-width="1.5"/>'
 
223
  conflict = True
224
  break
225
  used_spans.append((span, lvl))
 
226
  dist = abs(x2 - x1)
227
+ arc_h = min(40 + dist * 0.15, 100) + lvl * 35
228
+ midx, cty = (x1 + x2) / 2, word_y - arc_h
 
 
229
  svg.append(
230
+ f'<path d="M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}" '
231
  f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
232
  )
 
233
  amx = 0.25*x1 + 0.5*midx + 0.25*x2
234
  amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
235
  lw = len(rel)*6 + 8
 
244
 
245
  # draw words + annotations
246
  for _, row in df.iterrows():
247
+ if not str(row['ID']).isdigit():
248
+ continue
249
+ x = word_positions[int(row['ID'])]
 
 
 
 
 
 
250
  svg.append(
251
  f'<text x="{x}" y="{word_y}" text-anchor="middle" '
252
  'font-family="Arial" font-size="13" font-weight="bold">'
253
+ f'{row["FORM"]}</text>'
254
  )
 
 
255
  ann = []
256
+ if row['UPOS'] and row['UPOS'] != '_': ann.append(f"upos={row['UPOS']}")
257
+ if row['LEMMA'] and row['LEMMA'] not in ('_', row['FORM']): ann.append(f"lemma={row['LEMMA']}")
258
+ if row['FEATS'] and row['FEATS'] not in ('', '_'):
259
+ for fpair in row['FEATS'].split('|'):
 
260
  if '=' in fpair:
261
  ann.append(fpair)
 
262
  for i, a in enumerate(ann):
263
  y0 = features_start_y + i*12
264
  svg.append(
 
291
  [], "", pd.DataFrame(), ""
292
  )
293
 
294
+ doc = pipe(text)
295
  conllu = stanza_doc_to_conllu(doc)
296
  df = conllu_to_dataframe(conllu)
297
  text_v = create_dependency_visualization(df)
 
370
  table_out = gr.Dataframe(label="Token Table")
371
  text_out = gr.Textbox(
372
  label="Text-based Dependencies",
373
+ lines=10,
374
  show_copy_button=True
375
  )
376