emvecchi committed on
Commit
d9443a6
·
verified ·
1 Parent(s): d9a5990

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -68
app.py CHANGED
@@ -4,10 +4,7 @@ from dataclasses import dataclass, field
4
  from typing import List, Optional, Dict
5
  from PIL import Image
6
 
7
- import re
8
- import textwrap
9
- import uuid
10
- import html as py_html
11
  from pathlib import Path
12
 
13
  import numpy as np
@@ -170,114 +167,129 @@ def display_image(image_path):
170
  img = Image.open(f)
171
  st.image(img, caption='8 most contributing properties', use_column_width=True)
172
 
173
- SPEAKER_RE = re.compile(r'^\s*\*\*(T|P):\*\*\s*(.*)$') # matches **T:** or **P:** at start
 
 
 
174
 
175
  def _read_md_any(path: str) -> str:
176
- """Read a UTF-8 .md from HF dataset or local disk based on your `filesystem` flag."""
177
- full = path
178
- if not full.startswith(input_repo_path):
179
- full = f"{input_repo_path}/{path}" # allow "dialogues/foo.md"
180
  if filesystem == "hf":
181
  with hf_fs.open(full, "rb") as f:
182
  return f.read().decode("utf-8")
183
- else:
184
- return Path(full).read_text(encoding="utf-8")
185
 
186
  def _wrap_paragraph(text: str, width: int) -> list[str]:
187
  if not text.strip():
188
  return [""]
189
  return textwrap.wrap(
190
- text.strip(),
191
  width=width,
192
  break_long_words=False,
193
  break_on_hyphens=False,
194
- # important: don't carry spaces across lines
195
- drop_whitespace=True, # ← was False
196
- replace_whitespace=True # use normal collapsing
197
  ) or [""]
198
 
199
  def _md_dialogue_to_lines(md_text: str, width: int) -> list[str]:
200
- """
201
- Turn the whole .md dialogue into a list of visual lines:
202
- - Bold speaker label only (T:/P:) on the first wrapped line of a paragraph.
203
- - Add a blank line between paragraphs.
204
- """
205
  md_text = md_text.replace("\r\n", "\n").replace("\r", "\n").strip("\n")
206
- paragraphs = re.split(r"\n\s*\n", md_text) # split on blank lines
207
- out: list[str] = []
208
-
209
  for p in paragraphs:
210
  p = p.strip()
211
  if not p:
212
- out.append("") # preserve blank paragraph as a blank numbered line
213
  continue
214
-
215
  m = SPEAKER_RE.match(p)
216
  if m:
217
  speaker, content = m.group(1), m.group(2)
218
  wrapped = _wrap_paragraph(content, width)
219
- # first line with bold speaker label
220
- first = f"<strong>{speaker}:</strong> {py_html.escape(wrapped[0])}".rstrip()
221
- out.append(first)
222
- # continuation lines without label
223
  for w in wrapped[1:]:
224
  out.append(py_html.escape(w))
225
  else:
226
- # plain paragraph (no speaker tag)
227
  for w in _wrap_paragraph(p, width):
228
  out.append(py_html.escape(w))
229
-
230
- # blank line after each paragraph
231
- out.append("")
232
- # drop trailing blank if you don't want an extra empty at the end
233
- if out and out[-1] == "":
234
- out.pop()
235
  return out
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  def display_numbered_dialogue(md_path: str,
238
  title: str = "Session Transcription",
239
  width_chars: int = 80,
240
- max_height_px: int = 520):
241
- """
242
- Render the .md dialogue like your screenshot:
243
- - left line numbers
244
- - wrapped to `width_chars`
245
- - scrolls inside the box
246
- """
247
- text = _read_md_any(md_path)
248
  lines = _md_dialogue_to_lines(text, width=width_chars)
249
 
250
  block_id = f"dlg-{uuid.uuid4().hex[:8]}"
251
- # build HTML rows
252
- rows_html = "\n".join(
253
- f"<div class='row'><span class='num'>{i}</span><span class='txt'>{ln or '&nbsp;'}</span></div>"
 
 
 
 
254
  for i, ln in enumerate(lines, 1)
255
  )
256
 
257
  st.markdown(f"""
258
- <style>
259
- #{block_id} {{
260
- max-height: {max_height_px}px; overflow-y: auto;
261
- font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
262
- line-height: 1.5;
263
- }}
264
- #{block_id} .row {{
265
- display: grid; grid-template-columns: 4ch 1fr; column-gap: 1ch; align-items: start;
266
- }}
267
- #{block_id} .num {{
268
- color: rgba(0,0,0,.55); text-align: right; user-select: none;
269
- }}
270
- /* The text column is visually constrained to `width_chars` characters */
271
- #{block_id} .txt {{
272
- max-width: {width_chars}ch; white-space: pre-wrap; word-break: break-word;
273
- }}
274
- #{block_id} strong {{ font-weight: 700; }}
275
- </style>
276
  """, unsafe_allow_html=True)
277
 
278
- with st.container(border=False):
279
- #st.markdown(f"**{title}** \n*(wrapped to {width_chars} chars; numbers by visual lines)*")
280
- st.markdown(f"<div id='{block_id}'>{rows_html}</div>", unsafe_allow_html=True)
281
 
282
  #################################### Streamlit App ####################################
283
 
 
4
  from typing import List, Optional, Dict
5
  from PIL import Image
6
 
7
+ import re, textwrap, uuid, html as py_html
 
 
 
8
  from pathlib import Path
9
 
10
  import numpy as np
 
167
  img = Image.open(f)
168
  st.image(img, caption='8 most contributing properties', use_column_width=True)
169
 
170
+ TEXT_STACK = "system-ui, -apple-system, 'Segoe UI', Roboto, Helvetica, Arial, 'Noto Sans', 'Liberation Sans', sans-serif"
171
+ MONO_STACK = "ui-monospace, SFMono-Regular, Menlo, Consolas, 'Liberation Mono', 'Roboto Mono', monospace"
172
+ SPEAKER_RE = re.compile(r'^\s*\*\*(T|P):\*\*\s*(.*)$')
173
+
174
 
175
  def _read_md_any(path: str) -> str:
176
+ full = path if path.startswith(input_repo_path) else f"{input_repo_path}/{path}"
 
 
 
177
  if filesystem == "hf":
178
  with hf_fs.open(full, "rb") as f:
179
  return f.read().decode("utf-8")
180
+ return Path(full).read_text(encoding="utf-8")
 
181
 
182
  def _wrap_paragraph(text: str, width: int) -> list[str]:
183
  if not text.strip():
184
  return [""]
185
  return textwrap.wrap(
186
+ text.strip().replace("\u00A0", " "), # normalize NBSP
187
  width=width,
188
  break_long_words=False,
189
  break_on_hyphens=False,
190
+ drop_whitespace=True, # <- prevents leading spaces on next line
191
+ replace_whitespace=True,
 
192
  ) or [""]
193
 
194
  def _md_dialogue_to_lines(md_text: str, width: int) -> list[str]:
 
 
 
 
 
195
  md_text = md_text.replace("\r\n", "\n").replace("\r", "\n").strip("\n")
196
+ paragraphs = re.split(r"\n\s*\n", md_text)
197
+ out = []
 
198
  for p in paragraphs:
199
  p = p.strip()
200
  if not p:
201
+ out.append("")
202
  continue
 
203
  m = SPEAKER_RE.match(p)
204
  if m:
205
  speaker, content = m.group(1), m.group(2)
206
  wrapped = _wrap_paragraph(content, width)
207
+ out.append(f"<strong>{speaker}:</strong> {py_html.escape(wrapped[0])}".rstrip())
 
 
 
208
  for w in wrapped[1:]:
209
  out.append(py_html.escape(w))
210
  else:
 
211
  for w in _wrap_paragraph(p, width):
212
  out.append(py_html.escape(w))
213
+ out.append("") # blank line between paragraphs
214
+ if out and out[-1] == "": out.pop()
 
 
 
 
215
  return out
216
 
217
+ # Inject global CSS once (prevents FOUC and font swaps)
218
+ def inject_dialogue_css_once():
219
+ if st.session_state.get("_dlg_css_injected"): return
220
+ st.session_state["_dlg_css_injected"] = True
221
+ st.markdown(f"""
222
+ <style id="dlg-css">
223
+ /* container styles (applied per instance inline) */
224
+
225
+ /* table layout is very stable across rerenders */
226
+ .dlg-table {{
227
+ border-collapse: separate;
228
+ border-spacing: 0 0;
229
+ table-layout: fixed; /* prevents column jitter */
230
+ width: max-content;
231
+ max-width: 100%;
232
+ }}
233
+ .dlg-row td {{
234
+ vertical-align: top;
235
+ padding: 0;
236
+ }}
237
+ .dlg-num {{
238
+ width: 4ch; /* fixed gutter */
239
+ padding-right: 1ch;
240
+ text-align: right;
241
+ color: rgba(0,0,0,.55);
242
+ user-select: none;
243
+ font-family: {MONO_STACK};
244
+ font-variant-numeric: tabular-nums;
245
+ }}
246
+ .dlg-txt {{
247
+ white-space: pre-wrap;
248
+ word-break: break-word;
249
+ }}
250
+ .dlg-txt strong {{ font-weight: 700; }}
251
+ /* keep rendering consistent */
252
+ .dlg-root {{
253
+ -webkit-font-smoothing: antialiased;
254
+ -moz-osx-font-smoothing: grayscale;
255
+ font-synthesis: none;
256
+ }}
257
+ </style>
258
+ """, unsafe_allow_html=True)
259
+
260
  def display_numbered_dialogue(md_path: str,
261
  title: str = "Session Transcription",
262
  width_chars: int = 80,
263
+ max_height_px: int = 520,
264
+ font_family: str = TEXT_STACK,
265
+ font_size: str = "1rem",
266
+ show_border: bool = False,
267
+ background: str = "transparent"):
268
+ inject_dialogue_css_once()
269
+ text = _read_md_any(md_path)
 
270
  lines = _md_dialogue_to_lines(text, width=width_chars)
271
 
272
  block_id = f"dlg-{uuid.uuid4().hex[:8]}"
273
+ border_css = "1px solid #e6e6e6" if show_border else "none"
274
+ radius_css = ".6rem" if show_border else "0"
275
+ pad_css = ".8rem 1rem" if show_border else "0"
276
+
277
+ # container + table (no CSS grid)
278
+ rows = "\n".join(
279
+ f"<tr class='dlg-row'><td class='dlg-num'>{i}</td><td class='dlg-txt'>{ln or '&nbsp;'}</td></tr>"
280
  for i, ln in enumerate(lines, 1)
281
  )
282
 
283
  st.markdown(f"""
284
+ <div id="{block_id}" class="dlg-root"
285
+ style="border:{border_css}; border-radius:{radius_css}; padding:{pad_css};
286
+ background:{background}; max-height:{max_height_px}px; overflow-y:auto;">
287
+ <table class="dlg-table" style="font-family:{font_family}; font-size:{font_size};">
288
+ {rows}
289
+ </table>
290
+ </div>
 
 
 
 
 
 
 
 
 
 
 
291
  """, unsafe_allow_html=True)
292
 
 
 
 
293
 
294
  #################################### Streamlit App ####################################
295