sharick008 committed on
Commit
e7e92c8
·
verified ·
1 Parent(s): f11c2aa

ui: warn when a record's table has cleaning artefacts (non-numeric cells, duplicate columns)

Browse files
Files changed (1) hide show
  1. app.py +22 -4
app.py CHANGED
@@ -127,16 +127,34 @@ def render_document(pre_text, table, post_text):
127
  )
128
 
129
 
130
- def render_document_markdown(doc):
131
  """Markdown view for the UI. Cleaner than the XML wrapping the model sees."""
 
 
132
  table, cols = doc["table"], list(doc["table"])
133
  rows = list(dict.fromkeys(r for col in table.values() for r in col))
134
  md = ["| | " + " | ".join(cols) + " |", "|---" * (len(cols) + 1) + "|"]
135
  for row in rows:
136
  cells = [str(table[c].get(row, "")) for c in cols]
137
  md.append(f"| {row} | " + " | ".join(cells) + " |")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  return (
139
- "**Pre-text:** \n" + doc["pre_text"]
 
140
  + "\n\n**Table:** \n" + "\n".join(md)
141
  + "\n\n**Post-text:** \n" + doc["post_text"]
142
  )
@@ -260,7 +278,7 @@ def replay_record(record_id, progress=gr.Progress()):
260
  if not record_id:
261
  return "Pick a record from the dropdown.", None, "—"
262
  record = DEV_RECORDS[record_id]
263
- doc_md = render_document_markdown(record["doc"])
264
  sys_msg = {
265
  "role": "system",
266
  "content": SYSTEM_PROMPT_PREFIX + render_document(
@@ -294,7 +312,7 @@ def chat_load_doc(record_id):
294
  if not record_id:
295
  return "Pick a record from the dropdown.", []
296
  record = DEV_RECORDS[record_id]
297
- return render_document_markdown(record["doc"]), []
298
 
299
 
300
  def chat_respond(user_msg, history, record_id):
 
127
  )
128
 
129
 
130
def render_document_markdown(record):
    """Markdown view for the UI. Cleaner than the XML wrapping the model sees.

    Args:
        record: Mapping with a ``"doc"`` entry (holding ``"pre_text"``,
            ``"table"`` and ``"post_text"``) and an optional ``"features"``
            mapping. ``"table"`` maps column name -> {row label -> cell value}.

    Returns:
        A Markdown string: an optional italic note about cleaning artefacts,
        then the pre-text, the table rendered as a pipe table, and the
        post-text.
    """
    doc = record["doc"]
    # `features` may be absent or explicitly null; treat both as "no flags".
    feats = record.get("features") or {}
    table = doc["table"]
    cols = list(table)
    # Union of row labels across all columns, keeping first-seen order.
    rows = list(dict.fromkeys(r for col in table.values() for r in col))

    def _cell(value):
        # A raw "|" or line break inside a cell would split the Markdown row
        # and corrupt the table. Escaping the pipe keeps the displayed text
        # unchanged; line breaks are collapsed to a space because pipe-table
        # rows must stay on a single line.
        text = str(value).replace("|", "\\|")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    md = [
        "| | " + " | ".join(_cell(c) for c in cols) + " |",
        "|---" * (len(cols) + 1) + "|",
    ]
    for row in rows:
        # Columns that lack this row label render as an empty cell.
        cells = [_cell(table[c].get(row, "")) for c in cols]
        md.append(f"| {_cell(row)} | " + " | ".join(cells) + " |")

    notes = []
    if feats.get("has_non_numeric_values"):
        notes.append(
            "non-numeric cells (the upstream cleaner sometimes folds first-row "
            "values into the column-name string)"
        )
    if feats.get("has_duplicate_columns"):
        notes.append("duplicate column headers not fully disambiguated by cleaning")

    warning = ""
    if notes:
        warning = (
            "_Note: this record's `features` flags it as having "
            + " and ".join(notes)
            + ". The table below is exactly what the model sees, artefacts and all._\n\n"
        )
    return (
        warning
        + "**Pre-text:** \n" + doc["pre_text"]
        + "\n\n**Table:** \n" + "\n".join(md)
        + "\n\n**Post-text:** \n" + doc["post_text"]
    )
 
278
  if not record_id:
279
  return "Pick a record from the dropdown.", None, "—"
280
  record = DEV_RECORDS[record_id]
281
+ doc_md = render_document_markdown(record)
282
  sys_msg = {
283
  "role": "system",
284
  "content": SYSTEM_PROMPT_PREFIX + render_document(
 
312
  if not record_id:
313
  return "Pick a record from the dropdown.", []
314
  record = DEV_RECORDS[record_id]
315
+ return render_document_markdown(record), []
316
 
317
 
318
  def chat_respond(user_msg, history, record_id):