Spaces:
Sleeping
Sleeping
ui: warn when a record's table has cleaning artefacts (non-numeric cells, duplicate columns)
Browse files
app.py
CHANGED
|
@@ -127,16 +127,34 @@ def render_document(pre_text, table, post_text):
|
|
| 127 |
)
|
| 128 |
|
| 129 |
|
| 130 |
-
def render_document_markdown(
|
| 131 |
"""Markdown view for the UI. Cleaner than the XML wrapping the model sees."""
|
|
|
|
|
|
|
| 132 |
table, cols = doc["table"], list(doc["table"])
|
| 133 |
rows = list(dict.fromkeys(r for col in table.values() for r in col))
|
| 134 |
md = ["| | " + " | ".join(cols) + " |", "|---" * (len(cols) + 1) + "|"]
|
| 135 |
for row in rows:
|
| 136 |
cells = [str(table[c].get(row, "")) for c in cols]
|
| 137 |
md.append(f"| {row} | " + " | ".join(cells) + " |")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return (
|
| 139 |
-
|
|
|
|
| 140 |
+ "\n\n**Table:** \n" + "\n".join(md)
|
| 141 |
+ "\n\n**Post-text:** \n" + doc["post_text"]
|
| 142 |
)
|
|
@@ -260,7 +278,7 @@ def replay_record(record_id, progress=gr.Progress()):
|
|
| 260 |
if not record_id:
|
| 261 |
return "Pick a record from the dropdown.", None, "—"
|
| 262 |
record = DEV_RECORDS[record_id]
|
| 263 |
-
doc_md = render_document_markdown(record
|
| 264 |
sys_msg = {
|
| 265 |
"role": "system",
|
| 266 |
"content": SYSTEM_PROMPT_PREFIX + render_document(
|
|
@@ -294,7 +312,7 @@ def chat_load_doc(record_id):
|
|
| 294 |
if not record_id:
|
| 295 |
return "Pick a record from the dropdown.", []
|
| 296 |
record = DEV_RECORDS[record_id]
|
| 297 |
-
return render_document_markdown(record
|
| 298 |
|
| 299 |
|
| 300 |
def chat_respond(user_msg, history, record_id):
|
|
|
|
| 127 |
)
|
| 128 |
|
| 129 |
|
| 130 |
+
def _md_cell(value):
    """Return *value* as text that is safe inside a single Markdown table cell."""
    # A raw "|" would open a new cell and a line break would end the row, so
    # escape the former and flatten the latter.
    text = str(value).replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ")


def render_document_markdown(record):
    """Markdown view for the UI. Cleaner than the XML wrapping the model sees.

    Args:
        record: Mapping with a ``"doc"`` entry holding ``"pre_text"``,
            ``"post_text"`` and ``"table"`` (a mapping of column name to a
            mapping of row label to cell value). An optional ``"features"``
            mapping may carry the ``has_non_numeric_values`` and
            ``has_duplicate_columns`` flags.

    Returns:
        A Markdown string: an optional italic note listing the flagged
        cleaning artefacts, followed by the pre-text, the table and the
        post-text.

    Raises:
        KeyError: If ``record`` has no ``"doc"`` entry, or the doc lacks
            ``"pre_text"``, ``"table"`` or ``"post_text"``.
    """
    doc = record["doc"]
    # ``or {}`` also covers a record whose "features" key is present but None.
    feats = record.get("features") or {}
    table = doc["table"]
    cols = list(table)
    # Union of the row labels across all columns, in first-seen order.
    rows = list(dict.fromkeys(r for col in table.values() for r in col))

    md = [
        "| | " + " | ".join(_md_cell(c) for c in cols) + " |",
        "|---" * (len(cols) + 1) + "|",
    ]
    for row in rows:
        # A column that lacks this row label renders as an empty cell.
        cells = [_md_cell(table[c].get(row, "")) for c in cols]
        md.append(f"| {_md_cell(row)} | " + " | ".join(cells) + " |")

    notes = []
    if feats.get("has_non_numeric_values"):
        notes.append(
            "non-numeric cells (the upstream cleaner sometimes folds first-row "
            "values into the column-name string)"
        )
    if feats.get("has_duplicate_columns"):
        notes.append("duplicate column headers not fully disambiguated by cleaning")

    warning = ""
    if notes:
        warning = (
            "_Note: this record's `features` flags it as having "
            + " and ".join(notes)
            + ". The table below is exactly what the model sees, artefacts and all._\n\n"
        )

    return (
        warning
        + "**Pre-text:** \n" + doc["pre_text"]
        + "\n\n**Table:** \n" + "\n".join(md)
        + "\n\n**Post-text:** \n" + doc["post_text"]
    )
|
|
|
|
| 278 |
if not record_id:
|
| 279 |
return "Pick a record from the dropdown.", None, "—"
|
| 280 |
record = DEV_RECORDS[record_id]
|
| 281 |
+
doc_md = render_document_markdown(record)
|
| 282 |
sys_msg = {
|
| 283 |
"role": "system",
|
| 284 |
"content": SYSTEM_PROMPT_PREFIX + render_document(
|
|
|
|
| 312 |
if not record_id:
|
| 313 |
return "Pick a record from the dropdown.", []
|
| 314 |
record = DEV_RECORDS[record_id]
|
| 315 |
+
return render_document_markdown(record), []
|
| 316 |
|
| 317 |
|
| 318 |
def chat_respond(user_msg, history, record_id):
|