#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, io, os
import gradio as gr
import pandas as pd

# Optional dependency: python-docx for reading .docx homework uploads.
# DOCX_OK gates that code path so the app still launches without it.
try:
    from docx import Document
    DOCX_OK = True
except Exception:
    DOCX_OK = False

# Optional dependency: Pillow + pytesseract for OCR on uploaded images.
# OCR_OK gates the OCR code path so the app still launches without them.
try:
    from PIL import Image, ImageOps
    import pytesseract
    OCR_OK = True
except Exception:
    OCR_OK = False
# ---------- helpers ----------
def _norm_name(s: str) -> str:
import re
return re.sub(r"[^a-z0-9]", "", str(s).lower())
def _pick_col(cols, *cands):
m = { _norm_name(c): c for c in cols }
for cand in cands:
for k, orig in m.items():
if cand in k:
return orig
return None
def _coerce_numeric(x):
if pd.isna(x): return x
if isinstance(x, (int,float)): return float(x)
s = str(x).replace(",","").replace("$","").strip()
try: return float(s)
except: return pd.NA
# Regexes that scrape problem parameters out of free text.
# The text is lowercased before matching (see _extract_params) and each
# pattern captures the numeric value in group(1).
PARAM_PATTERNS = {
    "cost": r"cost\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    "salvage": r"salvage\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    "life": r"(?:life|useful\s*life)\s*[:=]\s*([\d,]+)",
    "start_year": r"(?:start\s*year|start)\s*[:=]\s*([12]\d{3})",
}
def _extract_params(text: str):
    """Scan *text* for cost/salvage/life/start_year and return what's found.

    cost/salvage come back as float, life/start_year as int. Keys with no
    match are simply absent from the returned dict.
    """
    found = {}
    lowered = (text or "").lower()
    for name, pattern in PARAM_PATTERNS.items():
        match = re.search(pattern, lowered, flags=re.I)
        if not match:
            continue
        number = match.group(1).replace(",", "")
        if name in ("cost", "salvage"):
            found[name] = float(number)
        else:
            found[name] = int(float(number))
    return found
def _docx_to_table_and_text(fileobj) -> tuple[pd.DataFrame|None, str]:
    """Open a .docx file and return (depreciation table or None, full text).

    The table returned is the first one with >= 4 columns whose header row
    mentions "year". Returns (None, message/text) when python-docx is
    unavailable, the file can't be opened, or no plausible table exists.
    """
    if not DOCX_OK:
        return None, "(python-docx not available)"
    try:
        doc = Document(fileobj)
    except Exception as e:
        return None, f"[docx open failed] {e}"
    # collect paragraphs (for param scraping)
    all_text = "\n".join(p.text for p in doc.paragraphs)
    # try to find a depreciation table
    for t in doc.tables:
        rows = [[c.text.strip() for c in r.cells] for r in t.rows]
        if not rows:
            continue
        hdr = rows[0]
        if len(hdr) >= 4 and any("year" in _norm_name(h) for h in hdr):
            df = pd.DataFrame(rows[1:], columns=hdr)
            # drop rows whose cells are all empty after concatenation
            df = df[~(df.astype(str).apply(lambda r: "".join(r), axis=1).str.strip() == "")]
            if not df.empty:
                return df, all_text
    return None, all_text
def _image_to_text(img: Image.Image) -> str:
    """OCR a PIL image into plain text; returns a bracketed error string on failure."""
    if not OCR_OK:
        return "(pytesseract not available)"
    try:
        # Respect EXIF orientation, then grayscale to help Tesseract.
        img = ImageOps.exif_transpose(img)
        gray = ImageOps.grayscale(img)
        return pytesseract.image_to_string(gray)
    except Exception as e:
        return f"[ocr failed] {e}"
def _table_from_ocr_text(text: str) -> pd.DataFrame|None:
if not text or not text.strip():
return None
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
hdr_i = -1
for i, ln in enumerate(lines):
low = ln.lower()
if ("year" in low and "begin" in low and "dep" in low and "end" in low):
hdr_i = i
break
if hdr_i == -1:
for i, ln in enumerate(lines):
parts = re.split(r"\s{2,}|\t+", ln)
low = ln.lower()
if len([p for p in parts if p.strip()]) >= 4 and any(k in low for k in ["year","begin","dep","end"]):
hdr_i = i
break
if hdr_i == -1:
return None
header = [h.strip() for h in re.split(r"\s{2,}|\t+", lines[hdr_i]) if h.strip()]
data = []
for ln in lines[hdr_i+1:]:
parts = [p.strip() for p in re.split(r"\s{2,}|\t+", ln) if p.strip()]
if len(parts) == len(header):
data.append(parts)
else:
if len(data) >= 1:
break
if not data:
return None
return pd.DataFrame(data, columns=header)
def _normalize_depr_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    """Map arbitrary table headers onto the canonical schedule layout.

    Output columns: Year, Begin BV, Depreciation, Accum Dep, End BV.
    Values are coerced to numbers, and rows where all four money columns
    are missing (e.g. OCR noise lines) are dropped.
    """
    df = df_in.copy()
    out = pd.DataFrame()
    # Fuzzy header matching: candidates are normalized substrings, tried in order.
    c_year = _pick_col(df.columns, "year")
    c_beg = _pick_col(df.columns, "beginbv","beginningbv","beginbook","begin","beginningvalue")
    c_dep = _pick_col(df.columns, "depreciation","dep")
    c_acc = _pick_col(df.columns, "accumdep","accumulateddep","accum","accdep")
    c_end = _pick_col(df.columns, "endbv","endingbv","endbook","end","endingvalue")
    # NOTE(review): assignment order matters — the first Series assigned fixes
    # out's index/length; a missing source column leaves the slot as pd.NA.
    out["Year"] = df[c_year] if c_year else pd.NA
    out["Begin BV"] = df[c_beg] if c_beg else pd.NA
    out["Depreciation"] = df[c_dep] if c_dep else pd.NA
    out["Accum Dep"] = df[c_acc] if c_acc else pd.NA
    out["End BV"] = df[c_end] if c_end else pd.NA
    out["Year"] = pd.to_numeric(out["Year"], errors="coerce")
    for col in ["Begin BV","Depreciation","Accum Dep","End BV"]:
        out[col] = out[col].map(_coerce_numeric)
    # Keep only rows with at least one parsable money value.
    out = out[~out[["Begin BV","Depreciation","Accum Dep","End BV"]].isna().all(axis=1)].reset_index(drop=True)
    return out
# Monday Aug 11 New helpers
def build_sl_schedule(cost: float, salvage: float, life: int, start_year: int):
    """Build a straight-line depreciation schedule.

    Annual depreciation = (cost - salvage) / life; book value runs from
    *cost* down to (approximately) *salvage* over *life* years starting
    at *start_year*.

    Returns:
        DataFrame with columns Year, Begin BV, Depreciation, Accum Dep, End BV.

    Raises:
        ValueError: if *life* is not a positive number of years
            (previously this was a raw ZeroDivisionError, or a silently
            empty frame for negative life).
    """
    if life <= 0:
        raise ValueError("life must be a positive number of years")
    dep = (cost - salvage) / life
    years = [start_year + i for i in range(life)]
    begin_bv, dep_col, accum, end_bv = [], [], [], []
    b = cost
    acc = 0.0
    for _ in years:
        begin_bv.append(b)
        dep_col.append(dep)
        acc += dep
        accum.append(acc)
        b = b - dep
        end_bv.append(b)
    return pd.DataFrame(
        {
            "Year": years,
            "Begin BV": begin_bv,
            "Depreciation": dep_col,
            "Accum Dep": accum,
            "End BV": end_bv,
        }
    )
def audit_against_expected(expected: pd.DataFrame, actual: pd.DataFrame):
    """Compare a student's schedule to the expected one, year by year.

    Returns (deltas, message): *deltas* holds per-year (actual - expected)
    differences for each money column; *message* is coaching text naming
    the first mismatching year (tolerance 1e-6), if any.
    """
    if actual is None or actual.empty:
        return pd.DataFrame(), "No student table found to check."
    money_cols = ["Begin BV", "Depreciation", "Accum Dep", "End BV"]
    merged = expected.merge(
        actual[["Year"] + money_cols],
        on="Year", how="inner", suffixes=("_exp", "_act"),
    )
    if merged.empty:
        return pd.DataFrame(), "No matching years between expected and uploaded table."
    deltas = pd.DataFrame({"Year": merged["Year"]})
    for name in money_cols:
        deltas[name + " Δ"] = merged[f"{name}_act"] - merged[f"{name}_exp"]
    delta_cols = [c for c in deltas.columns if c.endswith("Δ")]
    first_bad = None
    for _, row in deltas.iterrows():
        if any(abs(row[c]) > 1e-6 for c in delta_cols):
            first_bad = int(row["Year"])
            break
    if first_bad is None:
        msg = "All good 🎉 Straight‑line matches your table."
    else:
        msg = f"First mismatch at year {first_bad}. Remember: Dep=(Cost−Salvage)/Life and Accum_t=Accum_(t−1)+Dep."
    return deltas, msg
# ---------- Gradio callbacks ----------
def _params_tuple(p):
p = p or {}
return (
float(p.get("cost", 0.0)),
float(p.get("salvage", 0.0)),
int(p.get("life", 10)),
int(p.get("start_year", pd.Timestamp.now().year)),
)
def handle_docx(file):
    """Gradio callback for the .docx tab.

    Returns, in output order: header text, params dict, normalized table,
    the four auto-fill numbers (cost/salvage/life/year), then the params
    and normalized table again for the two gr.State slots.
    """
    if file is None:
        return "(no file)", {}, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
    # gr.File may hand us a tempfile wrapper (with .name) or a plain path.
    df_raw, header = _docx_to_table_and_text(file.name if hasattr(file, "name") else file)
    params = _extract_params(header or "")
    df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else None
    cost, salv, life, year = _params_tuple(params)
    return (
        header or "(no text found)",
        params,
        (df_norm if df_norm is not None else pd.DataFrame()),
        cost, salv, life, year,
        params,
        (df_norm if df_norm is not None else pd.DataFrame()),
    )
#def handle_image(img):
# if img is None:
# return "(no image)", {}, None, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
# from PIL import Image as PILImage
# pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)
# ocr_text = _image_to_text(pil)
# params = _extract_params(ocr_text or "")
# df_raw = _table_from_ocr_text(ocr_text or "")
# df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else None
# cost, salv, life, year = _params_tuple(params)
# return (
# ocr_text or "(empty OCR)",
# params,
# df_raw,
# (df_norm if df_norm is not None else pd.DataFrame()),
# cost, salv, life, year,
# params,
# (df_norm if df_norm is not None else pd.DataFrame()),
# )
def handle_image(img):
    """Gradio callback for the image tab: OCR, scrape params, extract table.

    Returns, in output order: OCR text, params dict, raw table guess,
    normalized table, the four auto-fill numbers, then the params and the
    NORMALIZED table for the two gr.State slots (mirrors handle_docx).
    """
    if img is None:
        return "(no image)", {}, pd.DataFrame(), pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
    from PIL import Image as PILImage
    pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)
    ocr_text = _image_to_text(pil)
    params = _extract_params(ocr_text or "")
    df_raw = _table_from_ocr_text(ocr_text or "")
    df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else pd.DataFrame()
    cost, salv, life, year = _params_tuple(params)
    return (
        ocr_text or "(empty OCR)",
        params,
        # _table_from_ocr_text may return None; echo an empty frame instead
        (df_raw if df_raw is not None else pd.DataFrame()),
        df_norm,                 # normalized table shown in OCR tab
        cost, salv, life, year,  # auto-fill numbers
        params,                  # save params state
        # BUG FIX: store the NORMALIZED table in last_table (was df_raw),
        # matching handle_docx and the stated intent of the original comment.
        df_norm,
    )
def fill_from_state(p):
    """Return (cost, salvage, life, start_year) from a saved params dict.

    Missing keys fall back to 0.0 / 0.0 / 10 / the current year.
    """
    saved = p or {}
    cost = float(saved.get("cost", 0.0))
    salvage = float(saved.get("salvage", 0.0))
    life = int(saved.get("life", 10))
    year = int(saved.get("start_year", pd.Timestamp.now().year))
    return cost, salvage, life, year
def build_cb(cost, salv, life, year):
    """Button callback: build the expected SL schedule, or a one-row error table."""
    try:
        return build_sl_schedule(float(cost), float(salv), int(life), int(year))
    except Exception as e:
        # Surface the failure inside the Dataframe component instead of crashing the UI.
        return pd.DataFrame([{"error": str(e)}])
#def check_cb(cost, salv, life, year, table_state):
# # expected (numeric)
# exp = build_sl_schedule(float(cost), float(salv), int(life), int(year))
# exp["Year"] = pd.to_numeric(exp["Year"], errors="coerce")
# nothing to check?
# if not isinstance(table_state, pd.DataFrame) or table_state.empty:
# return pd.DataFrame(), "No student table found to check."
# 👇 Gradio returns strings → re-normalize and coerce here every time
# actual = _normalize_depr_columns(table_state)
# for c in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
# actual[c] = pd.to_numeric(actual[c], errors="coerce")
# actual = actual.dropna(subset=["Year"]).reset_index(drop=True)
# deltas, msg = audit_against_expected(exp, actual)
# return deltas, msg
def check_cb(cost, salv, life, year, table_state):
    """Button callback: rebuild the expected schedule and diff the stored table.

    Accepts a pd.DataFrame or the list-of-rows form Gradio sometimes hands
    back for state. Everything is re-normalized and coerced to numbers here,
    because Dataframe round-trips can turn cells into strings.
    """
    exp = build_sl_schedule(float(cost), float(salv), int(life), int(year))
    exp["Year"] = pd.to_numeric(exp["Year"], errors="coerce")
    student = table_state
    if isinstance(student, list):
        # best-effort columns; normalized just below
        student = pd.DataFrame(student)
    elif not isinstance(student, pd.DataFrame):
        return pd.DataFrame(), "No student table found to check."
    student = _normalize_depr_columns(student)
    for col in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
        student[col] = pd.to_numeric(student[col], errors="coerce")
    student = student.dropna(subset=["Year"]).reset_index(drop=True)
    return audit_against_expected(exp, student)
# --- Debug utilities ---
def debug_dump(ocr_text, params, raw_tbl, norm_tbl, last_tbl, image):
    """Debug-tab callback: summarize the OCR state and echo it back.

    Returns (markdown report, raw df, norm df, last-table df, image);
    any non-DataFrame table input is echoed as an empty DataFrame so the
    Dataframe components always receive something renderable.
    """
    # (removed a redundant local `import pandas as pd, io` — pd is already
    # imported at module level and io was never used here)
    def df_summary(name, df):
        # One-line shape/columns summary plus a 5-row preview, or a type note.
        if isinstance(df, pd.DataFrame) and not df.empty:
            head = df.head(5).to_string(index=False)
            return f"**{name}**: shape={df.shape}, cols={list(df.columns)}\n```\n{head}\n```"
        return f"**{name}**: {type(df).__name__} (empty or not a DataFrame)"
    lines = []
    lines.append(f"**OCR text length**: {len(ocr_text or '')}")
    lines.append(f"**Params keys**: {sorted(list((params or {}).keys()))}")
    lines.append(df_summary("raw_df (Tab 2)", raw_tbl))
    lines.append(df_summary("norm_df (Tab 2)", norm_tbl))
    lines.append(df_summary("last_table (State)", last_tbl))
    report = "\n\n".join(lines)
    def ensure_df(x):
        return x if isinstance(x, pd.DataFrame) else pd.DataFrame()
    return (
        report,
        ensure_df(raw_tbl),
        ensure_df(norm_tbl),
        ensure_df(last_tbl),
        image,  # echo the image for visual confirmation
    )
# ---------- UI ----------
with gr.Blocks(title="Jerry • HW Intake (Echo)") as demo:
    # Cross-tab state: last detected params dict and last normalized table.
    last_params = gr.State({})
    last_table = gr.State(pd.DataFrame())
    gr.Markdown("## Jerry (TA) – Homework Intake\nThis Space **only reads and echoes** your files.\nNext step will add solving & coaching.")
    # --- Tab 1: DOCX ---
    with gr.Tab("Upload .docx"):
        docx_in = gr.File(file_types=[".docx"], label="Homework .docx")
        btn1 = gr.Button("Read")
        header_txt = gr.Textbox(label="Header/Text (for params)", lines=8)
        params_json = gr.JSON(label="Detected parameters")
        table_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)
    # --- Tab 2: Image ---
    with gr.Tab("Upload Image (.png/.jpg)"):
        img_in = gr.Image(type="pil", label="Photo or screenshot of your table")
        btn2 = gr.Button("OCR")
        ocr_txt = gr.Textbox(label="Raw OCR text", lines=12)
        params_json2 = gr.JSON(label="Detected parameters")
        raw_df = gr.Dataframe(label="Raw table guess", interactive=False)
        norm_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)
    # --- Tab 3: Solve & Check ---
    with gr.Tab("Straight-Line • Solve & Check"):
        gr.Markdown("Enter params (auto-filled if detected) → build the correct SL schedule → compare to your uploaded table.")
        with gr.Row():
            in_cost = gr.Number(label="Cost", value=0.0)
            in_salv = gr.Number(label="Salvage", value=0.0)
            in_life = gr.Number(label="Life (years)", value=10, precision=0)
            in_year = gr.Number(label="Start year", value=2025, precision=0)
        btn_use = gr.Button("Use detected params")
        btn_build = gr.Button("Build expected schedule")
        expected_df = gr.Dataframe(label="Expected (SL) schedule", interactive=False)
        btn_check = gr.Button("Check against uploaded table")
        deltas_df = gr.Dataframe(label="Differences (student − expected)", interactive=False)
        coach_txt = gr.Markdown()
    with gr.Tab("Debug"):
        dbg_btn = gr.Button("Dump OCR state")
        dbg_md = gr.Markdown()
        dbg_raw = gr.Dataframe(label="raw_df echo", interactive=False)
        dbg_norm = gr.Dataframe(label="norm_df echo", interactive=False)
        dbg_last = gr.Dataframe(label="last_table (State) echo", interactive=False)
        dbg_img = gr.Image(label="Image echo")
    # ---------- Wire events AFTER all components exist ----------
    btn1.click(
        handle_docx,
        inputs=docx_in,
        outputs=[
            header_txt,   # text
            params_json,  # json
            table_df,     # normalized table (tab 1)
            in_cost, in_salv, in_life, in_year,  # autofill inputs
            last_params,  # state
            last_table,   # state
        ],
    )
    btn2.click(
        handle_image,
        inputs=img_in,
        outputs=[
            ocr_txt,       # raw OCR text
            params_json2,  # json
            raw_df,        # raw table
            norm_df,       # normalized table (tab 2)
            in_cost, in_salv, in_life, in_year,  # autofill inputs
            last_params,   # state
            last_table,    # state
        ],
    )
    # BUG FIX: "Use detected params" was never wired and fill_from_state was
    # dead code — push the saved params into the four number inputs.
    btn_use.click(fill_from_state, inputs=[last_params], outputs=[in_cost, in_salv, in_life, in_year])
    btn_build.click(build_cb, [in_cost, in_salv, in_life, in_year], [expected_df])
    btn_check.click(check_cb, [in_cost, in_salv, in_life, in_year, last_table], [deltas_df, coach_txt])
    dbg_btn.click(
        debug_dump,
        # inputs come from the components/state already populated by handle_image
        inputs=[ocr_txt, params_json2, raw_df, norm_df, last_table, img_in],
        outputs=[dbg_md, dbg_raw, dbg_norm, dbg_last, dbg_img],
    )
    gr.Markdown("— Echo mode finished. When this looks good, we’ll plug in the SL solver + coaching.")

if __name__ == "__main__":
    demo.launch()
|