# Page-scrape header (not part of the program): jeffrey1963's picture
# Update app.py
# 1e023d5 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, io, os
import gradio as gr
import pandas as pd
# docx
try:
from docx import Document
DOCX_OK = True
except Exception:
DOCX_OK = False
# ocr
try:
from PIL import Image, ImageOps
import pytesseract
OCR_OK = True
except Exception:
OCR_OK = False
# ---------- helpers ----------
def _norm_name(s: str) -> str:
    """Return *s* lower-cased with every character outside a-z / 0-9 removed.

    The argument is passed through ``str()`` first, so non-string column
    labels (e.g. integers) are accepted as well.
    """
    lowered = str(s).lower()
    return re.sub(r"[^a-z0-9]", "", lowered)
def _pick_col(cols, *cands):
    """Return the first column label in *cols* that matches a candidate.

    Column labels are normalized with ``_norm_name`` before comparison;
    candidates are expected to be given already normalized (lower-case,
    alphanumeric only). Candidates are tried in the order given, so earlier
    candidates take priority over later ones.

    For each candidate the best-fitting column wins: an exact match first,
    then a column whose normalized name starts with the candidate, and only
    then any column that merely contains it. Plain substring matching alone
    would let ``"dep"`` select "Accum Dep" ahead of an exact "Dep" column.

    Returns the original (un-normalized) label, or ``None`` if nothing matches.
    """
    by_key = {_norm_name(c): c for c in cols}
    for cand in cands:
        # 1) exact normalized match
        if cand in by_key:
            return by_key[cand]
        # 2) normalized name begins with the candidate
        for key, orig in by_key.items():
            if key.startswith(cand):
                return orig
        # 3) candidate appears anywhere in the normalized name
        for key, orig in by_key.items():
            if cand in key:
                return orig
    return None
def _coerce_numeric(x):
    """Best-effort conversion of a single table cell to ``float``.

    * Missing values (anything ``pd.isna`` reports as missing) are returned
      unchanged.
    * ``int`` / ``float`` inputs are returned as ``float``.
    * Anything else is stringified, stripped of ``,`` and ``$`` and
      surrounding whitespace, and parsed with ``float()``.

    Returns ``pd.NA`` when the cleaned text is not a valid number.
    """
    if pd.isna(x):
        return x
    if isinstance(x, (int, float)):
        return float(x)
    cleaned = str(x).replace(",", "").replace("$", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        # Only a parse failure means "not a number"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return pd.NA
# Regexes used by _extract_params to scrape schedule parameters from text.
# Every pattern expects "<label> : <value>" or "<label> = <value>" and
# captures the value in group 1.
PARAM_PATTERNS = {
    # optional "$", digits/commas, optional decimal part
    "cost": r"cost\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    # same value shape as "cost"
    "salvage": r"salvage\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    # "life" or "useful life", digits/commas only
    "life": r"(?:life|useful\s*life)\s*[:=]\s*([\d,]+)",
    # "start year" or "start", a four-digit year beginning with 1 or 2
    "start_year": r"(?:start\s*year|start)\s*[:=]\s*([12]\d{3})",
}
def _extract_params(text: str):
    """Scrape schedule parameters out of free text.

    Each regex in ``PARAM_PATTERNS`` is searched case-insensitively against
    *text* (``None`` is treated as empty). For every pattern that matches,
    group 1 is stripped of commas and converted: ``cost`` and ``salvage``
    become ``float``; every other key becomes ``int``.

    Returns a dict containing only the keys that were found and could be
    converted.
    """
    vals = {}
    low = (text or "").lower()
    for key, pat in PARAM_PATTERNS.items():
        m = re.search(pat, low, flags=re.I)
        if not m:
            continue
        raw = m.group(1).replace(",", "")
        try:
            number = float(raw)
            vals[key] = number if key in ("cost", "salvage") else int(number)
        except (ValueError, OverflowError):
            # The value patterns accept a run made only of commas (raw == ""),
            # and an absurdly long digit run overflows int(); skip such noise
            # instead of letting the exception escape into the UI callback.
            continue
    return vals
def _docx_to_table_and_text(fileobj) -> tuple[pd.DataFrame|None, str]:
    """Read a .docx and return ``(table, text)``.

    *text* is all paragraph text joined with newlines (used later for
    parameter scraping). *table* is the first table whose header row has at
    least four cells, at least one of which contains "year" after
    normalization, and which has at least one non-blank body row; otherwise
    ``None``.

    When python-docx is unavailable or the document cannot be opened, the
    table is ``None`` and the text slot carries a short diagnostic string.
    """
    if not DOCX_OK:
        return None, "(python-docx not available)"
    try:
        doc = Document(fileobj)
    except Exception as e:
        return None, f"[docx open failed] {e}"

    # collect paragraphs (for param scraping)
    all_text = "\n".join(p.text for p in doc.paragraphs)

    # try to find a depreciation table
    for t in doc.tables:
        rows = [[c.text.strip() for c in r.cells] for r in t.rows]
        if not rows:
            continue
        hdr = rows[0]
        if len(hdr) < 4 or not any("year" in _norm_name(h) for h in hdr):
            continue
        # Cells are already stripped, so a row is blank iff every cell is "".
        # Filtering here (rather than on the DataFrame) also means a
        # header-only table never reaches DataFrame construction.
        body = [r for r in rows[1:] if any(r)]
        if not body:
            continue
        try:
            df = pd.DataFrame(body, columns=hdr)
        except ValueError:
            # Row width does not fit the header (ragged table): skip this
            # table and keep looking instead of crashing the callback.
            continue
        return df, all_text
    return None, all_text
def _image_to_text(img: "Image.Image") -> str:
    """Run OCR on *img* and return the recognized text.

    The image is EXIF-transposed and converted to grayscale before being
    handed to ``pytesseract.image_to_string``. If the OCR imports failed
    (``OCR_OK`` is false) or OCR raises, a short diagnostic string is
    returned instead of raising.

    The parameter annotation is deliberately a string: a bare
    ``Image.Image`` is evaluated when the ``def`` executes and would raise
    ``NameError`` at import time whenever the guarded PIL import failed,
    defeating the ``OCR_OK`` fallback.
    """
    if not OCR_OK:
        return "(pytesseract not available)"
    try:
        img = ImageOps.exif_transpose(img)
        gray = ImageOps.grayscale(img)
        return pytesseract.image_to_string(gray)
    except Exception as e:
        # Best-effort: surface the failure as text in the OCR textbox.
        return f"[ocr failed] {e}"
def _table_from_ocr_text(text: str) -> pd.DataFrame|None:
if not text or not text.strip():
return None
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
hdr_i = -1
for i, ln in enumerate(lines):
low = ln.lower()
if ("year" in low and "begin" in low and "dep" in low and "end" in low):
hdr_i = i
break
if hdr_i == -1:
for i, ln in enumerate(lines):
parts = re.split(r"\s{2,}|\t+", ln)
low = ln.lower()
if len([p for p in parts if p.strip()]) >= 4 and any(k in low for k in ["year","begin","dep","end"]):
hdr_i = i
break
if hdr_i == -1:
return None
header = [h.strip() for h in re.split(r"\s{2,}|\t+", lines[hdr_i]) if h.strip()]
data = []
for ln in lines[hdr_i+1:]:
parts = [p.strip() for p in re.split(r"\s{2,}|\t+", ln) if p.strip()]
if len(parts) == len(header):
data.append(parts)
else:
if len(data) >= 1:
break
if not data:
return None
return pd.DataFrame(data, columns=header)
def _normalize_depr_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    """Map an arbitrary depreciation table onto the canonical five columns.

    The result always has the columns ``Year``, ``Begin BV``,
    ``Depreciation``, ``Accum Dep`` and ``End BV`` (in that order). Source
    columns are located with ``_pick_col``; a role with no matching source
    column is filled with ``pd.NA``. ``Year`` is converted with
    ``pd.to_numeric(errors="coerce")``, the four value columns with
    ``_coerce_numeric``. Rows whose four value columns are all missing are
    dropped and the index is reset. *df_in* is not modified.
    """
    unclaimed = list(df_in.columns)

    def claim(*cands):
        # Pick a column for one role and take it out of the pool, so a single
        # source column can never fill two roles.
        col = _pick_col(unclaimed, *cands)
        if col is not None:
            unclaimed[:] = [c for c in unclaimed if c != col]
        return col

    # Order matters: "Accum Dep" is claimed before "Depreciation" because the
    # short candidate "dep" is a substring of accumulated-depreciation headers.
    picks = {
        "Year": claim("year"),
        "Accum Dep": claim("accumdep", "accumulateddep", "accum", "accdep"),
        "Begin BV": claim("beginbv", "beginningbv", "beginbook", "begin", "beginningvalue"),
        "End BV": claim("endbv", "endingbv", "endbook", "end", "endingvalue"),
        "Depreciation": claim("depreciation", "dep"),
    }

    value_cols = ["Begin BV", "Depreciation", "Accum Dep", "End BV"]
    # Start from the source index so scalar NA fills get one entry per row.
    out = pd.DataFrame(index=df_in.index)
    for name in ["Year"] + value_cols:
        src = picks[name]
        if src is None:
            out[name] = pd.NA
            continue
        values = df_in[src]
        if isinstance(values, pd.DataFrame):
            # Duplicate header labels select several columns; use the first.
            values = values.iloc[:, 0]
        out[name] = values

    out["Year"] = pd.to_numeric(out["Year"], errors="coerce")
    for col in value_cols:
        out[col] = out[col].map(_coerce_numeric)

    has_any_value = ~out[value_cols].isna().all(axis=1)
    return out[has_any_value].reset_index(drop=True)
# Monday Aug 11 New helpers
def build_sl_schedule(cost: float, salvage: float, life: int, start_year: int):
    """Build a straight-line depreciation schedule.

    Annual depreciation is ``(cost - salvage) / life``. One row is produced
    per year, starting at *start_year*, with columns ``Year``, ``Begin BV``,
    ``Depreciation``, ``Accum Dep`` and ``End BV``. Book value and
    accumulated depreciation are carried forward row by row.

    Raises ``ZeroDivisionError`` when *life* is 0.
    """
    annual = (cost - salvage) / life

    rows = []
    book = cost
    total = 0.0
    for offset in range(life):
        total += annual
        closing = book - annual
        rows.append((start_year + offset, book, annual, total, closing))
        book = closing

    names = ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]
    return pd.DataFrame(
        {name: [row[pos] for row in rows] for pos, name in enumerate(names)}
    )
def audit_against_expected(expected: pd.DataFrame, actual: pd.DataFrame, tol: float = 1e-6):
    """Compare a student's schedule with the expected one, year by year.

    Both frames must carry the columns ``Year``, ``Begin BV``,
    ``Depreciation``, ``Accum Dep`` and ``End BV``. They are inner-joined on
    ``Year`` and, for each value column, ``actual - expected`` is reported in
    a column named ``"<col> Δ"``.

    Args:
        expected: the reference schedule.
        actual: the uploaded (normalized) schedule; ``None`` or empty means
            there is nothing to check.
        tol: absolute difference above which a value counts as a mismatch.

    Returns:
        ``(deltas, message)``. ``deltas`` is empty when there is nothing to
        compare. The message names the first year in which any value differs
        by more than *tol* **or is missing/unreadable** - a NaN difference
        cannot confirm a match, so it must not be reported as "All good".
    """
    if actual is None or actual.empty:
        return pd.DataFrame(), "No student table found to check."

    value_cols = ["Begin BV", "Depreciation", "Accum Dep", "End BV"]
    merged = expected.merge(
        actual[["Year"] + value_cols],
        on="Year", how="inner", suffixes=("_exp", "_act")
    )
    if merged.empty:
        return pd.DataFrame(), "No matching years between expected and uploaded table."

    deltas = pd.DataFrame({"Year": merged["Year"]})
    for c in value_cols:
        # Coerce so object columns holding pd.NA / strings subtract cleanly.
        act = pd.to_numeric(merged[f"{c}_act"], errors="coerce")
        exp = pd.to_numeric(merged[f"{c}_exp"], errors="coerce")
        deltas[c + " Δ"] = act - exp

    diffs = deltas.drop(columns="Year")
    bad_rows = ((diffs.abs() > tol) | diffs.isna()).any(axis=1)
    first_bad = int(deltas.loc[bad_rows, "Year"].iloc[0]) if bad_rows.any() else None

    msg = (
        "All good 🎉 Straight‑line matches your table."
        if first_bad is None
        else f"First mismatch at year {first_bad}. Remember: Dep=(Cost−Salvage)/Life and Accum_t=Accum_(t−1)+Dep."
    )
    return deltas, msg
# ---------- Gradio callbacks ----------
def _params_tuple(p):
    """Unpack a detected-params dict into ``(cost, salvage, life, start_year)``.

    Missing keys fall back to 0.0, 0.0, 10 and the current calendar year.
    A falsy *p* (``None`` or an empty dict) yields all defaults.
    """
    detected = p if p else {}
    cost = float(detected.get("cost", 0.0))
    salvage = float(detected.get("salvage", 0.0))
    life = int(detected.get("life", 10))
    start_year = int(detected.get("start_year", pd.Timestamp.now().year))
    return cost, salvage, life, start_year
def handle_docx(file):
    """Gradio callback for the "Upload .docx" tab.

    Returns nine values, in the order the click handler's outputs are wired:
    header text, detected params, normalized table, cost, salvage, life,
    start year, params (for state) and normalized table (for state).
    With no file, placeholder/default values are returned.
    """
    if file is None:
        return "(no file)", {}, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()

    # Uploads may arrive as an object exposing .name or as a plain value.
    source = file.name if hasattr(file, "name") else file
    raw_table, text = _docx_to_table_and_text(source)
    detected = _extract_params(text or "")

    if raw_table is None:
        table = pd.DataFrame()
    else:
        table = _normalize_depr_columns(raw_table)

    cost, salv, life, year = _params_tuple(detected)
    shown_text = text or "(no text found)"
    return shown_text, detected, table, cost, salv, life, year, detected, table
#def handle_image(img):
# if img is None:
# return "(no image)", {}, None, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
# from PIL import Image as PILImage
# pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)
# ocr_text = _image_to_text(pil)
# params = _extract_params(ocr_text or "")
# df_raw = _table_from_ocr_text(ocr_text or "")
# df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else None
# cost, salv, life, year = _params_tuple(params)
# return (
# ocr_text or "(empty OCR)",
# params,
# df_raw,
# (df_norm if df_norm is not None else pd.DataFrame()),
# cost, salv, life, year,
# params,
# (df_norm if df_norm is not None else pd.DataFrame()),
# )
def handle_image(img):
    """Gradio callback for the "Upload Image" tab.

    Returns ten values, in the order the click handler's outputs are wired:
    OCR text, detected params, raw table guess, normalized table, cost,
    salvage, life, start year, params (for state) and the table saved to
    ``last_table`` state.

    The state receives the *normalized* table, matching ``handle_docx``, and
    both table outputs are always DataFrames (empty when no table was
    detected) rather than ``None``.
    """
    if img is None:
        return "(no image)", {}, pd.DataFrame(), pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()

    from PIL import Image as PILImage
    # Accept either a PIL image or an array-like that PIL can wrap.
    pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)

    ocr_text = _image_to_text(pil)
    params = _extract_params(ocr_text or "")
    df_raw = _table_from_ocr_text(ocr_text or "")
    if df_raw is None:
        df_raw = pd.DataFrame()
        df_norm = pd.DataFrame()
    else:
        df_norm = _normalize_depr_columns(df_raw)

    cost, salv, life, year = _params_tuple(params)
    return (
        ocr_text or "(empty OCR)",
        params,
        df_raw,                  # raw table shown in OCR tab
        df_norm,                 # normalized table shown in OCR tab
        cost, salv, life, year,  # auto-fill numbers
        params,                  # save params state
        df_norm,                 # save normalized table to last_table (same as docx)
    )
def fill_from_state(p):
    """Return ``(cost, salvage, life, start_year)`` from the saved params state.

    Delegates to ``_params_tuple`` so the defaults used when filling the
    number inputs are defined in a single place.
    """
    return _params_tuple(p)
def build_cb(cost, salv, life, year):
    """Gradio callback: build the expected straight-line schedule.

    The inputs are cast to ``float``/``int`` and passed to
    ``build_sl_schedule``. Any exception is reported as a one-row table with
    a single ``error`` column instead of propagating to the UI.
    """
    try:
        return build_sl_schedule(float(cost), float(salv), int(life), int(year))
    except Exception as e:
        return pd.DataFrame([{"error": str(e)}])
#def check_cb(cost, salv, life, year, table_state):
# # expected (numeric)
# exp = build_sl_schedule(float(cost), float(salv), int(life), int(year))
# exp["Year"] = pd.to_numeric(exp["Year"], errors="coerce")
# nothing to check?
# if not isinstance(table_state, pd.DataFrame) or table_state.empty:
# return pd.DataFrame(), "No student table found to check."
# 👇 Gradio returns strings → re-normalize and coerce here every time
# actual = _normalize_depr_columns(table_state)
# for c in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
# actual[c] = pd.to_numeric(actual[c], errors="coerce")
# actual = actual.dropna(subset=["Year"]).reset_index(drop=True)
# deltas, msg = audit_against_expected(exp, actual)
# return deltas, msg
def check_cb(cost, salv, life, year, table_state):
    """Gradio callback: compare the uploaded table with the expected schedule.

    Returns ``(deltas, message)`` for the differences table and the coaching
    text. Invalid parameters (a cleared number field arrives as ``None``,
    non-numeric text, or a life that is not positive) produce an explanatory
    message instead of an exception escaping the callback.

    *table_state* may be a DataFrame or a list (of lists/dicts); anything
    else means no table has been uploaded yet.
    """
    try:
        cost_f, salv_f = float(cost), float(salv)
        life_i, year_i = int(life), int(year)
    except (TypeError, ValueError, OverflowError) as e:
        return pd.DataFrame(), f"Could not build expected schedule: {e}"
    if life_i <= 0:
        return pd.DataFrame(), "Could not build expected schedule: life must be a positive number of years."

    # Accept pd.DataFrame OR list-of-lists/dicts from Gradio
    actual = table_state
    if isinstance(actual, list):
        # best effort columns; normalized below
        actual = pd.DataFrame(actual)
    elif not isinstance(actual, pd.DataFrame):
        return pd.DataFrame(), "No student table found to check."

    exp = build_sl_schedule(cost_f, salv_f, life_i, year_i)

    # normalize columns & numeric coercion
    actual = _normalize_depr_columns(actual)
    for c in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
        actual[c] = pd.to_numeric(actual[c], errors="coerce")
    actual = actual.dropna(subset=["Year"]).reset_index(drop=True)

    deltas, msg = audit_against_expected(exp, actual)
    return deltas, msg
# --- Debug utilities ---
def debug_dump(ocr_text, params, raw_tbl, norm_tbl, last_tbl, image):
    """Summarize the current OCR-tab state for the Debug tab.

    Returns ``(report, raw, norm, last, image)``: a Markdown report with the
    OCR text length, the sorted parameter keys and a short preview of each
    table, followed by echoes of the three tables (an empty DataFrame
    replaces anything that is not a DataFrame) and the image unchanged.
    """

    def describe(label, table):
        # One Markdown section: shape/columns plus the first five rows.
        if not isinstance(table, pd.DataFrame) or table.empty:
            return f"**{label}**: {type(table).__name__} (empty or not a DataFrame)"
        preview = table.head(5).to_string(index=False)
        return f"**{label}**: shape={table.shape}, cols={list(table.columns)}\n```\n{preview}\n```"

    def as_frame(obj):
        # Dataframe outputs need a DataFrame; substitute an empty one.
        if isinstance(obj, pd.DataFrame):
            return obj
        return pd.DataFrame()

    sections = [
        f"**OCR text length**: {len(ocr_text or '')}",
        f"**Params keys**: {sorted((params or {}).keys())}",
        describe("raw_df (Tab 2)", raw_tbl),
        describe("norm_df (Tab 2)", norm_tbl),
        describe("last_table (State)", last_tbl),
    ]
    report = "\n\n".join(sections)

    return report, as_frame(raw_tbl), as_frame(norm_tbl), as_frame(last_tbl), image
# ---------- UI ----------
# Layout: four tabs (docx upload, image/OCR upload, solve & check, debug).
# Two gr.State values carry the most recently detected parameters and table
# from the upload tabs over to the "Solve & Check" tab.
with gr.Blocks(title="Jerry • HW Intake (Echo)") as demo:
    last_params = gr.State({})
    last_table = gr.State(pd.DataFrame())

    gr.Markdown("## Jerry (TA) – Homework Intake\nThis Space **only reads and echoes** your files.\nNext step will add solving & coaching.")

    # --- Tab 1: DOCX ---
    with gr.Tab("Upload .docx"):
        docx_in = gr.File(file_types=[".docx"], label="Homework .docx")
        btn1 = gr.Button("Read")
        header_txt = gr.Textbox(label="Header/Text (for params)", lines=8)
        params_json = gr.JSON(label="Detected parameters")
        table_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)

    # --- Tab 2: Image ---
    with gr.Tab("Upload Image (.png/.jpg)"):
        img_in = gr.Image(type="pil", label="Photo or screenshot of your table")
        btn2 = gr.Button("OCR")
        ocr_txt = gr.Textbox(label="Raw OCR text", lines=12)
        params_json2 = gr.JSON(label="Detected parameters")
        raw_df = gr.Dataframe(label="Raw table guess", interactive=False)
        norm_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)

    # --- Tab 3: Solve & Check ---
    with gr.Tab("Straight-Line • Solve & Check"):
        gr.Markdown("Enter params (auto-filled if detected) → build the correct SL schedule → compare to your uploaded table.")
        with gr.Row():
            in_cost = gr.Number(label="Cost", value=0.0)
            in_salv = gr.Number(label="Salvage", value=0.0)
            in_life = gr.Number(label="Life (years)", value=10, precision=0)
            in_year = gr.Number(label="Start year", value=2025, precision=0)
        btn_use = gr.Button("Use detected params")
        btn_build = gr.Button("Build expected schedule")
        expected_df = gr.Dataframe(label="Expected (SL) schedule", interactive=False)
        btn_check = gr.Button("Check against uploaded table")
        deltas_df = gr.Dataframe(label="Differences (student − expected)", interactive=False)
        coach_txt = gr.Markdown()

    with gr.Tab("Debug"):
        dbg_btn = gr.Button("Dump OCR state")
        dbg_md = gr.Markdown()
        dbg_raw = gr.Dataframe(label="raw_df echo", interactive=False)
        dbg_norm = gr.Dataframe(label="norm_df echo", interactive=False)
        dbg_last = gr.Dataframe(label="last_table (State) echo", interactive=False)
        dbg_img = gr.Image(label="Image echo")

    # ---------- Wire events AFTER all components exist ----------
    btn1.click(
        handle_docx,
        inputs=docx_in,
        outputs=[
            header_txt,                          # text
            params_json,                         # json
            table_df,                            # normalized table (tab 1)
            in_cost, in_salv, in_life, in_year,  # autofill inputs
            last_params,                         # state
            last_table,                          # state
        ],
    )
    btn2.click(
        handle_image,
        inputs=img_in,
        outputs=[
            ocr_txt,                             # raw OCR text
            params_json2,                        # json
            raw_df,                              # raw table
            norm_df,                             # normalized table (tab 2)
            in_cost, in_salv, in_life, in_year,  # autofill inputs
            last_params,                         # state
            last_table,                          # state
        ],
    )
    # Re-apply the last detected parameters to the number inputs. Without
    # this listener the "Use detected params" button does nothing.
    btn_use.click(
        fill_from_state,
        inputs=[last_params],
        outputs=[in_cost, in_salv, in_life, in_year],
    )
    btn_build.click(build_cb, [in_cost, in_salv, in_life, in_year], [expected_df])
    btn_check.click(check_cb, [in_cost, in_salv, in_life, in_year, last_table], [deltas_df, coach_txt])
    dbg_btn.click(
        debug_dump,
        # inputs come from the components/state already populated by handle_image
        inputs=[ocr_txt, params_json2, raw_df, norm_df, last_table, img_in],
        outputs=[dbg_md, dbg_raw, dbg_norm, dbg_last, dbg_img],
    )

    gr.Markdown("— Echo mode finished. When this looks good, we’ll plug in the SL solver + coaching.")

if __name__ == "__main__":
    demo.launch()