File size: 16,518 Bytes
37859cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eff467
37859cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8e9b17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37859cc
01260a9
 
 
 
 
 
 
 
 
37859cc
 
01260a9
37859cc
 
 
01260a9
 
 
 
 
0eff467
01260a9
 
 
37859cc
0519f68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37859cc
 
409c1a5
0519f68
d8fdeae
a8e9b17
0519f68
409c1a5
 
 
 
0519f68
01260a9
0519f68
aac865c
409c1a5
8dcc7c1
409c1a5
 
 
 
68c5adb
aac865c
8b0f06f
d8fdeae
0eff467
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeb7444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a3d32b
0eff467
b9c537c
6321be3
eeb7444
 
 
 
 
 
236f5b5
6321be3
eeb7444
 
 
9a3d32b
 
 
65de223
0eff467
37859cc
3257200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37859cc
 
a8e9b17
 
37859cc
0eff467
 
37859cc
 
 
 
 
 
 
0eff467
37859cc
 
 
 
 
 
d8fdeae
37859cc
0eff467
c4651e6
2a4b9b8
 
 
 
 
 
6965edc
0eff467
2a4b9b8
 
 
 
 
 
3257200
 
 
 
 
 
 
 
 
 
 
 
 
 
0eff467
 
 
 
 
 
 
 
 
 
 
 
 
c8cae3a
0eff467
 
 
 
 
 
 
9a3d32b
0eff467
 
 
 
 
c8cae3a
9a3d32b
 
 
 
3257200
 
 
 
 
 
 
 
37859cc
1e023d5
37859cc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, io, os
import gradio as gr
import pandas as pd

# Optional dependency: python-docx for reading .docx homework files.
# DOCX_OK gates the docx code path so the app still starts without it.
try:
    from docx import Document
    DOCX_OK = True
except Exception:
    DOCX_OK = False

# Optional dependency: Pillow + pytesseract for OCR of table screenshots.
# OCR_OK gates the image code path the same way.
try:
    from PIL import Image, ImageOps
    import pytesseract
    OCR_OK = True
except Exception:
    OCR_OK = False
# ---------- helpers ----------
def _norm_name(s: str) -> str:
    import re
    return re.sub(r"[^a-z0-9]", "", str(s).lower())

def _pick_col(cols, *cands):
    m = { _norm_name(c): c for c in cols }
    for cand in cands:
        for k, orig in m.items():
            if cand in k:
                return orig
    return None

def _coerce_numeric(x):
    if pd.isna(x): return x
    if isinstance(x, (int,float)): return float(x)
    s = str(x).replace(",","").replace("$","").strip()
    try: return float(s)
    except: return pd.NA

PARAM_PATTERNS = {
    "cost":       r"cost\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    "salvage":    r"salvage\s*[:=]\s*\$?\s*([\d,]+(?:\.\d+)?)",
    "life":       r"(?:life|useful\s*life)\s*[:=]\s*([\d,]+)",
    "start_year": r"(?:start\s*year|start)\s*[:=]\s*([12]\d{3})",
}

def _extract_params(text: str):
    vals = {}
    low = (text or "").lower()
    for k, pat in PARAM_PATTERNS.items():
        m = re.search(pat, low, flags=re.I)
        if m:
            raw = m.group(1).replace(",", "")
            vals[k] = float(raw) if k in ("cost","salvage") else int(float(raw))
    return vals

def _docx_to_table_and_text(fileobj) -> tuple[pd.DataFrame|None, str]:
    """Open a .docx and return (schedule-looking table or None, paragraph text).

    A table qualifies when its first row has at least four cells and one of
    them contains "year" (after normalization). All-empty rows are dropped.
    Errors are reported in the text slot rather than raised.
    """
    if not DOCX_OK:
        return None, "(python-docx not available)"
    try:
        doc = Document(fileobj)
    except Exception as e:
        return None, f"[docx open failed] {e}"

    # Paragraph text is returned either way so params can be scraped from it.
    all_text = "\n".join(p.text for p in doc.paragraphs)

    for table in doc.tables:
        grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        if not grid:
            continue
        header = grid[0]
        if len(header) < 4 or not any("year" in _norm_name(h) for h in header):
            continue
        df = pd.DataFrame(grid[1:], columns=header)
        joined = df.astype(str).apply(lambda r: "".join(r), axis=1).str.strip()
        df = df[joined != ""]
        if not df.empty:
            return df, all_text
    return None, all_text

def _image_to_text(img: Image.Image) -> str:
    """OCR a PIL image to text; returns an error string rather than raising.

    The image is rotated per its EXIF orientation and grayscaled before
    being handed to tesseract.
    """
    if not OCR_OK:
        return "(pytesseract not available)"
    try:
        upright = ImageOps.exif_transpose(img)
        return pytesseract.image_to_string(ImageOps.grayscale(upright))
    except Exception as e:
        return f"[ocr failed] {e}"

def _table_from_ocr_text(text: str) -> pd.DataFrame|None:
    if not text or not text.strip():
        return None
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    hdr_i = -1
    for i, ln in enumerate(lines):
        low = ln.lower()
        if ("year" in low and "begin" in low and "dep" in low and "end" in low):
            hdr_i = i
            break
    if hdr_i == -1:
        for i, ln in enumerate(lines):
            parts = re.split(r"\s{2,}|\t+", ln)
            low = ln.lower()
            if len([p for p in parts if p.strip()]) >= 4 and any(k in low for k in ["year","begin","dep","end"]):
                hdr_i = i
                break
    if hdr_i == -1:
        return None

    header = [h.strip() for h in re.split(r"\s{2,}|\t+", lines[hdr_i]) if h.strip()]
    data = []
    for ln in lines[hdr_i+1:]:
        parts = [p.strip() for p in re.split(r"\s{2,}|\t+", ln) if p.strip()]
        if len(parts) == len(header):
            data.append(parts)
        else:
            if len(data) >= 1:
                break
    if not data:
        return None
    return pd.DataFrame(data, columns=header)

def _normalize_depr_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    """Map arbitrary headers to the canonical schedule columns.

    Output columns: Year / Begin BV / Depreciation / Accum Dep / End BV.
    Year is coerced to numeric; value cells go through _coerce_numeric.
    Rows that are NA in every value column are dropped.
    """
    df = df_in.copy()
    # BUG FIX: anchor the output on df's index. Previously `out` started as
    # an empty DataFrame; assigning the scalar pd.NA first (when a column was
    # missing) pinned the index to zero rows, so every later Series
    # assignment was silently truncated to empty.
    out = pd.DataFrame(index=df.index)
    c_year = _pick_col(df.columns, "year")
    c_beg  = _pick_col(df.columns, "beginbv","beginningbv","beginbook","begin","beginningvalue")
    c_dep  = _pick_col(df.columns, "depreciation","dep")
    c_acc  = _pick_col(df.columns, "accumdep","accumulateddep","accum","accdep")
    c_end  = _pick_col(df.columns, "endbv","endingbv","endbook","end","endingvalue")

    out["Year"] = df[c_year] if c_year else pd.NA
    out["Begin BV"] = df[c_beg] if c_beg else pd.NA
    out["Depreciation"] = df[c_dep] if c_dep else pd.NA
    out["Accum Dep"] = df[c_acc] if c_acc else pd.NA
    out["End BV"] = df[c_end] if c_end else pd.NA

    out["Year"] = pd.to_numeric(out["Year"], errors="coerce")
    for col in ["Begin BV","Depreciation","Accum Dep","End BV"]:
        out[col] = out[col].map(_coerce_numeric)
    out = out[~out[["Begin BV","Depreciation","Accum Dep","End BV"]].isna().all(axis=1)].reset_index(drop=True)
    return out

# Monday Aug 11 New helpers
def build_sl_schedule(cost: float, salvage: float, life: int, start_year: int):
    """Build a straight-line depreciation schedule as a DataFrame.

    Annual depreciation is (cost - salvage) / life; book value declines
    from cost toward salvage over `life` years beginning at `start_year`.
    Columns: Year / Begin BV / Depreciation / Accum Dep / End BV.
    """
    annual = (cost - salvage) / life
    records = []
    book = cost
    running = 0.0  # accumulate year by year to mirror hand calculation
    for offset in range(life):
        running += annual
        records.append(
            {
                "Year": start_year + offset,
                "Begin BV": book,
                "Depreciation": annual,
                "Accum Dep": running,
                "End BV": book - annual,
            }
        )
        book -= annual
    return pd.DataFrame(records)

def audit_against_expected(expected: pd.DataFrame, actual: pd.DataFrame):
    """Compare a student's schedule against the expected one.

    Returns (deltas, message): per-year differences (actual - expected) for
    each value column, and a coaching message naming the first year with any
    difference beyond 1e-6. Rows are matched on Year (inner join).
    """
    if actual is None or actual.empty:
        return pd.DataFrame(), "No student table found to check."
    value_cols = ["Begin BV", "Depreciation", "Accum Dep", "End BV"]
    merged = expected.merge(
        actual[["Year"] + value_cols],
        on="Year", how="inner", suffixes=("_exp", "_act"),
    )
    if merged.empty:
        return pd.DataFrame(), "No matching years between expected and uploaded table."
    deltas = pd.DataFrame({"Year": merged["Year"]})
    for col in value_cols:
        deltas[col + " Δ"] = merged[f"{col}_act"] - merged[f"{col}_exp"]
    delta_cols = [c for c in deltas.columns if c.endswith("Δ")]
    first_bad = None
    for _, row in deltas.iterrows():
        if any(abs(row[c]) > 1e-6 for c in delta_cols):
            first_bad = int(row["Year"])
            break
    if first_bad is None:
        msg = "All good 🎉 Straight‑line matches your table."
    else:
        msg = f"First mismatch at year {first_bad}. Remember: Dep=(Cost−Salvage)/Life and Accum_t=Accum_(t−1)+Dep."
    return deltas, msg

# ---------- Gradio callbacks ----------
def _params_tuple(p):
    p = p or {}
    return (
        float(p.get("cost", 0.0)),
        float(p.get("salvage", 0.0)),
        int(p.get("life", 10)),
        int(p.get("start_year", pd.Timestamp.now().year)),
    )

def handle_docx(file):
    """Gradio callback for the .docx tab.

    Returns, in wiring order: header text, params dict, normalized table,
    the four autofill numbers (cost, salvage, life, year), then the params
    and table again for the two gr.State holders.
    """
    if file is None:
        return "(no file)", {}, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
    source = file.name if hasattr(file, "name") else file
    df_raw, header = _docx_to_table_and_text(source)
    params = _extract_params(header or "")
    df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else None
    table = df_norm if df_norm is not None else pd.DataFrame()
    cost, salv, life, year = _params_tuple(params)
    return (
        header or "(no text found)",
        params,
        table,
        cost, salv, life, year,
        params,
        table,
    )

#def handle_image(img):
#    if img is None:
#        return "(no image)", {}, None, pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()
#    from PIL import Image as PILImage
#    pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)
#    ocr_text = _image_to_text(pil)
#    params = _extract_params(ocr_text or "")
#    df_raw = _table_from_ocr_text(ocr_text or "")
#    df_norm = _normalize_depr_columns(df_raw) if df_raw is not None else None
#    cost, salv, life, year = _params_tuple(params)
#    return (
#        ocr_text or "(empty OCR)",
#        params,
#        df_raw,
#        (df_norm if df_norm is not None else pd.DataFrame()),
#        cost, salv, life, year,
#       params,
#        (df_norm if df_norm is not None else pd.DataFrame()),
#   )

def handle_image(img):
    """Gradio callback for the image tab: OCR → params → normalized table.

    Returns, in wiring order: OCR text, params dict, raw table, normalized
    table, the four autofill numbers, then the params and normalized table
    for the two gr.State holders.
    """
    if img is None:
        return "(no image)", {}, pd.DataFrame(), pd.DataFrame(), 0.0, 0.0, 10, pd.Timestamp.now().year, {}, pd.DataFrame()

    from PIL import Image as PILImage
    pil = img if isinstance(img, PILImage.Image) else PILImage.fromarray(img)

    ocr_text = _image_to_text(pil)
    params = _extract_params(ocr_text or "")
    df_raw = _table_from_ocr_text(ocr_text or "")
    if df_raw is None:
        # Keep Dataframe outputs and state as real frames, never None.
        df_raw = pd.DataFrame()
    df_norm = _normalize_depr_columns(df_raw) if not df_raw.empty else pd.DataFrame()

    cost, salv, life, year = _params_tuple(params)

    return (
        ocr_text or "(empty OCR)",
        params,
        df_raw,                  # raw table shown in OCR tab
        df_norm,                 # normalized table shown in OCR tab
        cost, salv, life, year,  # auto-fill numbers
        params,                  # save params state
        df_norm,                 # BUG FIX: save the *normalized* table (same as docx path);
                                 # previously df_raw (possibly None) went into last_table
    )


def fill_from_state(p):
    """Gradio callback: push detected params into the four Number inputs.

    Same contract as _params_tuple — delegate to it instead of duplicating
    the default-handling logic line for line.
    """
    return _params_tuple(p)

def build_cb(cost, salv, life, year):
    """Gradio callback: build the expected SL schedule.

    On any failure (e.g. life == 0) returns a one-row frame carrying the
    error message instead of raising into the UI.
    """
    try:
        return build_sl_schedule(float(cost), float(salv), int(life), int(year))
    except Exception as e:
        return pd.DataFrame([{"error": str(e)}])

#def check_cb(cost, salv, life, year, table_state):
#    # expected (numeric)
#    exp = build_sl_schedule(float(cost), float(salv), int(life), int(year))
#    exp["Year"] = pd.to_numeric(exp["Year"], errors="coerce")

    # nothing to check?
#    if not isinstance(table_state, pd.DataFrame) or table_state.empty:
#        return pd.DataFrame(), "No student table found to check."

    # 👇 Gradio returns strings → re-normalize and coerce here every time
#    actual = _normalize_depr_columns(table_state)
#    for c in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
#        actual[c] = pd.to_numeric(actual[c], errors="coerce")
#    actual = actual.dropna(subset=["Year"]).reset_index(drop=True)

#    deltas, msg = audit_against_expected(exp, actual)
#    return deltas, msg

def check_cb(cost, salv, life, year, table_state):
    """Gradio callback: rebuild the expected schedule and diff the stored table.

    Gradio may hand the state back as a DataFrame or a list of rows, and
    cell values may arrive as strings, so the table is re-normalized and
    coerced to numeric on every call before auditing.
    """
    exp = build_sl_schedule(float(cost), float(salv), int(life), int(year))
    exp["Year"] = pd.to_numeric(exp["Year"], errors="coerce")

    if isinstance(table_state, pd.DataFrame):
        actual = table_state
    elif isinstance(table_state, list):
        # Best-effort columns; _normalize_depr_columns sorts them out below.
        actual = pd.DataFrame(table_state)
    else:
        return pd.DataFrame(), "No student table found to check."

    actual = _normalize_depr_columns(actual)
    for col in ["Year", "Begin BV", "Depreciation", "Accum Dep", "End BV"]:
        actual[col] = pd.to_numeric(actual[col], errors="coerce")
    actual = actual.dropna(subset=["Year"]).reset_index(drop=True)

    return audit_against_expected(exp, actual)

# --- Debug utilities ---
def debug_dump(ocr_text, params, raw_tbl, norm_tbl, last_tbl, image):
    """Build a markdown report of the OCR pipeline state for the Debug tab.

    Returns (report, raw echo, norm echo, last_table echo, image echo);
    non-DataFrame table inputs are echoed back as empty DataFrames so the
    Dataframe components never receive None.
    """
    # The old local `import pandas as pd, io` shadowed the module-level
    # pandas and imported io without using it — both removed.

    def df_summary(name, df):
        # One bullet per table: shape + columns + a 5-row preview.
        if isinstance(df, pd.DataFrame) and not df.empty:
            head = df.head(5).to_string(index=False)
            return f"**{name}**: shape={df.shape}, cols={list(df.columns)}\n```\n{head}\n```"
        return f"**{name}**: {type(df).__name__} (empty or not a DataFrame)"

    lines = []
    lines.append(f"**OCR text length**: {len(ocr_text or '')}")
    lines.append(f"**Params keys**: {sorted(list((params or {}).keys()))}")
    lines.append(df_summary("raw_df (Tab 2)", raw_tbl))
    lines.append(df_summary("norm_df (Tab 2)", norm_tbl))
    lines.append(df_summary("last_table (State)", last_tbl))
    report = "\n\n".join(lines)

    def ensure_df(x):
        return x if isinstance(x, pd.DataFrame) else pd.DataFrame()

    return (
        report,
        ensure_df(raw_tbl),
        ensure_df(norm_tbl),
        ensure_df(last_tbl),
        image  # echo the image unchanged
    )



# ---------- UI ----------
# Layout: four tabs (docx intake, image OCR intake, solve & check, debug)
# sharing two gr.State holders. Events are wired after all components exist
# so callbacks can write into components that live on other tabs.
with gr.Blocks(title="Jerry • HW Intake (Echo)") as demo:
    # Cross-tab state: last detected parameter dict and last uploaded table.
    last_params = gr.State({})
    last_table  = gr.State(pd.DataFrame())
    gr.Markdown("## Jerry (TA) – Homework Intake\nThis Space **only reads and echoes** your files.\nNext step will add solving & coaching.")

    # --- Tab 1: DOCX ---
    with gr.Tab("Upload .docx"):
        docx_in = gr.File(file_types=[".docx"], label="Homework .docx")
        btn1 = gr.Button("Read")
        header_txt = gr.Textbox(label="Header/Text (for params)", lines=8)
        params_json = gr.JSON(label="Detected parameters")
        table_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)

    # --- Tab 2: Image ---
    with gr.Tab("Upload Image (.png/.jpg)"):
        img_in = gr.Image(type="pil", label="Photo or screenshot of your table")
        btn2 = gr.Button("OCR")
        ocr_txt = gr.Textbox(label="Raw OCR text", lines=12)
        params_json2 = gr.JSON(label="Detected parameters")
        raw_df = gr.Dataframe(label="Raw table guess", interactive=False)
        norm_df = gr.Dataframe(label="Detected table (normalized)", interactive=False)

    # --- Tab 3: Solve & Check ---
    with gr.Tab("Straight-Line • Solve & Check"):
        gr.Markdown("Enter params (auto-filled if detected) → build the correct SL schedule → compare to your uploaded table.")
        with gr.Row():
            in_cost = gr.Number(label="Cost", value=0.0)
            in_salv = gr.Number(label="Salvage", value=0.0)
            in_life = gr.Number(label="Life (years)", value=10, precision=0)
            in_year = gr.Number(label="Start year", value=2025, precision=0)

        btn_use = gr.Button("Use detected params")
        btn_build = gr.Button("Build expected schedule")
        expected_df = gr.Dataframe(label="Expected (SL) schedule", interactive=False)
        btn_check = gr.Button("Check against uploaded table")
        deltas_df  = gr.Dataframe(label="Differences (student − expected)", interactive=False)
        coach_txt  = gr.Markdown()

    # --- Tab 4: Debug ---
    with gr.Tab("Debug"):
        dbg_btn  = gr.Button("Dump OCR state")
        dbg_md   = gr.Markdown()
        dbg_raw  = gr.Dataframe(label="raw_df echo", interactive=False)
        dbg_norm = gr.Dataframe(label="norm_df echo", interactive=False)
        dbg_last = gr.Dataframe(label="last_table (State) echo", interactive=False)
        dbg_img  = gr.Image(label="Image echo")

    # ---------- Wire events AFTER all components exist ----------
    # NOTE(review): btn_use is created but never wired — presumably it should
    # call fill_from_state(last_params) into the four Number inputs; confirm.
    btn1.click(
        handle_docx,
        inputs=docx_in,
        outputs=[
            header_txt,        # text
            params_json,       # json
            table_df,          # normalized table (tab 1)
            in_cost, in_salv, in_life, in_year,   # autofill inputs
            last_params,       # state
            last_table,        # state
        ],
    )

    btn2.click(
        handle_image,
        inputs=img_in,
        outputs=[
            ocr_txt,           # raw OCR text
            params_json2,      # json
            raw_df,            # raw table
            norm_df,           # normalized table (tab 2)
            in_cost, in_salv, in_life, in_year,   # autofill inputs
            last_params,       # state
            last_table,        # state
        ],
    )

    btn_build.click(build_cb, [in_cost, in_salv, in_life, in_year], [expected_df])
    btn_check.click(check_cb, [in_cost, in_salv, in_life, in_year, last_table], [deltas_df, coach_txt])

    dbg_btn.click(
        debug_dump,
        # inputs come from the components/state already populated by handle_image
        inputs=[ocr_txt, params_json2, raw_df, norm_df, last_table, img_in],
        outputs=[dbg_md, dbg_raw, dbg_norm, dbg_last, dbg_img],
    )

    gr.Markdown("— Echo mode finished. When this looks good, we’ll plug in the SL solver + coaching.")

if __name__ == "__main__":
    demo.launch()