"""ValuationAI — Nairobi Valuation Sheet OCR.

Streamlit app that runs a TrOCR handwriting model over uploaded valuation
sheets, parses the recognised label text into fields and offers the result
as an Excel download.

Model: rasmodev/Handwriting_trocr_model
"""
import io
import logging
import os
import tempfile
import time

import pandas as pd
import streamlit as st
from PIL import Image

st.set_page_config(
    page_title="ValuationAI",
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed",
)

logging.basicConfig(level=logging.INFO)

# NOTE(review): the injected payload is blank in this copy of the file —
# presumably the app's custom CSS belongs here; confirm against the original.
st.markdown(""" """, unsafe_allow_html=True)


# ═══════════════════════════════════════════════════════════
# MODEL
# ═══════════════════════════════════════════════════════════
@st.cache_resource(show_spinner="Loading recognition model…")
def load_model():
    """Load the OCR processor and fine-tuned weights and pick a device.

    The result is cached with ``st.cache_resource`` so Streamlit reruns reuse
    the already-loaded objects instead of loading them again.

    Returns:
        A ``(processor, model, device)`` tuple: the ``TrOCRProcessor``, the
        ``VisionEncoderDecoderModel`` moved to ``device`` and switched to
        eval mode, and the ``torch.device`` (CUDA when available, else CPU).
    """
    # Heavy ML imports are deferred so the page can start rendering before
    # torch/transformers are imported.
    import torch
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    MODEL_ID = "rasmodev/Handwriting_trocr_model"
    BASE_ID = "microsoft/trocr-base-handwritten"

    # Load processor from base model — has all required config files.
    # Load weights from fine-tuned model — contains trained parameters.
    processor = TrOCRProcessor.from_pretrained(BASE_ID)
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    return processor, model, device


# ═══════════════════════════════════════════════════════════
# OCR
# ═══════════════════════════════════════════════════════════
def ocr_page(img: Image.Image) -> str:
    """Run the recognition model on one page image and return its text.

    Args:
        img: Page image; it is converted to RGB before preprocessing.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    import torch

    processor, model, device = load_model()
    inputs = processor(images=img.convert("RGB"), return_tensors="pt")
    pixel_values = inputs.pixel_values.to(device)

    # Inference only: no gradients needed. Greedy decoding (num_beams=1),
    # capped at 64 newly generated tokens.
    with torch.no_grad():
        generated = model.generate(
            pixel_values=pixel_values,
            max_new_tokens=64,
            num_beams=1,
        )
    return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()


# ═══════════════════════════════════════════════════════════
# PARSE LABEL
# Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
# ═══════════════════════════════════════════════════════════
def parse_label(raw_text: str, filename: str) -> dict:
    """Split one OCR label string into a flat record.

    The text is expected to look like
    ``PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...``.
    Segments are separated by ``|``; each segment is split at its first
    ``:`` into a key (case-insensitive) and a value. Segments without a
    ``:`` and segments with unknown keys are ignored. If a key occurs more
    than once, the last occurrence wins.

    Args:
        raw_text: Text produced by the OCR model for one page.
        filename: Name of the source file, stored under ``"File"``.

    Returns:
        A dict with the keys ``File``, ``Plot Number``, ``Location``,
        ``Area``, ``Amount (KES)``, ``Date``, ``VOS`` and ``Raw Output``.
        ``Amount (KES)`` is an ``int`` when the value parses as one (commas
        and spaces removed), the original string when it does not, and
        ``None`` when it is missing or empty.
    """
    record = {
        "File": filename,
        "Plot Number": "",
        "Location": "",
        "Area": "",
        "Amount (KES)": None,
        "Date": "",
        "VOS": "",
        "Raw Output": raw_text,
    }
    # Label key -> record column, for fields stored verbatim as text.
    text_fields = {
        "PLOT": "Plot Number",
        "LOC": "Location",
        "AREA": "Area",
        "DATE": "Date",
        "VOS": "VOS",
    }

    for part in raw_text.split("|"):
        key, sep, val = part.partition(":")
        if not sep:
            continue
        key = key.strip().upper()
        val = val.strip()

        if key in text_fields:
            record[text_fields[key]] = val
        elif key == "AMT":
            digits = val.replace(",", "").replace(" ", "")
            if not digits:
                # An empty amount stays "missing" rather than becoming "".
                record["Amount (KES)"] = None
                continue
            try:
                record["Amount (KES)"] = int(digits)
            except ValueError:
                # Keep unparseable text so a reviewer can still see it.
                record["Amount (KES)"] = val
    return record


# ═══════════════════════════════════════════════════════════
# EXCEL EXPORT
# ═══════════════════════════════════════════════════════════
def make_excel(records: list) -> bytes:
    """Render the records as a styled single-sheet .xlsx workbook.

    The ``"Raw Output"`` key is dropped from every record. The data is
    written with pandas to a sheet named ``"Valuation Data"``, then reopened
    with openpyxl to style the header row, set column widths, shade
    even-numbered data rows and freeze the header.

    Args:
        records: List of record dicts as produced by ``parse_label``.

    Returns:
        The workbook serialised as bytes.
    """
    from openpyxl import load_workbook
    from openpyxl.styles import Alignment, Font, PatternFill
    from openpyxl.utils import get_column_letter

    clean = [{k: v for k, v in r.items() if k != "Raw Output"} for r in records]

    buf = io.BytesIO()
    pd.DataFrame(clean).to_excel(buf, index=False, sheet_name="Valuation Data")
    buf.seek(0)
    wb = load_workbook(buf)
    ws = wb.active

    # Style objects are built once and shared instead of once per cell.
    header_fill = PatternFill("solid", start_color="1A1A2E")
    header_font = Font(name="Calibri", bold=True, color="FFFFFF", size=11)
    header_align = Alignment(horizontal="center", vertical="center")
    body_align = Alignment(vertical="center", wrap_text=True)
    stripe_fill = PatternFill("solid", start_color="F0F4FF")

    for ci, cell in enumerate(ws[1], 1):
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        ws.column_dimensions[get_column_letter(ci)].width = 26
    ws.row_dimensions[1].height = 30

    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = body_align
            if cell.row % 2 == 0:
                cell.fill = stripe_fill

    ws.freeze_panes = "A2"  # keep the header row visible while scrolling

    out = io.BytesIO()
    wb.save(out)
    return out.getvalue()


# ═══════════════════════════════════════════════════════════
# SESSION
# STATE
# ═══════════════════════════════════════════════════════════
# NOTE(review): this section was reconstructed from a copy of the file whose
# line breaks and indentation were lost and whose HTML markup appears to have
# been stripped from the string literals. Block nesting below is a best
# reading of the statement order — confirm against the original. The visible
# text of each literal is kept as found (line breaks written as "\n").
import html  # escapes user-controlled text rendered with unsafe_allow_html

# Seed the per-session keys once; later reruns keep whatever is stored.
for k, v in [("records", []), ("excel", None), ("done", False), ("errors", [])]:
    if k not in st.session_state:
        st.session_state[k] = v

# ═══════════════════════════════════════════════════════════
# UI
# ═══════════════════════════════════════════════════════════
st.markdown(
    "\n"
    "Nairobi City County — Document Intelligence\n"
    "Recognition model\n"
    "rasmodev/Handwriting_trocr_model\n",
    unsafe_allow_html=True,
)

st.markdown(
    "\n"
    "Digitise handwritten\n"
    "valuation sheets instantly.\n"
    "Upload one or more scanned PDF valuation sheets. The system reads every "
    "handwritten field and delivers a structured Excel file — ready for "
    "records management.\n",
    unsafe_allow_html=True,
)

st.markdown("\nStep 1 — Upload Documents\n", unsafe_allow_html=True)

uploaded = st.file_uploader(
    "Drag and drop valuation sheet PDFs here, or click to browse",
    type=["pdf", "png", "jpg", "jpeg", "tiff", "bmp"],
    accept_multiple_files=True,
    label_visibility="collapsed",
)

if uploaded:
    # File names come from the user and are rendered as raw HTML, so escape
    # them. Names longer than 35 characters are truncated with an ellipsis.
    chips = "".join(
        f'📄 {html.escape(up.name[:35])}{"…" if len(up.name) > 35 else ""}'
        for up in uploaded
    )
    st.markdown(f"\n{chips}\n", unsafe_allow_html=True)

st.markdown("\n", unsafe_allow_html=True)
st.markdown("\nStep 2 — Extract & Download\n", unsafe_allow_html=True)

run = st.button(
    "Extract Data from Documents",
    disabled=not uploaded,
    use_container_width=True,
)

# ═══════════════════════════════════════════════════════════
# PROCESSING
# ═══════════════════════════════════════════════════════════
if run and uploaded:
    import traceback

    import fitz

    # Start every run from a clean slate, including any previous workbook.
    st.session_state.records = []
    st.session_state.errors = []
    st.session_state.excel = None
    st.session_state.done = False

    bar = st.progress(0.0)
    status = st.empty()
    t0 = time.time()

    for fi, uf in enumerate(uploaded):
        fname = uf.name
        raw = uf.read()
        bar.progress(fi / len(uploaded), text=f"Reading {fname}…")
        st.write(f"📄 **{fname}** — {len(raw):,} bytes")

        try:
            ext = fname.lower().rsplit(".", 1)[-1]

            if ext == "pdf":
                # Write to temp file — same as training
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(raw)
                    tmp_path = tmp.name
                try:
                    # The context manager closes the document even if
                    # rasterisation fails part-way through.
                    with fitz.open(tmp_path) as doc:
                        st.write(f" ✅ PDF opened — {len(doc)} page(s) found")
                        imgs = []
                        mat = fitz.Matrix(200 / 72, 200 / 72)  # scale 72 dpi to 200 dpi
                        for page in doc:
                            pix = page.get_pixmap(matrix=mat, alpha=False)
                            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
                            imgs.append(img)
                            pix = None  # drop the pixmap before the next page
                finally:
                    # Always remove the temp file, also on error.
                    os.unlink(tmp_path)
                st.write(f" ✅ Rasterized {len(imgs)} page image(s)")
            else:
                imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
                st.write(" ✅ Loaded image")

            if not imgs:
                st.error(f" ❌ No pages extracted from {fname}")
                st.session_state.errors.append(f"{fname}: no pages extracted")
                continue

            for pi, img in enumerate(imgs, 1):
                status.caption(f"Running OCR on **{fname}** — page {pi} of {len(imgs)}")
                raw_text = ocr_page(img)
                # NOTE(review): the emoji here was damaged in the source;
                # 📝 is a best guess — confirm.
                st.write(f" 📝 Page {pi} OCR output: `{raw_text}`")
                record = parse_label(raw_text, fname)
                st.session_state.records.append(record)

        except Exception as e:
            # Per-file boundary: report the failure and carry on with the
            # remaining uploads instead of aborting the whole run.
            logging.exception("Failed to process %s", fname)
            st.error(f"❌ Error on {fname}: {e}")
            st.code(traceback.format_exc())
            st.session_state.errors.append(f"{fname}: {e}")
        finally:
            # Runs on success, on error and on the `continue` above, so the
            # bar always advances.
            bar.progress((fi + 1) / len(uploaded))

    bar.empty()
    status.empty()

    if st.session_state.records:
        st.session_state.excel = make_excel(st.session_state.records)
        st.session_state.done = True
        elapsed = time.time() - t0
        st.success(
            f"Processed {len(st.session_state.records)} page(s) "
            f"from {len(uploaded)} document(s) in {elapsed:.1f}s."
        )

# ═══════════════════════════════════════════════════════════
# RESULTS
# ═══════════════════════════════════════════════════════════
if st.session_state.done and st.session_state.records:
    records = st.session_state.records
    df = pd.DataFrame(records)
    display_cols = [c for c in df.columns if c != "Raw Output"]
    df_display = df[display_cols]

    # Summary counters: non-empty plot numbers and dates, numeric amounts.
    n_plots = df["Plot Number"].astype(bool).sum()
    n_amounts = pd.to_numeric(df["Amount (KES)"], errors="coerce").notna().sum()
    n_dates = df["Date"].astype(bool).sum()

    st.markdown(
        f"\n{len(records)}\n"
        "Pages processed\n"
        f"{n_plots}\n"
        "Plot numbers\n"
        f"{n_amounts}\n"
        "Amounts extracted\n"
        f"{n_dates}\n"
        "Dates captured\n",
        unsafe_allow_html=True,
    )

    col_t, col_d = st.columns([5, 1])
    with col_t:
        st.markdown("\nExtracted Records\n", unsafe_allow_html=True)
    with col_d:
        st.markdown("\n", unsafe_allow_html=True)
        if st.session_state.excel:
            st.download_button(
                "⬇ Export Excel",
                data=st.session_state.excel,
                file_name="valuation_records.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

    st.dataframe(
        df_display,
        use_container_width=True,
        height=min(80 + len(df) * 38, 560),
        hide_index=True,
    )

    # NOTE(review): the emoji in this label was damaged in the source;
    # 🔍 is a best guess — confirm.
    with st.expander("🔍 Raw model output (for verification)"):
        for r in records:
            # Both values are rendered as raw HTML: the file name is
            # user-supplied and the OCR text is model output, so escape them.
            safe_name = html.escape(str(r["File"]))
            safe_raw = html.escape(str(r.get("Raw Output", "")))
            st.markdown(
                "\n"
                f"{safe_name}\n{safe_raw}\n",
                unsafe_allow_html=True,
            )

if st.session_state.errors:
    with st.expander(f"⚠ {len(st.session_state.errors)} file(s) could not be processed"):
        for e in st.session_state.errors:
            st.caption(e)