rasmodev committed on
Commit
2d0dda1
·
verified ·
1 Parent(s): a830411

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -104
app.py CHANGED
@@ -2,10 +2,20 @@
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
4
 
 
 
 
 
 
 
 
 
 
 
5
  Label format from training:
6
- PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
7
  """
8
- import io, time, logging
9
  import streamlit as st
10
  import pandas as pd
11
  from PIL import Image
@@ -33,7 +43,6 @@ html, body, [class*="css"], .stApp {
33
  }
34
  #MainMenu, footer, header { visibility: hidden; }
35
 
36
- /* Top bar */
37
  .topbar {
38
  display: flex; align-items: flex-end;
39
  justify-content: space-between;
@@ -46,15 +55,12 @@ html, body, [class*="css"], .stApp {
46
  .model-ref { font-size: 0.7rem; color: #9CA3AF; font-weight: 400; letter-spacing: 0.04em; text-align: right; }
47
  .model-ref strong { color: #2563EB; font-weight: 600; }
48
 
49
- /* Headline */
50
  .headline { font-family: 'Cormorant Garamond', serif; font-size: 3.4rem; font-weight: 700; line-height: 1.08; letter-spacing: -0.03em; color: #1A1A2E; margin-bottom: 1rem; max-width: 700px; }
51
  .headline em { font-style: italic; color: #2563EB; }
52
  .subline { font-size: 0.95rem; font-weight: 300; color: #6B7280; line-height: 1.7; max-width: 500px; margin-bottom: 3rem; }
53
 
54
- /* Step label */
55
  .step { font-size: 0.65rem; font-weight: 700; letter-spacing: 0.18em; text-transform: uppercase; color: #2563EB; margin-bottom: 0.5rem; }
56
 
57
- /* File uploader */
58
  [data-testid="stFileUploader"] section {
59
  background: #fff !important;
60
  border: 2px dashed #D1D5DB !important;
@@ -69,10 +75,8 @@ html, body, [class*="css"], .stApp {
69
  }
70
  [data-testid="stFileUploader"] label { color: #6B7280 !important; font-size: 0.9rem !important; }
71
 
72
- /* File chip */
73
  .fchip { display: inline-flex; align-items: center; gap: 5px; background: #EFF6FF; border: 1px solid #BFDBFE; color: #1D4ED8; padding: 0.25rem 0.7rem; border-radius: 6px; font-size: 0.73rem; font-weight: 500; margin: 2px; }
74
 
75
- /* Button */
76
  .stButton > button {
77
  background: #1A1A2E !important; color: #fff !important; border: none !important;
78
  border-radius: 8px !important; padding: 0.85rem 2.5rem !important;
@@ -84,22 +88,18 @@ html, body, [class*="css"], .stApp {
84
  .stButton > button:hover { background: #2563EB !important; box-shadow: 0 4px 16px rgba(37,99,235,0.3) !important; transform: translateY(-1px) !important; }
85
  .stButton > button:disabled { background: #E5E7EB !important; color: #9CA3AF !important; box-shadow: none !important; transform: none !important; }
86
 
87
- /* Progress */
88
  .stProgress > div > div > div { background: #2563EB !important; border-radius: 4px !important; }
89
  .stProgress > div > div { background: #E5E7EB !important; border-radius: 4px !important; height: 4px !important; }
90
 
91
- /* Stats */
92
  .stats-strip { display: flex; background: #1A1A2E; border-radius: 12px; overflow: hidden; margin: 2.5rem 0 2rem; }
93
  .stat-item { flex: 1; padding: 1.6rem 2rem; border-right: 1px solid rgba(255,255,255,0.08); }
94
  .stat-item:last-child { border-right: none; }
95
  .stat-n { font-family: 'Cormorant Garamond', serif; font-size: 2.6rem; font-weight: 700; color: #fff; line-height: 1; margin-bottom: 0.3rem; }
96
  .stat-l { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.12em; text-transform: uppercase; color: #6B7280; }
97
 
98
- /* Section head */
99
  .section-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid #E5E7EB; }
100
  .section-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 600; color: #1A1A2E; letter-spacing: -0.01em; }
101
 
102
- /* Download button */
103
  div[data-testid="stDownloadButton"] > button {
104
  background: #fff !important; border: 1.5px solid #1A1A2E !important; color: #1A1A2E !important;
105
  border-radius: 8px !important; padding: 0.6rem 1.4rem !important;
@@ -109,14 +109,13 @@ div[data-testid="stDownloadButton"] > button {
109
  }
110
  div[data-testid="stDownloadButton"] > button:hover { background: #1A1A2E !important; color: #fff !important; }
111
 
112
- /* Dataframe */
113
  [data-testid="stDataFrame"] { border-radius: 10px !important; border: 1px solid #E5E7EB !important; overflow: hidden !important; box-shadow: 0 1px 4px rgba(0,0,0,0.05) !important; }
114
  </style>
115
  """, unsafe_allow_html=True)
116
 
117
 
118
  # ═══════════════════════════════════════════════════════════
119
- # MODEL LOADING
120
  # ═══════════════════════════════════════════════════════════
121
  @st.cache_resource(show_spinner="Loading recognition model…")
122
  def load_model():
@@ -131,28 +130,59 @@ def load_model():
131
 
132
 
133
  # ═══════════════════════════════════════════════════════════
134
- # OCR — matches training output format exactly
135
- # Training label format:
136
- # PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  # ═══════════════════════════════════════════════════════════
138
  def ocr_page(img: Image.Image) -> str:
139
- """Run the fine-tuned model on one page image."""
140
  import torch
141
  processor, model, device = load_model()
142
- pv = processor(
 
143
  images=img.convert("RGB"),
144
  return_tensors="pt"
145
  ).pixel_values.to(device)
 
146
  with torch.no_grad():
147
- gen = model.generate(pv, max_new_tokens=128)
148
- return processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
 
 
 
 
149
 
150
 
 
 
 
 
151
  def parse_label(raw_text: str, filename: str) -> dict:
152
- """
153
- Parse the pipe-delimited label that the model was trained to output.
154
- Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
155
- """
156
  record = {
157
  "File": filename,
158
  "Plot Number": "",
@@ -163,17 +193,13 @@ def parse_label(raw_text: str, filename: str) -> dict:
163
  "VOS": "",
164
  "Raw Output": raw_text,
165
  }
166
-
167
- # Split on pipe delimiter
168
- parts = raw_text.split("|")
169
- for part in parts:
170
  part = part.strip()
171
  if ":" not in part:
172
  continue
173
  key, _, val = part.partition(":")
174
  key = key.strip().upper()
175
  val = val.strip()
176
-
177
  if key == "PLOT":
178
  record["Plot Number"] = val
179
  elif key == "LOC":
@@ -181,7 +207,6 @@ def parse_label(raw_text: str, filename: str) -> dict:
181
  elif key == "AREA":
182
  record["Area"] = val
183
  elif key == "AMT":
184
- # Remove commas and convert to int
185
  try:
186
  record["Amount (KES)"] = int(val.replace(",", "").replace(" ", ""))
187
  except ValueError:
@@ -190,54 +215,34 @@ def parse_label(raw_text: str, filename: str) -> dict:
190
  record["Date"] = val
191
  elif key == "VOS":
192
  record["VOS"] = val
193
-
194
  return record
195
 
196
 
197
- def pdf_to_images(file_bytes: bytes) -> list:
198
- """Convert all pages of a PDF to PIL images at 200 DPI."""
199
- import fitz
200
- doc = fitz.open(stream=file_bytes, filetype="pdf")
201
- mat = fitz.Matrix(200/72, 200/72)
202
- imgs = []
203
- for i in range(len(doc)):
204
- pix = doc[i].get_pixmap(matrix=mat, alpha=False)
205
- imgs.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
206
- doc.close()
207
- return imgs
208
-
209
-
210
  def make_excel(records: list) -> bytes:
211
- """Export records to a formatted Excel workbook."""
212
  from openpyxl import load_workbook
213
  from openpyxl.styles import Font, PatternFill, Alignment
214
  from openpyxl.utils import get_column_letter
215
-
216
- # Drop raw output from Excel — it's only for debugging
217
  clean = [{k: v for k, v in r.items() if k != "Raw Output"} for r in records]
218
-
219
  buf = io.BytesIO()
220
  pd.DataFrame(clean).to_excel(buf, index=False, sheet_name="Valuation Data")
221
  buf.seek(0)
222
-
223
  wb = load_workbook(buf)
224
  ws = wb.active
225
  hdr = PatternFill("solid", start_color="1A1A2E")
226
-
227
  for ci, cell in enumerate(ws[1], 1):
228
  cell.font = Font(name="Calibri", bold=True, color="FFFFFF", size=11)
229
  cell.fill = hdr
230
  cell.alignment = Alignment(horizontal="center", vertical="center")
231
  ws.column_dimensions[get_column_letter(ci)].width = 26
232
-
233
  ws.row_dimensions[1].height = 30
234
-
235
  for row in ws.iter_rows(min_row=2):
236
  for cell in row:
237
  cell.alignment = Alignment(vertical="center", wrap_text=True)
238
  if cell.row % 2 == 0:
239
  cell.fill = PatternFill("solid", start_color="F0F4FF")
240
-
241
  ws.freeze_panes = "A2"
242
  out = io.BytesIO()
243
  wb.save(out)
@@ -253,7 +258,7 @@ for k, v in [("records",[]),("excel",None),("done",False),("errors",[])]:
253
 
254
 
255
  # ═══════════════════════════════════════════════════════════
256
- # UI — TOP BAR
257
  # ═══════════════════════════════════════════════════════════
258
  st.markdown("""
259
  <div class="topbar">
@@ -268,10 +273,6 @@ st.markdown("""
268
  </div>
269
  """, unsafe_allow_html=True)
270
 
271
-
272
- # ═══════════════════════════════════════════════════════════
273
- # UI — HEADLINE
274
- # ═══════════════════════════════════════════════════════════
275
  st.markdown("""
276
  <div class="headline">
277
  Digitise handwritten<br>valuation sheets <em>instantly.</em>
@@ -283,10 +284,6 @@ st.markdown("""
283
  </div>
284
  """, unsafe_allow_html=True)
285
 
286
-
287
- # ═══════════════════════════════════════════════════════════
288
- # UI — UPLOAD
289
- # ═══════════════════════════════════════════════════════════
290
  st.markdown('<div class="step">Step 1 β€” Upload Documents</div>', unsafe_allow_html=True)
291
 
292
  uploaded = st.file_uploader(
@@ -333,17 +330,18 @@ if run and uploaded:
333
  try:
334
  ext = fname.lower().rsplit(".", 1)[-1]
335
 
336
- # Get page images
337
  if ext == "pdf":
338
  imgs = pdf_to_images(raw)
339
  else:
340
  imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
341
 
342
- # Run OCR on each page
 
 
 
343
  for pi, img in enumerate(imgs, 1):
344
  status.caption(
345
- f"Running recognition on **{fname}** β€” "
346
- f"page {pi} of {len(imgs)}"
347
  )
348
  raw_text = ocr_page(img)
349
  record = parse_label(raw_text, fname)
@@ -374,45 +372,25 @@ if run and uploaded:
374
  if st.session_state.done and st.session_state.records:
375
  records = st.session_state.records
376
  df = pd.DataFrame(records)
377
-
378
- # Display columns — exclude raw output from table
379
  display_cols = [c for c in df.columns if c != "Raw Output"]
380
  df_display = df[display_cols]
381
 
382
- # Stats
383
  n_plots = df["Plot Number"].astype(bool).sum()
384
  n_amounts = pd.to_numeric(df["Amount (KES)"], errors="coerce").notna().sum()
385
  n_dates = df["Date"].astype(bool).sum()
386
 
387
  st.markdown(f"""
388
  <div class="stats-strip">
389
- <div class="stat-item">
390
- <div class="stat-n">{len(records)}</div>
391
- <div class="stat-l">Pages processed</div>
392
- </div>
393
- <div class="stat-item">
394
- <div class="stat-n">{n_plots}</div>
395
- <div class="stat-l">Plot numbers</div>
396
- </div>
397
- <div class="stat-item">
398
- <div class="stat-n">{n_amounts}</div>
399
- <div class="stat-l">Amounts extracted</div>
400
- </div>
401
- <div class="stat-item">
402
- <div class="stat-n">{n_dates}</div>
403
- <div class="stat-l">Dates captured</div>
404
- </div>
405
  </div>
406
  """, unsafe_allow_html=True)
407
 
408
- # Table header + download side by side
409
  col_t, col_d = st.columns([5, 1])
410
  with col_t:
411
- st.markdown("""
412
- <div class="section-head">
413
- <div class="section-title">Extracted Records</div>
414
- </div>
415
- """, unsafe_allow_html=True)
416
  with col_d:
417
  st.markdown('<div style="padding-top:0.3rem"></div>', unsafe_allow_html=True)
418
  if st.session_state.excel:
@@ -423,16 +401,10 @@ if st.session_state.done and st.session_state.records:
423
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
424
  )
425
 
426
- # Dataframe
427
- st.dataframe(
428
- df_display,
429
- use_container_width=True,
430
- height=min(80 + len(df) * 38, 560),
431
- hide_index=True,
432
- )
433
 
434
- # Raw OCR output — for verification/debugging
435
- with st.expander("πŸ” View raw model output (for verification)"):
436
  for r in records:
437
  st.markdown(
438
  f'<div style="font-family:monospace;font-size:0.78rem;'
@@ -441,8 +413,7 @@ if st.session_state.done and st.session_state.records:
441
  unsafe_allow_html=True,
442
  )
443
 
444
- # Errors
445
  if st.session_state.errors:
446
- with st.expander(f"⚠ {len(st.session_state.errors)} file(s) could not be processed"):
447
  for e in st.session_state.errors:
448
- st.caption(e)
 
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
4
 
5
+ PDF processing matches notebook exactly:
6
+ - fitz opened via temp file (not stream) matching how training data was built
7
+ - Matrix(200/72, 200/72) — same DPI as training
8
+ - get_pixmap(matrix=mat, alpha=False) — same as training
9
+ - Image.open(...).convert('RGB') — same as training
10
+
11
+ Inference matches notebook exactly:
12
+ - processor(images=img.convert('RGB'), return_tensors='pt').pixel_values
13
+ - model.generate(pixel_values=pv, max_new_tokens=64, num_beams=1)
14
+
15
  Label format from training:
16
+ - PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
17
  """
18
+ import io, time, logging, tempfile, os
19
  import streamlit as st
20
  import pandas as pd
21
  from PIL import Image
 
43
  }
44
  #MainMenu, footer, header { visibility: hidden; }
45
 
 
46
  .topbar {
47
  display: flex; align-items: flex-end;
48
  justify-content: space-between;
 
55
  .model-ref { font-size: 0.7rem; color: #9CA3AF; font-weight: 400; letter-spacing: 0.04em; text-align: right; }
56
  .model-ref strong { color: #2563EB; font-weight: 600; }
57
 
 
58
  .headline { font-family: 'Cormorant Garamond', serif; font-size: 3.4rem; font-weight: 700; line-height: 1.08; letter-spacing: -0.03em; color: #1A1A2E; margin-bottom: 1rem; max-width: 700px; }
59
  .headline em { font-style: italic; color: #2563EB; }
60
  .subline { font-size: 0.95rem; font-weight: 300; color: #6B7280; line-height: 1.7; max-width: 500px; margin-bottom: 3rem; }
61
 
 
62
  .step { font-size: 0.65rem; font-weight: 700; letter-spacing: 0.18em; text-transform: uppercase; color: #2563EB; margin-bottom: 0.5rem; }
63
 
 
64
  [data-testid="stFileUploader"] section {
65
  background: #fff !important;
66
  border: 2px dashed #D1D5DB !important;
 
75
  }
76
  [data-testid="stFileUploader"] label { color: #6B7280 !important; font-size: 0.9rem !important; }
77
 
 
78
  .fchip { display: inline-flex; align-items: center; gap: 5px; background: #EFF6FF; border: 1px solid #BFDBFE; color: #1D4ED8; padding: 0.25rem 0.7rem; border-radius: 6px; font-size: 0.73rem; font-weight: 500; margin: 2px; }
79
 
 
80
  .stButton > button {
81
  background: #1A1A2E !important; color: #fff !important; border: none !important;
82
  border-radius: 8px !important; padding: 0.85rem 2.5rem !important;
 
88
  .stButton > button:hover { background: #2563EB !important; box-shadow: 0 4px 16px rgba(37,99,235,0.3) !important; transform: translateY(-1px) !important; }
89
  .stButton > button:disabled { background: #E5E7EB !important; color: #9CA3AF !important; box-shadow: none !important; transform: none !important; }
90
 
 
91
  .stProgress > div > div > div { background: #2563EB !important; border-radius: 4px !important; }
92
  .stProgress > div > div { background: #E5E7EB !important; border-radius: 4px !important; height: 4px !important; }
93
 
 
94
  .stats-strip { display: flex; background: #1A1A2E; border-radius: 12px; overflow: hidden; margin: 2.5rem 0 2rem; }
95
  .stat-item { flex: 1; padding: 1.6rem 2rem; border-right: 1px solid rgba(255,255,255,0.08); }
96
  .stat-item:last-child { border-right: none; }
97
  .stat-n { font-family: 'Cormorant Garamond', serif; font-size: 2.6rem; font-weight: 700; color: #fff; line-height: 1; margin-bottom: 0.3rem; }
98
  .stat-l { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.12em; text-transform: uppercase; color: #6B7280; }
99
 
 
100
  .section-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid #E5E7EB; }
101
  .section-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 600; color: #1A1A2E; letter-spacing: -0.01em; }
102
 
 
103
  div[data-testid="stDownloadButton"] > button {
104
  background: #fff !important; border: 1.5px solid #1A1A2E !important; color: #1A1A2E !important;
105
  border-radius: 8px !important; padding: 0.6rem 1.4rem !important;
 
109
  }
110
  div[data-testid="stDownloadButton"] > button:hover { background: #1A1A2E !important; color: #fff !important; }
111
 
 
112
  [data-testid="stDataFrame"] { border-radius: 10px !important; border: 1px solid #E5E7EB !important; overflow: hidden !important; box-shadow: 0 1px 4px rgba(0,0,0,0.05) !important; }
113
  </style>
114
  """, unsafe_allow_html=True)
115
 
116
 
117
  # ═══════════════════════════════════════════════════════════
118
+ # MODEL — matches notebook Cell 13 + Cell 28
119
  # ═══════════════════════════════════════════════════════════
120
  @st.cache_resource(show_spinner="Loading recognition model…")
121
  def load_model():
 
130
 
131
 
132
  # ═══════════════════════════════════════════════════════════
133
+ # PDF → IMAGES — matches notebook Cell 10 exactly
134
+ # Uses a temp file, not a stream — same as training
135
+ # Matrix(200/72, 200/72), get_pixmap(alpha=False), convert('RGB')
136
+ # ═══════════════════════════════════════════════════════════
137
+ def pdf_to_images(file_bytes: bytes) -> list:
138
+ import fitz
139
+ images = []
140
+ # Write to temp file — same as training which used file paths
141
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
142
+ tmp.write(file_bytes)
143
+ tmp_path = tmp.name
144
+ try:
145
+ doc = fitz.open(tmp_path) # open from path like training
146
+ mat = fitz.Matrix(200/72, 200/72) # same DPI as training
147
+ for page in doc:
148
+ pix = page.get_pixmap(matrix=mat, alpha=False) # same as training
149
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB") # same as training
150
+ images.append(img)
151
+ pix = None # free memory immediately like training
152
+ doc.close()
153
+ finally:
154
+ os.unlink(tmp_path)
155
+ return images
156
+
157
+
158
+ # ═══════════════════════════════════════════════════════════
159
+ # OCR — matches notebook Cell 18 + Cell 20 inference
160
+ # processor(images=img.convert('RGB')) then model.generate
161
+ # max_new_tokens=64, num_beams=1 (greedy — fast)
162
  # ═══════════════════════════════════════════════════════════
163
  def ocr_page(img: Image.Image) -> str:
 
164
  import torch
165
  processor, model, device = load_model()
166
+ # Exactly as in ValuationDataset.__getitem__
167
+ pixel_values = processor(
168
  images=img.convert("RGB"),
169
  return_tensors="pt"
170
  ).pixel_values.to(device)
171
+
172
  with torch.no_grad():
173
+ generated = model.generate(
174
+ pixel_values=pixel_values,
175
+ max_new_tokens=64,
176
+ num_beams=1, # greedy β€” fast, matches validation in notebook
177
+ )
178
+ return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()
179
 
180
 
181
+ # ═══════════════════════════════════════════════════════════
182
+ # PARSE LABEL — matches row_to_label() from notebook Cell 10
183
+ # Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
184
+ # ═══════════════════════════════════════════════════════════
185
  def parse_label(raw_text: str, filename: str) -> dict:
 
 
 
 
186
  record = {
187
  "File": filename,
188
  "Plot Number": "",
 
193
  "VOS": "",
194
  "Raw Output": raw_text,
195
  }
196
+ for part in raw_text.split("|"):
 
 
 
197
  part = part.strip()
198
  if ":" not in part:
199
  continue
200
  key, _, val = part.partition(":")
201
  key = key.strip().upper()
202
  val = val.strip()
 
203
  if key == "PLOT":
204
  record["Plot Number"] = val
205
  elif key == "LOC":
 
207
  elif key == "AREA":
208
  record["Area"] = val
209
  elif key == "AMT":
 
210
  try:
211
  record["Amount (KES)"] = int(val.replace(",", "").replace(" ", ""))
212
  except ValueError:
 
215
  record["Date"] = val
216
  elif key == "VOS":
217
  record["VOS"] = val
 
218
  return record
219
 
220
 
221
+ # ═══════════════════════════════════════════════════════════
222
+ # EXCEL EXPORT
223
+ # ═══════════════════════════════════════════════════════════
 
 
 
 
 
 
 
 
 
 
224
  def make_excel(records: list) -> bytes:
 
225
  from openpyxl import load_workbook
226
  from openpyxl.styles import Font, PatternFill, Alignment
227
  from openpyxl.utils import get_column_letter
 
 
228
  clean = [{k: v for k, v in r.items() if k != "Raw Output"} for r in records]
 
229
  buf = io.BytesIO()
230
  pd.DataFrame(clean).to_excel(buf, index=False, sheet_name="Valuation Data")
231
  buf.seek(0)
 
232
  wb = load_workbook(buf)
233
  ws = wb.active
234
  hdr = PatternFill("solid", start_color="1A1A2E")
 
235
  for ci, cell in enumerate(ws[1], 1):
236
  cell.font = Font(name="Calibri", bold=True, color="FFFFFF", size=11)
237
  cell.fill = hdr
238
  cell.alignment = Alignment(horizontal="center", vertical="center")
239
  ws.column_dimensions[get_column_letter(ci)].width = 26
 
240
  ws.row_dimensions[1].height = 30
 
241
  for row in ws.iter_rows(min_row=2):
242
  for cell in row:
243
  cell.alignment = Alignment(vertical="center", wrap_text=True)
244
  if cell.row % 2 == 0:
245
  cell.fill = PatternFill("solid", start_color="F0F4FF")
 
246
  ws.freeze_panes = "A2"
247
  out = io.BytesIO()
248
  wb.save(out)
 
258
 
259
 
260
  # ═══════════════════════════════════════════════════════════
261
+ # UI
262
  # ═══════════════════════════════════════════════════════════
263
  st.markdown("""
264
  <div class="topbar">
 
273
  </div>
274
  """, unsafe_allow_html=True)
275
 
 
 
 
 
276
  st.markdown("""
277
  <div class="headline">
278
  Digitise handwritten<br>valuation sheets <em>instantly.</em>
 
284
  </div>
285
  """, unsafe_allow_html=True)
286
 
 
 
 
 
287
  st.markdown('<div class="step">Step 1 β€” Upload Documents</div>', unsafe_allow_html=True)
288
 
289
  uploaded = st.file_uploader(
 
330
  try:
331
  ext = fname.lower().rsplit(".", 1)[-1]
332
 
 
333
  if ext == "pdf":
334
  imgs = pdf_to_images(raw)
335
  else:
336
  imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
337
 
338
+ if not imgs:
339
+ st.session_state.errors.append(f"{fname}: no pages could be extracted")
340
+ continue
341
+
342
  for pi, img in enumerate(imgs, 1):
343
  status.caption(
344
+ f"Processing **{fname}** β€” page {pi} of {len(imgs)}"
 
345
  )
346
  raw_text = ocr_page(img)
347
  record = parse_label(raw_text, fname)
 
372
  if st.session_state.done and st.session_state.records:
373
  records = st.session_state.records
374
  df = pd.DataFrame(records)
 
 
375
  display_cols = [c for c in df.columns if c != "Raw Output"]
376
  df_display = df[display_cols]
377
 
 
378
  n_plots = df["Plot Number"].astype(bool).sum()
379
  n_amounts = pd.to_numeric(df["Amount (KES)"], errors="coerce").notna().sum()
380
  n_dates = df["Date"].astype(bool).sum()
381
 
382
  st.markdown(f"""
383
  <div class="stats-strip">
384
+ <div class="stat-item"><div class="stat-n">{len(records)}</div><div class="stat-l">Pages processed</div></div>
385
+ <div class="stat-item"><div class="stat-n">{n_plots}</div><div class="stat-l">Plot numbers</div></div>
386
+ <div class="stat-item"><div class="stat-n">{n_amounts}</div><div class="stat-l">Amounts extracted</div></div>
387
+ <div class="stat-item"><div class="stat-n">{n_dates}</div><div class="stat-l">Dates captured</div></div>
 
 
 
 
 
 
 
 
 
 
 
 
388
  </div>
389
  """, unsafe_allow_html=True)
390
 
 
391
  col_t, col_d = st.columns([5, 1])
392
  with col_t:
393
+ st.markdown('<div class="section-head"><div class="section-title">Extracted Records</div></div>', unsafe_allow_html=True)
 
 
 
 
394
  with col_d:
395
  st.markdown('<div style="padding-top:0.3rem"></div>', unsafe_allow_html=True)
396
  if st.session_state.excel:
 
401
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
402
  )
403
 
404
+ st.dataframe(df_display, use_container_width=True,
405
+ height=min(80 + len(df)*38, 560), hide_index=True)
 
 
 
 
 
406
 
407
+ with st.expander("πŸ” Raw model output (for verification)"):
 
408
  for r in records:
409
  st.markdown(
410
  f'<div style="font-family:monospace;font-size:0.78rem;'
 
413
  unsafe_allow_html=True,
414
  )
415
 
 
416
  if st.session_state.errors:
417
+ with st.expander(f"⚠ {len(st.session_state.errors)} file(s) could not be processed"):
418
  for e in st.session_state.errors:
419
+ st.caption(e)