rasmodev committed on
Commit
dffffa4
·
verified ·
1 Parent(s): 9139789

Upload 3 files

Browse files
Files changed (2) hide show
  1. Dockerfile +9 -16
  2. app.py +184 -252
Dockerfile CHANGED
@@ -1,21 +1,13 @@
1
- # ─────────────────────────────────────────────────────────
2
- # ValuationAI OCR — Docker Image
3
- # Model: rasmodev/Handwriting_trocr_model
4
- # ─────────────────────────────────────────────────────────
5
  FROM python:3.11-slim
6
 
7
  RUN apt-get update && apt-get install -y --no-install-recommends \
8
- libgl1 \
9
- libglib2.0-0 \
10
- libsm6 \
11
- libxext6 \
12
- libxrender-dev \
13
- curl \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
  RUN useradd -m -u 1000 appuser
17
  WORKDIR /app
18
 
 
19
  RUN pip install --no-cache-dir \
20
  torch==2.2.2+cpu torchvision==0.17.2+cpu \
21
  --extra-index-url https://download.pytorch.org/whl/cpu
@@ -25,17 +17,18 @@ RUN pip install --no-cache-dir -r requirements.txt
25
 
26
  COPY app.py .
27
 
28
- ENV HF_HOME=/app/.cache/huggingface
29
- RUN mkdir -p /app/.cache/huggingface && chown -R appuser:appuser /app
 
 
 
30
 
31
  USER appuser
32
  EXPOSE 7860
33
 
34
- HEALTHCHECK --interval=30s --timeout=10s --start-period=90s \
35
- CMD curl -f http://localhost:7860/_stcore/health || exit 1
36
-
37
  CMD ["streamlit", "run", "app.py", \
38
  "--server.port=7860", \
39
  "--server.address=0.0.0.0", \
40
  "--server.headless=true", \
41
- "--browser.gatherUsageStats=false"]
 
 
 
 
 
 
1
  FROM python:3.11-slim
2
 
3
  RUN apt-get update && apt-get install -y --no-install-recommends \
4
+ libgl1 libglib2.0-0 libsm6 libxext6 libxrender-dev curl \
 
 
 
 
 
5
  && rm -rf /var/lib/apt/lists/*
6
 
7
  RUN useradd -m -u 1000 appuser
8
  WORKDIR /app
9
 
10
+ # CPU PyTorch
11
  RUN pip install --no-cache-dir \
12
  torch==2.2.2+cpu torchvision==0.17.2+cpu \
13
  --extra-index-url https://download.pytorch.org/whl/cpu
 
17
 
18
  COPY app.py .
19
 
20
+ # Cache directories writable by non-root user
21
+ ENV HF_HOME=/tmp/huggingface
22
+ ENV TRANSFORMERS_CACHE=/tmp/huggingface/transformers
23
+ ENV HF_DATASETS_CACHE=/tmp/huggingface/datasets
24
+ RUN mkdir -p /tmp/huggingface && chmod -R 777 /tmp/huggingface
25
 
26
  USER appuser
27
  EXPOSE 7860
28
 
 
 
 
29
  CMD ["streamlit", "run", "app.py", \
30
  "--server.port=7860", \
31
  "--server.address=0.0.0.0", \
32
  "--server.headless=true", \
33
+ "--server.fileWatcherType=none", \
34
+ "--browser.gatherUsageStats=false"]
app.py CHANGED
@@ -1,6 +1,9 @@
1
  """
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
 
 
 
4
  """
5
  import io, time, logging
6
  import streamlit as st
@@ -17,81 +20,41 @@ logging.basicConfig(level=logging.INFO)
17
 
18
  st.markdown("""
19
  <style>
20
- @import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;600;700&family=Inter:wght@300;400;500;600&display=swap');
21
 
22
  html, body, [class*="css"], .stApp {
23
  font-family: 'Inter', sans-serif;
24
  background: #F8F7F4;
25
  color: #1A1A2E;
26
  }
27
-
28
  .block-container {
29
  padding: 3rem 4rem !important;
30
  max-width: 1100px !important;
31
  }
32
-
33
  #MainMenu, footer, header { visibility: hidden; }
34
 
35
- /* ── Top bar ── */
36
  .topbar {
37
- display: flex;
38
- align-items: flex-end;
39
  justify-content: space-between;
40
- padding-bottom: 2rem;
41
- margin-bottom: 3rem;
42
  border-bottom: 2px solid #1A1A2E;
43
  }
44
- .logo {
45
- font-family: 'Cormorant Garamond', serif;
46
- font-size: 1.8rem;
47
- font-weight: 700;
48
- color: #1A1A2E;
49
- letter-spacing: -0.02em;
50
- line-height: 1;
51
- }
52
  .logo span { color: #2563EB; }
53
- .logo-sub {
54
- font-size: 0.68rem;
55
- font-weight: 500;
56
- letter-spacing: 0.15em;
57
- text-transform: uppercase;
58
- color: #9CA3AF;
59
- margin-top: 0.3rem;
60
- }
61
- .model-ref {
62
- font-size: 0.7rem;
63
- color: #9CA3AF;
64
- font-weight: 400;
65
- letter-spacing: 0.04em;
66
- text-align: right;
67
- }
68
  .model-ref strong { color: #2563EB; font-weight: 600; }
69
 
70
- /* ── Hero headline ── */
71
- .headline {
72
- font-family: 'Cormorant Garamond', serif;
73
- font-size: 3.4rem;
74
- font-weight: 700;
75
- line-height: 1.08;
76
- letter-spacing: -0.03em;
77
- color: #1A1A2E;
78
- margin-bottom: 1rem;
79
- max-width: 700px;
80
- }
81
- .headline em {
82
- font-style: italic;
83
- color: #2563EB;
84
- }
85
- .subline {
86
- font-size: 0.95rem;
87
- font-weight: 300;
88
- color: #6B7280;
89
- line-height: 1.7;
90
- max-width: 500px;
91
- margin-bottom: 3rem;
92
- }
93
 
94
- /* ── Upload area ── */
95
  [data-testid="stFileUploader"] section {
96
  background: #fff !important;
97
  border: 2px dashed #D1D5DB !important;
@@ -104,166 +67,56 @@ html, body, [class*="css"], .stApp {
104
  border-color: #2563EB !important;
105
  box-shadow: 0 0 0 4px rgba(37,99,235,0.06) !important;
106
  }
107
- [data-testid="stFileUploader"] label {
108
- color: #6B7280 !important;
109
- font-size: 0.9rem !important;
110
- }
111
- [data-testid="stFileUploadDropzone"] p {
112
- color: #6B7280 !important;
113
- }
114
 
115
- /* ── Run button ── */
116
  .stButton > button {
117
- background: #1A1A2E !important;
118
- color: #fff !important;
119
- border: none !important;
120
- border-radius: 8px !important;
121
- padding: 0.85rem 2.5rem !important;
122
- font-family: 'Inter', sans-serif !important;
123
- font-size: 0.88rem !important;
124
- font-weight: 600 !important;
125
- letter-spacing: 0.04em !important;
126
- text-transform: uppercase !important;
127
- transition: all 0.2s !important;
128
- box-shadow: 0 2px 8px rgba(26,26,46,0.2) !important;
129
- width: 100% !important;
130
- }
131
- .stButton > button:hover {
132
- background: #2563EB !important;
133
- box-shadow: 0 4px 16px rgba(37,99,235,0.3) !important;
134
- transform: translateY(-1px) !important;
135
- }
136
- .stButton > button:disabled {
137
- background: #E5E7EB !important;
138
- color: #9CA3AF !important;
139
- box-shadow: none !important;
140
- transform: none !important;
141
- cursor: not-allowed !important;
142
  }
 
 
143
 
144
- /* ── Progress ── */
145
- .stProgress > div > div > div {
146
- background: #2563EB !important;
147
- border-radius: 4px !important;
148
- }
149
- .stProgress > div > div {
150
- background: #E5E7EB !important;
151
- border-radius: 4px !important;
152
- height: 4px !important;
153
- }
154
 
155
- /* ── Stats strip ── */
156
- .stats-strip {
157
- display: flex;
158
- gap: 0;
159
- background: #1A1A2E;
160
- border-radius: 12px;
161
- overflow: hidden;
162
- margin: 2.5rem 0 2rem;
163
- }
164
- .stat-item {
165
- flex: 1;
166
- padding: 1.6rem 2rem;
167
- border-right: 1px solid rgba(255,255,255,0.08);
168
- text-align: left;
169
- }
170
  .stat-item:last-child { border-right: none; }
171
- .stat-n {
172
- font-family: 'Cormorant Garamond', serif;
173
- font-size: 2.6rem;
174
- font-weight: 700;
175
- color: #fff;
176
- line-height: 1;
177
- margin-bottom: 0.3rem;
178
- }
179
- .stat-l {
180
- font-size: 0.68rem;
181
- font-weight: 500;
182
- letter-spacing: 0.12em;
183
- text-transform: uppercase;
184
- color: #6B7280;
185
- }
186
 
187
- /* ── Section heading ── */
188
- .section-head {
189
- display: flex;
190
- align-items: center;
191
- justify-content: space-between;
192
- margin-bottom: 1rem;
193
- padding-bottom: 0.75rem;
194
- border-bottom: 1px solid #E5E7EB;
195
- }
196
- .section-title {
197
- font-family: 'Cormorant Garamond', serif;
198
- font-size: 1.5rem;
199
- font-weight: 600;
200
- color: #1A1A2E;
201
- letter-spacing: -0.01em;
202
- }
203
 
204
- /* ── Download button ── */
205
  div[data-testid="stDownloadButton"] > button {
206
- background: #fff !important;
207
- border: 1.5px solid #1A1A2E !important;
208
- color: #1A1A2E !important;
209
- border-radius: 8px !important;
210
- padding: 0.6rem 1.4rem !important;
211
- font-family: 'Inter', sans-serif !important;
212
- font-weight: 600 !important;
213
- font-size: 0.82rem !important;
214
- letter-spacing: 0.04em !important;
215
- text-transform: uppercase !important;
216
- transition: all 0.2s !important;
217
- box-shadow: none !important;
218
- width: auto !important;
219
- }
220
- div[data-testid="stDownloadButton"] > button:hover {
221
- background: #1A1A2E !important;
222
- color: #fff !important;
223
  }
 
224
 
225
- /* ── Dataframe ── */
226
- [data-testid="stDataFrame"] {
227
- border-radius: 10px !important;
228
- border: 1px solid #E5E7EB !important;
229
- overflow: hidden !important;
230
- box-shadow: 0 1px 4px rgba(0,0,0,0.05) !important;
231
- }
232
-
233
- /* ── File chip ── */
234
- .fchip {
235
- display: inline-flex; align-items: center; gap: 5px;
236
- background: #EFF6FF;
237
- border: 1px solid #BFDBFE;
238
- color: #1D4ED8;
239
- padding: 0.25rem 0.7rem;
240
- border-radius: 6px;
241
- font-size: 0.73rem;
242
- font-weight: 500;
243
- margin: 2px;
244
- }
245
-
246
- /* ── Divider ── */
247
- .rule { height:1px; background:#E5E7EB; margin: 2.5rem 0; }
248
-
249
- /* ── Step tag ── */
250
- .step {
251
- font-size: 0.65rem; font-weight: 700;
252
- letter-spacing: 0.18em; text-transform: uppercase;
253
- color: #2563EB; margin-bottom: 0.5rem;
254
- }
255
-
256
- /* ── Success alert ── */
257
- [data-testid="stAlert"][data-baseweb="notification"] {
258
- border-radius: 10px !important;
259
- border-left: 3px solid #2563EB !important;
260
- }
261
  </style>
262
  """, unsafe_allow_html=True)
263
 
264
 
265
  # ═══════════════════════════════════════════════════════════
266
- # MODEL (cached — loads once)
267
  # ═══════════════════════════════════════════════════════════
268
  @st.cache_resource(show_spinner="Loading recognition model…")
269
  def load_model():
@@ -278,67 +131,116 @@ def load_model():
278
 
279
 
280
  # ═══════════════════════════════════════════════════════════
281
- # HELPERS
 
 
282
  # ═══════════════════════════════════════════════════════════
283
- def ocr(img: Image.Image) -> str:
 
284
  import torch
285
  processor, model, device = load_model()
286
- pv = processor(images=img.convert("RGB"), return_tensors="pt").pixel_values.to(device)
 
 
 
287
  with torch.no_grad():
288
- gen = model.generate(pv, max_new_tokens=64)
289
  return processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
290
 
291
- def pdf_pages(file_bytes: bytes) -> list:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  import fitz
293
  doc = fitz.open(stream=file_bytes, filetype="pdf")
294
  mat = fitz.Matrix(200/72, 200/72)
295
- out = []
296
  for i in range(len(doc)):
297
  pix = doc[i].get_pixmap(matrix=mat, alpha=False)
298
- out.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
299
  doc.close()
300
- return out
301
-
302
- def extract(text: str, filename: str) -> dict:
303
- import re
304
- def g(label):
305
- m = re.search(label + r'[:\s]+([^\|\n]{1,80})', text, re.IGNORECASE)
306
- return m.group(1).strip() if m else ""
307
- amounts = re.findall(r'AMT:\s*([\d,]+)', text)
308
- date_m = re.search(r'DATE:\s*(\d{4}-\d{2}-\d{2})', text)
309
- return {
310
- "File": filename,
311
- "Plot Number": g("PLOT"),
312
- "Location": g("LOC"),
313
- "Area": g("AREA"),
314
- "Amount (KES)": int(amounts[0].replace(",","")) if amounts else None,
315
- "Date": date_m.group(1) if date_m else None,
316
- "VOS": g("VOS"),
317
- }
318
 
319
  def make_excel(records: list) -> bytes:
 
320
  from openpyxl import load_workbook
321
  from openpyxl.styles import Font, PatternFill, Alignment
322
  from openpyxl.utils import get_column_letter
 
 
 
 
323
  buf = io.BytesIO()
324
- pd.DataFrame(records).to_excel(buf, index=False, sheet_name="Valuation Data")
325
  buf.seek(0)
 
326
  wb = load_workbook(buf)
327
  ws = wb.active
328
- hdr_fill = PatternFill("solid", start_color="1A1A2E")
 
329
  for ci, cell in enumerate(ws[1], 1):
330
  cell.font = Font(name="Calibri", bold=True, color="FFFFFF", size=11)
331
- cell.fill = hdr_fill
332
  cell.alignment = Alignment(horizontal="center", vertical="center")
333
- ws.column_dimensions[get_column_letter(ci)].width = 24
 
334
  ws.row_dimensions[1].height = 30
 
335
  for row in ws.iter_rows(min_row=2):
336
  for cell in row:
337
  cell.alignment = Alignment(vertical="center", wrap_text=True)
338
  if cell.row % 2 == 0:
339
  cell.fill = PatternFill("solid", start_color="F0F4FF")
 
340
  ws.freeze_panes = "A2"
341
- out = io.BytesIO(); wb.save(out)
 
342
  return out.getvalue()
343
 
344
 
@@ -351,7 +253,7 @@ for k, v in [("records",[]),("excel",None),("done",False),("errors",[])]:
351
 
352
 
353
  # ═══════════════════════════════════════════════════════════
354
- # TOP BAR
355
  # ═══════════════════════════════════════════════════════════
356
  st.markdown("""
357
  <div class="topbar">
@@ -368,7 +270,7 @@ st.markdown("""
368
 
369
 
370
  # ═══════════════════════════════════════════════════════════
371
- # HEADLINE
372
  # ═══════════════════════════════════════════════════════════
373
  st.markdown("""
374
  <div class="headline">
@@ -383,7 +285,7 @@ st.markdown("""
383
 
384
 
385
  # ═══════════════════════════════════════════════════════════
386
- # UPLOAD
387
  # ═══════════════════════════════════════════════════════════
388
  st.markdown('<div class="step">Step 1 β€” Upload Documents</div>', unsafe_allow_html=True)
389
 
@@ -424,30 +326,41 @@ if run and uploaded:
424
  t0 = time.time()
425
 
426
  for fi, uf in enumerate(uploaded):
427
- fname = uf.name
428
- raw = uf.read()
429
  bar.progress(fi / len(uploaded), text=f"Reading {fname}…")
430
 
431
  try:
432
  ext = fname.lower().rsplit(".", 1)[-1]
433
- imgs = pdf_pages(raw) if ext == "pdf" \
434
- else [Image.open(io.BytesIO(raw)).convert("RGB")]
435
 
 
 
 
 
 
 
 
436
  for pi, img in enumerate(imgs, 1):
437
- status.caption(f"Processing **{fname}** β€” page {pi} of {len(imgs)}")
438
- text = ocr(img)
439
- st.session_state.records.append(extract(text, fname))
 
 
 
 
440
 
441
  except Exception as e:
442
  st.session_state.errors.append(f"{fname}: {e}")
443
 
444
- bar.progress((fi+1) / len(uploaded))
445
 
446
- bar.empty(); status.empty()
447
- st.session_state.excel = make_excel(st.session_state.records) \
448
- if st.session_state.records else None
449
- st.session_state.done = True
450
 
 
 
 
 
451
  elapsed = time.time() - t0
452
  st.success(
453
  f"Processed {len(st.session_state.records)} page(s) "
@@ -459,17 +372,22 @@ if run and uploaded:
459
  # RESULTS
460
  # ═══════════════════════════════════════════════════════════
461
  if st.session_state.done and st.session_state.records:
462
- df = pd.DataFrame(st.session_state.records)
 
 
 
 
 
463
 
464
- # Stats strip
465
  n_plots = df["Plot Number"].astype(bool).sum()
466
- n_amounts = df["Amount (KES)"].notna().sum()
467
  n_dates = df["Date"].astype(bool).sum()
468
 
469
  st.markdown(f"""
470
  <div class="stats-strip">
471
  <div class="stat-item">
472
- <div class="stat-n">{len(df)}</div>
473
  <div class="stat-l">Pages processed</div>
474
  </div>
475
  <div class="stat-item">
@@ -487,7 +405,7 @@ if st.session_state.done and st.session_state.records:
487
  </div>
488
  """, unsafe_allow_html=True)
489
 
490
- # Table header + download
491
  col_t, col_d = st.columns([5, 1])
492
  with col_t:
493
  st.markdown("""
@@ -505,9 +423,23 @@ if st.session_state.done and st.session_state.records:
505
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
506
  )
507
 
508
- # Table
509
- st.dataframe(df, use_container_width=True,
510
- height=min(80 + len(df)*38, 560), hide_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
  # Errors
513
  if st.session_state.errors:
 
1
  """
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
4
+
5
+ Label format from training:
6
+ PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
7
  """
8
  import io, time, logging
9
  import streamlit as st
 
20
 
21
  st.markdown("""
22
  <style>
23
+ @import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,600;0,700;1,600&family=Inter:wght@300;400;500;600&display=swap');
24
 
25
  html, body, [class*="css"], .stApp {
26
  font-family: 'Inter', sans-serif;
27
  background: #F8F7F4;
28
  color: #1A1A2E;
29
  }
 
30
  .block-container {
31
  padding: 3rem 4rem !important;
32
  max-width: 1100px !important;
33
  }
 
34
  #MainMenu, footer, header { visibility: hidden; }
35
 
36
+ /* Top bar */
37
  .topbar {
38
+ display: flex; align-items: flex-end;
 
39
  justify-content: space-between;
40
+ padding-bottom: 2rem; margin-bottom: 3rem;
 
41
  border-bottom: 2px solid #1A1A2E;
42
  }
43
+ .logo { font-family: 'Cormorant Garamond', serif; font-size: 1.8rem; font-weight: 700; color: #1A1A2E; letter-spacing: -0.02em; line-height: 1; }
 
 
 
 
 
 
 
44
  .logo span { color: #2563EB; }
45
+ .logo-sub { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.15em; text-transform: uppercase; color: #9CA3AF; margin-top: 0.3rem; }
46
+ .model-ref { font-size: 0.7rem; color: #9CA3AF; font-weight: 400; letter-spacing: 0.04em; text-align: right; }
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  .model-ref strong { color: #2563EB; font-weight: 600; }
48
 
49
+ /* Headline */
50
+ .headline { font-family: 'Cormorant Garamond', serif; font-size: 3.4rem; font-weight: 700; line-height: 1.08; letter-spacing: -0.03em; color: #1A1A2E; margin-bottom: 1rem; max-width: 700px; }
51
+ .headline em { font-style: italic; color: #2563EB; }
52
+ .subline { font-size: 0.95rem; font-weight: 300; color: #6B7280; line-height: 1.7; max-width: 500px; margin-bottom: 3rem; }
53
+
54
+ /* Step label */
55
+ .step { font-size: 0.65rem; font-weight: 700; letter-spacing: 0.18em; text-transform: uppercase; color: #2563EB; margin-bottom: 0.5rem; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ /* File uploader */
58
  [data-testid="stFileUploader"] section {
59
  background: #fff !important;
60
  border: 2px dashed #D1D5DB !important;
 
67
  border-color: #2563EB !important;
68
  box-shadow: 0 0 0 4px rgba(37,99,235,0.06) !important;
69
  }
70
+ [data-testid="stFileUploader"] label { color: #6B7280 !important; font-size: 0.9rem !important; }
71
+
72
+ /* File chip */
73
+ .fchip { display: inline-flex; align-items: center; gap: 5px; background: #EFF6FF; border: 1px solid #BFDBFE; color: #1D4ED8; padding: 0.25rem 0.7rem; border-radius: 6px; font-size: 0.73rem; font-weight: 500; margin: 2px; }
 
 
 
74
 
75
+ /* Button */
76
  .stButton > button {
77
+ background: #1A1A2E !important; color: #fff !important; border: none !important;
78
+ border-radius: 8px !important; padding: 0.85rem 2.5rem !important;
79
+ font-family: 'Inter', sans-serif !important; font-size: 0.88rem !important;
80
+ font-weight: 600 !important; letter-spacing: 0.04em !important;
81
+ text-transform: uppercase !important; transition: all 0.2s !important;
82
+ box-shadow: 0 2px 8px rgba(26,26,46,0.2) !important; width: 100% !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  }
84
+ .stButton > button:hover { background: #2563EB !important; box-shadow: 0 4px 16px rgba(37,99,235,0.3) !important; transform: translateY(-1px) !important; }
85
+ .stButton > button:disabled { background: #E5E7EB !important; color: #9CA3AF !important; box-shadow: none !important; transform: none !important; }
86
 
87
+ /* Progress */
88
+ .stProgress > div > div > div { background: #2563EB !important; border-radius: 4px !important; }
89
+ .stProgress > div > div { background: #E5E7EB !important; border-radius: 4px !important; height: 4px !important; }
 
 
 
 
 
 
 
90
 
91
+ /* Stats */
92
+ .stats-strip { display: flex; background: #1A1A2E; border-radius: 12px; overflow: hidden; margin: 2.5rem 0 2rem; }
93
+ .stat-item { flex: 1; padding: 1.6rem 2rem; border-right: 1px solid rgba(255,255,255,0.08); }
 
 
 
 
 
 
 
 
 
 
 
 
94
  .stat-item:last-child { border-right: none; }
95
+ .stat-n { font-family: 'Cormorant Garamond', serif; font-size: 2.6rem; font-weight: 700; color: #fff; line-height: 1; margin-bottom: 0.3rem; }
96
+ .stat-l { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.12em; text-transform: uppercase; color: #6B7280; }
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
+ /* Section head */
99
+ .section-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid #E5E7EB; }
100
+ .section-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 600; color: #1A1A2E; letter-spacing: -0.01em; }
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ /* Download button */
103
  div[data-testid="stDownloadButton"] > button {
104
+ background: #fff !important; border: 1.5px solid #1A1A2E !important; color: #1A1A2E !important;
105
+ border-radius: 8px !important; padding: 0.6rem 1.4rem !important;
106
+ font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
107
+ font-size: 0.82rem !important; letter-spacing: 0.04em !important;
108
+ text-transform: uppercase !important; transition: all 0.2s !important; width: auto !important;
 
 
 
 
 
 
 
 
 
 
 
 
109
  }
110
+ div[data-testid="stDownloadButton"] > button:hover { background: #1A1A2E !important; color: #fff !important; }
111
 
112
+ /* Dataframe */
113
+ [data-testid="stDataFrame"] { border-radius: 10px !important; border: 1px solid #E5E7EB !important; overflow: hidden !important; box-shadow: 0 1px 4px rgba(0,0,0,0.05) !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  </style>
115
  """, unsafe_allow_html=True)
116
 
117
 
118
  # ═══════════════════════════════════════════════════════════
119
+ # MODEL LOADING
120
  # ═══════════════════════════════════════════════════════════
121
  @st.cache_resource(show_spinner="Loading recognition model…")
122
  def load_model():
 
131
 
132
 
133
  # ═══════════════════════════════════════════════════════════
134
+ # OCR — matches training output format exactly
135
+ # Training label format:
136
+ # PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
137
  # ═══════════════════════════════════════════════════════════
138
+ def ocr_page(img: Image.Image) -> str:
139
+ """Run the fine-tuned model on one page image."""
140
  import torch
141
  processor, model, device = load_model()
142
+ pv = processor(
143
+ images=img.convert("RGB"),
144
+ return_tensors="pt"
145
+ ).pixel_values.to(device)
146
  with torch.no_grad():
147
+ gen = model.generate(pv, max_new_tokens=128)
148
  return processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
149
 
150
+
151
+ def parse_label(raw_text: str, filename: str) -> dict:
152
+ """
153
+ Parse the pipe-delimited label that the model was trained to output.
154
+ Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
155
+ """
156
+ record = {
157
+ "File": filename,
158
+ "Plot Number": "",
159
+ "Location": "",
160
+ "Area": "",
161
+ "Amount (KES)": None,
162
+ "Date": "",
163
+ "VOS": "",
164
+ "Raw Output": raw_text,
165
+ }
166
+
167
+ # Split on pipe delimiter
168
+ parts = raw_text.split("|")
169
+ for part in parts:
170
+ part = part.strip()
171
+ if ":" not in part:
172
+ continue
173
+ key, _, val = part.partition(":")
174
+ key = key.strip().upper()
175
+ val = val.strip()
176
+
177
+ if key == "PLOT":
178
+ record["Plot Number"] = val
179
+ elif key == "LOC":
180
+ record["Location"] = val
181
+ elif key == "AREA":
182
+ record["Area"] = val
183
+ elif key == "AMT":
184
+ # Remove commas and convert to int
185
+ try:
186
+ record["Amount (KES)"] = int(val.replace(",", "").replace(" ", ""))
187
+ except ValueError:
188
+ record["Amount (KES)"] = val
189
+ elif key == "DATE":
190
+ record["Date"] = val
191
+ elif key == "VOS":
192
+ record["VOS"] = val
193
+
194
+ return record
195
+
196
+
197
+ def pdf_to_images(file_bytes: bytes) -> list:
198
+ """Convert all pages of a PDF to PIL images at 200 DPI."""
199
  import fitz
200
  doc = fitz.open(stream=file_bytes, filetype="pdf")
201
  mat = fitz.Matrix(200/72, 200/72)
202
+ imgs = []
203
  for i in range(len(doc)):
204
  pix = doc[i].get_pixmap(matrix=mat, alpha=False)
205
+ imgs.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
206
  doc.close()
207
+ return imgs
208
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  def make_excel(records: list) -> bytes:
211
+ """Export records to a formatted Excel workbook."""
212
  from openpyxl import load_workbook
213
  from openpyxl.styles import Font, PatternFill, Alignment
214
  from openpyxl.utils import get_column_letter
215
+
216
+ # Drop raw output from Excel — it's only for debugging
217
+ clean = [{k: v for k, v in r.items() if k != "Raw Output"} for r in records]
218
+
219
  buf = io.BytesIO()
220
+ pd.DataFrame(clean).to_excel(buf, index=False, sheet_name="Valuation Data")
221
  buf.seek(0)
222
+
223
  wb = load_workbook(buf)
224
  ws = wb.active
225
+ hdr = PatternFill("solid", start_color="1A1A2E")
226
+
227
  for ci, cell in enumerate(ws[1], 1):
228
  cell.font = Font(name="Calibri", bold=True, color="FFFFFF", size=11)
229
+ cell.fill = hdr
230
  cell.alignment = Alignment(horizontal="center", vertical="center")
231
+ ws.column_dimensions[get_column_letter(ci)].width = 26
232
+
233
  ws.row_dimensions[1].height = 30
234
+
235
  for row in ws.iter_rows(min_row=2):
236
  for cell in row:
237
  cell.alignment = Alignment(vertical="center", wrap_text=True)
238
  if cell.row % 2 == 0:
239
  cell.fill = PatternFill("solid", start_color="F0F4FF")
240
+
241
  ws.freeze_panes = "A2"
242
+ out = io.BytesIO()
243
+ wb.save(out)
244
  return out.getvalue()
245
 
246
 
 
253
 
254
 
255
  # ═══════════════════════════════════════════════════════════
256
+ # UI — TOP BAR
257
  # ═══════════════════════════════════════════════════════════
258
  st.markdown("""
259
  <div class="topbar">
 
270
 
271
 
272
  # ═══════════════════════════════════════════════════════════
273
+ # UI — HEADLINE
274
  # ═══════════════════════════════════════════════════════════
275
  st.markdown("""
276
  <div class="headline">
 
285
 
286
 
287
  # ═══════════════════════════════════════════════════════════
288
+ # UI — UPLOAD
289
  # ═══════════════════════════════════════════════════════════
290
  st.markdown('<div class="step">Step 1 β€” Upload Documents</div>', unsafe_allow_html=True)
291
 
 
326
  t0 = time.time()
327
 
328
  for fi, uf in enumerate(uploaded):
329
+ fname = uf.name
330
+ raw = uf.read()
331
  bar.progress(fi / len(uploaded), text=f"Reading {fname}…")
332
 
333
  try:
334
  ext = fname.lower().rsplit(".", 1)[-1]
 
 
335
 
336
+ # Get page images
337
+ if ext == "pdf":
338
+ imgs = pdf_to_images(raw)
339
+ else:
340
+ imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
341
+
342
+ # Run OCR on each page
343
  for pi, img in enumerate(imgs, 1):
344
+ status.caption(
345
+ f"Running recognition on **{fname}** β€” "
346
+ f"page {pi} of {len(imgs)}"
347
+ )
348
+ raw_text = ocr_page(img)
349
+ record = parse_label(raw_text, fname)
350
+ st.session_state.records.append(record)
351
 
352
  except Exception as e:
353
  st.session_state.errors.append(f"{fname}: {e}")
354
 
355
+ bar.progress((fi + 1) / len(uploaded))
356
 
357
+ bar.empty()
358
+ status.empty()
 
 
359
 
360
+ if st.session_state.records:
361
+ st.session_state.excel = make_excel(st.session_state.records)
362
+
363
+ st.session_state.done = True
364
  elapsed = time.time() - t0
365
  st.success(
366
  f"Processed {len(st.session_state.records)} page(s) "
 
372
  # RESULTS
373
  # ═══════════════════════════════════════════════════════════
374
  if st.session_state.done and st.session_state.records:
375
+ records = st.session_state.records
376
+ df = pd.DataFrame(records)
377
+
378
+ # Display columns — exclude raw output from table
379
+ display_cols = [c for c in df.columns if c != "Raw Output"]
380
+ df_display = df[display_cols]
381
 
382
+ # Stats
383
  n_plots = df["Plot Number"].astype(bool).sum()
384
+ n_amounts = pd.to_numeric(df["Amount (KES)"], errors="coerce").notna().sum()
385
  n_dates = df["Date"].astype(bool).sum()
386
 
387
  st.markdown(f"""
388
  <div class="stats-strip">
389
  <div class="stat-item">
390
+ <div class="stat-n">{len(records)}</div>
391
  <div class="stat-l">Pages processed</div>
392
  </div>
393
  <div class="stat-item">
 
405
  </div>
406
  """, unsafe_allow_html=True)
407
 
408
+ # Table header + download side by side
409
  col_t, col_d = st.columns([5, 1])
410
  with col_t:
411
  st.markdown("""
 
423
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
424
  )
425
 
426
+ # Dataframe
427
+ st.dataframe(
428
+ df_display,
429
+ use_container_width=True,
430
+ height=min(80 + len(df) * 38, 560),
431
+ hide_index=True,
432
+ )
433
+
434
+ # Raw OCR output — for verification/debugging
435
+ with st.expander("πŸ” View raw model output (for verification)"):
436
+ for r in records:
437
+ st.markdown(
438
+ f'<div style="font-family:monospace;font-size:0.78rem;'
439
+ f'padding:0.5rem 0;border-bottom:1px solid #E5E7EB;color:#374151">'
440
+ f'<strong>{r["File"]}</strong><br>{r.get("Raw Output","")}</div>',
441
+ unsafe_allow_html=True,
442
+ )
443
 
444
  # Errors
445
  if st.session_state.errors: