rasmodev committed on
Commit
d41270e
·
verified ·
1 Parent(s): 2d0dda1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -55
app.py CHANGED
@@ -1,19 +1,6 @@
1
  """
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
4
-
5
- PDF processing matches notebook exactly:
6
- - fitz opened via temp file (not stream) matching how training data was built
7
- - Matrix(200/72, 200/72) — same DPI as training
8
- - get_pixmap(matrix=mat, alpha=False) — same as training
9
- - Image.open(...).convert('RGB') — same as training
10
-
11
- Inference matches notebook exactly:
12
- - processor(images=img.convert('RGB'), return_tensors='pt').pixel_values
13
- - model.generate(pixel_values=pv, max_new_tokens=64, num_beams=1)
14
-
15
- Label format from training:
16
- - PLOT: LR 209/617 | LOC: STATE HOUSE AVENUE | AREA: 0.06 | AMT: 52000000 | DATE: 2008-06-17 | VOS: 3872
17
  """
18
  import io, time, logging, tempfile, os
19
  import streamlit as st
@@ -73,7 +60,6 @@ html, body, [class*="css"], .stApp {
73
  border-color: #2563EB !important;
74
  box-shadow: 0 0 0 4px rgba(37,99,235,0.06) !important;
75
  }
76
- [data-testid="stFileUploader"] label { color: #6B7280 !important; font-size: 0.9rem !important; }
77
 
78
  .fchip { display: inline-flex; align-items: center; gap: 5px; background: #EFF6FF; border: 1px solid #BFDBFE; color: #1D4ED8; padding: 0.25rem 0.7rem; border-radius: 6px; font-size: 0.73rem; font-weight: 500; margin: 2px; }
79
 
@@ -98,7 +84,7 @@ html, body, [class*="css"], .stApp {
98
  .stat-l { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.12em; text-transform: uppercase; color: #6B7280; }
99
 
100
  .section-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid #E5E7EB; }
101
- .section-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 600; color: #1A1A2E; letter-spacing: -0.01em; }
102
 
103
  div[data-testid="stDownloadButton"] > button {
104
  background: #fff !important; border: 1.5px solid #1A1A2E !important; color: #1A1A2E !important;
@@ -115,7 +101,7 @@ div[data-testid="stDownloadButton"] > button:hover { background: #1A1A2E !import
115
 
116
 
117
  # ═══════════════════════════════════════════════════════════
118
- # MODEL — matches notebook Cell 13 + Cell 28
119
  # ═══════════════════════════════════════════════════════════
120
  @st.cache_resource(show_spinner="Loading recognition model…")
121
  def load_model():
@@ -130,56 +116,26 @@ def load_model():
130
 
131
 
132
  # ═══════════════════════════════════════════════════════════
133
- # PDF → IMAGES — matches notebook Cell 10 exactly
134
- # Uses temp file not stream — same as training
135
- # Matrix(200/72, 200/72), get_pixmap(alpha=False), convert('RGB')
136
- # ═══════════════════════════════════════════════════════════
137
- def pdf_to_images(file_bytes: bytes) -> list:
138
- import fitz
139
- images = []
140
- # Write to temp file — same as training which used file paths
141
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
142
- tmp.write(file_bytes)
143
- tmp_path = tmp.name
144
- try:
145
- doc = fitz.open(tmp_path) # open from path like training
146
- mat = fitz.Matrix(200/72, 200/72) # same DPI as training
147
- for page in doc:
148
- pix = page.get_pixmap(matrix=mat, alpha=False) # same as training
149
- img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB") # same as training
150
- images.append(img)
151
- pix = None # free memory immediately like training
152
- doc.close()
153
- finally:
154
- os.unlink(tmp_path)
155
- return images
156
-
157
-
158
- # ═══════════════════════════════════════════════════════════
159
- # OCR β€” matches notebook Cell 18 + Cell 20 inference
160
- # processor(images=img.convert('RGB')) then model.generate
161
- # max_new_tokens=64, num_beams=1 (greedy — fast)
162
  # ═══════════════════════════════════════════════════════════
163
  def ocr_page(img: Image.Image) -> str:
164
  import torch
165
  processor, model, device = load_model()
166
- # Exactly as in ValuationDataset.__getitem__
167
  pixel_values = processor(
168
  images=img.convert("RGB"),
169
  return_tensors="pt"
170
  ).pixel_values.to(device)
171
-
172
  with torch.no_grad():
173
  generated = model.generate(
174
  pixel_values=pixel_values,
175
  max_new_tokens=64,
176
- num_beams=1, # greedy β€” fast, matches validation in notebook
177
  )
178
  return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()
179
 
180
 
181
  # ═══════════════════════════════════════════════════════════
182
- # PARSE LABEL β€” matches row_to_label() from notebook Cell 10
183
  # Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
184
  # ═══════════════════════════════════════════════════════════
185
  def parse_label(raw_text: str, filename: str) -> dict:
@@ -314,6 +270,8 @@ run = st.button(
314
  # PROCESSING
315
  # ═══════════════════════════════════════════════════════════
316
  if run and uploaded:
 
 
317
  st.session_state.records = []
318
  st.session_state.errors = []
319
  st.session_state.done = False
@@ -327,27 +285,50 @@ if run and uploaded:
327
  raw = uf.read()
328
  bar.progress(fi / len(uploaded), text=f"Reading {fname}…")
329
 
 
 
330
  try:
331
  ext = fname.lower().rsplit(".", 1)[-1]
332
 
333
  if ext == "pdf":
334
- imgs = pdf_to_images(raw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  else:
336
  imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
 
337
 
338
  if not imgs:
339
- st.session_state.errors.append(f"{fname}: no pages could be extracted")
 
340
  continue
341
 
342
  for pi, img in enumerate(imgs, 1):
343
- status.caption(
344
- f"Processing **{fname}** β€” page {pi} of {len(imgs)}"
345
- )
346
  raw_text = ocr_page(img)
347
- record = parse_label(raw_text, fname)
 
348
  st.session_state.records.append(record)
349
 
350
  except Exception as e:
 
 
351
  st.session_state.errors.append(f"{fname}: {e}")
352
 
353
  bar.progress((fi + 1) / len(uploaded))
 
1
  """
2
  ValuationAI — Nairobi Valuation Sheet OCR
3
  Model: rasmodev/Handwriting_trocr_model
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
  import io, time, logging, tempfile, os
6
  import streamlit as st
 
60
  border-color: #2563EB !important;
61
  box-shadow: 0 0 0 4px rgba(37,99,235,0.06) !important;
62
  }
 
63
 
64
  .fchip { display: inline-flex; align-items: center; gap: 5px; background: #EFF6FF; border: 1px solid #BFDBFE; color: #1D4ED8; padding: 0.25rem 0.7rem; border-radius: 6px; font-size: 0.73rem; font-weight: 500; margin: 2px; }
65
 
 
84
  .stat-l { font-size: 0.68rem; font-weight: 500; letter-spacing: 0.12em; text-transform: uppercase; color: #6B7280; }
85
 
86
  .section-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid #E5E7EB; }
87
+ .section-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 600; color: #1A1A2E; }
88
 
89
  div[data-testid="stDownloadButton"] > button {
90
  background: #fff !important; border: 1.5px solid #1A1A2E !important; color: #1A1A2E !important;
 
101
 
102
 
103
  # ═══════════════════════════════════════════════════════════
104
+ # MODEL
105
  # ═══════════════════════════════════════════════════════════
106
  @st.cache_resource(show_spinner="Loading recognition model…")
107
  def load_model():
 
116
 
117
 
118
  # ═══════════════════════════════════════════════════════════
119
+ # OCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # ═══════════════════════════════════════════════════════════
121
  def ocr_page(img: Image.Image) -> str:
122
  import torch
123
  processor, model, device = load_model()
 
124
  pixel_values = processor(
125
  images=img.convert("RGB"),
126
  return_tensors="pt"
127
  ).pixel_values.to(device)
 
128
  with torch.no_grad():
129
  generated = model.generate(
130
  pixel_values=pixel_values,
131
  max_new_tokens=64,
132
+ num_beams=1,
133
  )
134
  return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()
135
 
136
 
137
  # ═══════════════════════════════════════════════════════════
138
+ # PARSE LABEL
139
  # Format: PLOT: ... | LOC: ... | AREA: ... | AMT: ... | DATE: ... | VOS: ...
140
  # ═══════════════════════════════════════════════════════════
141
  def parse_label(raw_text: str, filename: str) -> dict:
 
270
  # PROCESSING
271
  # ═══════════════════════════════════════════════════════════
272
  if run and uploaded:
273
+ import fitz, traceback
274
+
275
  st.session_state.records = []
276
  st.session_state.errors = []
277
  st.session_state.done = False
 
285
  raw = uf.read()
286
  bar.progress(fi / len(uploaded), text=f"Reading {fname}…")
287
 
288
+ st.write(f"πŸ“„ **{fname}** β€” {len(raw):,} bytes")
289
+
290
  try:
291
  ext = fname.lower().rsplit(".", 1)[-1]
292
 
293
  if ext == "pdf":
294
+ # Write to temp file — same as training
295
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
296
+ tmp.write(raw)
297
+ tmp_path = tmp.name
298
+
299
+ doc = fitz.open(tmp_path)
300
+ st.write(f" βœ… PDF opened β€” {len(doc)} page(s) found")
301
+
302
+ imgs = []
303
+ mat = fitz.Matrix(200/72, 200/72)
304
+ for page in doc:
305
+ pix = page.get_pixmap(matrix=mat, alpha=False)
306
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
307
+ imgs.append(img)
308
+ pix = None
309
+ doc.close()
310
+ os.unlink(tmp_path)
311
+ st.write(f" βœ… Rasterized {len(imgs)} page image(s)")
312
+
313
  else:
314
  imgs = [Image.open(io.BytesIO(raw)).convert("RGB")]
315
+ st.write(f" βœ… Loaded image")
316
 
317
  if not imgs:
318
+ st.error(f" ❌ No pages extracted from {fname}")
319
+ st.session_state.errors.append(f"{fname}: no pages extracted")
320
  continue
321
 
322
  for pi, img in enumerate(imgs, 1):
323
+ status.caption(f"Running OCR on **{fname}** β€” page {pi} of {len(imgs)}")
 
 
324
  raw_text = ocr_page(img)
325
+ st.write(f" πŸ“ Page {pi} OCR output: `{raw_text}`")
326
+ record = parse_label(raw_text, fname)
327
  st.session_state.records.append(record)
328
 
329
  except Exception as e:
330
+ st.error(f"❌ Error on {fname}: {e}")
331
+ st.code(traceback.format_exc())
332
  st.session_state.errors.append(f"{fname}: {e}")
333
 
334
  bar.progress((fi + 1) / len(uploaded))