martinofumagalli committed on
Commit
36cf4fd
·
verified ·
1 Parent(s): 97d6f99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -14
app.py CHANGED
@@ -54,6 +54,23 @@ def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) ->
54
  texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
55
  return texts
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # ======================================================================
58
  # PARSING DOMINIO (euristiche/regex leggere)
59
  # ======================================================================
@@ -85,7 +102,7 @@ def material_from(text: str) -> str:
85
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
86
  return m.group(1).strip() if m else ""
87
 
88
- # --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g" -------------
89
  WEIGHT_VALUE_RE = re.compile(
90
  r"""(?ix)
91
  \bweight\b
@@ -127,9 +144,7 @@ def weight_from(text: str) -> str:
127
  return _normalize_weight(m2.group(1))
128
  return ""
129
 
130
- # ---------------------------------------------------------------------------
131
-
132
- # --------------------- AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type" ---------------------
133
  _ALLOWED_PIECES = {
134
  "ribbon": "ribbon",
135
  "bottle": "bottle",
@@ -168,10 +183,6 @@ def _normalize_piece(s: str) -> str:
168
  return ""
169
 
170
  def piece_from(text: str, cls: str) -> str:
171
- """
172
- 1) Cerca 'Packaging Component Type: <valore>' e normalizza al set richiesto.
173
- 2) Se non trovato, usa vecchi fallback (Class/Material Type).
174
- """
175
  # 1) Packaging Component Type (linea dedicata)
176
  m = _PACK_COMP_TYPE_RE.search(text or "")
177
  if m:
@@ -283,7 +294,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
283
  gcas = material_ref_gcas_from(full)
284
  mfam = material_family_from(full)
285
 
286
- # (AGGIUNTA) estrai peso
287
  wght = weight_from(full)
288
 
289
  return {
@@ -309,7 +320,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
309
  # ======================================================================
310
  st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
311
  st.title("📄→📊 PDF → Table (OCR-ready)")
312
- st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR come fallback.")
313
 
314
  with st.sidebar:
315
  files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
@@ -337,13 +348,16 @@ for up in files:
337
  try:
338
  raw = up.read()
339
  pages = extract_text_pages(raw)
 
 
340
  if ocr_fallback and not any((p or "").strip() for p in pages):
341
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
 
342
  rec = parse_record(pages, up.name)
343
- # Se Weight è vuoto, prova un pass OCR dedicato solo per il peso
 
344
  if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
345
- ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
346
- w_ocr = weight_from("\n".join(ocr_pages))
347
  if w_ocr:
348
  rec["Weight"] = w_ocr
349
 
@@ -368,4 +382,3 @@ with c2:
368
  with pd.ExcelWriter(bio, engine="openpyxl") as xw:
369
  df.to_excel(xw, index=False, sheet_name="data")
370
  st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
371
-
 
54
  texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
55
  return texts
56
 
57
+ # --- OCR rapido SOLO per il peso (prime pagine, DPI bassi, stop appena trovato)
58
+ def run_ocr_for_weight(pdf_bytes: bytes, lang: str, tesseract_cmd: str | None, max_pages: int = 2, dpi_weight: int = 200) -> str:
59
+ if tesseract_cmd:
60
+ pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
61
+ images = convert_from_bytes(pdf_bytes, dpi=dpi_weight, first_page=1, last_page=max_pages)
62
+ config = "--psm 6 -c preserve_interword_spaces=1"
63
+ acc = []
64
+ for img in images:
65
+ if not isinstance(img, Image.Image):
66
+ img = img.convert("RGB")
67
+ txt = pytesseract.image_to_string(img, lang=lang, config=config) or ""
68
+ w = weight_from(txt) # definita sotto
69
+ if w:
70
+ return w
71
+ acc.append(txt)
72
+ return weight_from("\n".join(acc)) or ""
73
+
74
  # ======================================================================
75
  # PARSING DOMINIO (euristiche/regex leggere)
76
  # ======================================================================
 
102
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
103
  return m.group(1).strip() if m else ""
104
 
105
+ # --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g"
106
  WEIGHT_VALUE_RE = re.compile(
107
  r"""(?ix)
108
  \bweight\b
 
144
  return _normalize_weight(m2.group(1))
145
  return ""
146
 
147
+ # --------------------- PIECE da "Packaging Component Type" ---------------------
 
 
148
  _ALLOWED_PIECES = {
149
  "ribbon": "ribbon",
150
  "bottle": "bottle",
 
183
  return ""
184
 
185
  def piece_from(text: str, cls: str) -> str:
 
 
 
 
186
  # 1) Packaging Component Type (linea dedicata)
187
  m = _PACK_COMP_TYPE_RE.search(text or "")
188
  if m:
 
294
  gcas = material_ref_gcas_from(full)
295
  mfam = material_family_from(full)
296
 
297
+ # estrai peso dal testo (se presente come testo digitale)
298
  wght = weight_from(full)
299
 
300
  return {
 
320
  # ======================================================================
321
  st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
322
  st.title("📄→📊 PDF → Table (OCR-ready)")
323
+ st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR mirato per il peso.")
324
 
325
  with st.sidebar:
326
  files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
 
348
  try:
349
  raw = up.read()
350
  pages = extract_text_pages(raw)
351
+
352
+ # Se il PDF non ha testo estraibile, fai OCR completo una sola volta
353
  if ocr_fallback and not any((p or "").strip() for p in pages):
354
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
355
+
356
  rec = parse_record(pages, up.name)
357
+
358
+ # Se Weight è vuoto, OCR rapido sulle prime pagine e stop appena trovato
359
  if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
360
+ w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
 
361
  if w_ocr:
362
  rec["Weight"] = w_ocr
363
 
 
382
  with pd.ExcelWriter(bio, engine="openpyxl") as xw:
383
  df.to_excel(xw, index=False, sheet_name="data")
384
  st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")