Spaces:

TuttiQuantis
/

PPWR_APP

Sleeping

App Files Community

martinofumagalli committed on Nov 10, 2025

Commit

36cf4fd

verified ·

1 Parent(s): 97d6f99

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -14

app.py CHANGED Viewed

@@ -54,6 +54,23 @@ def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) ->
         texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
     return texts
 # ======================================================================
 # PARSING DOMINIO (euristiche/regex leggere)
 # ======================================================================
@@ -85,7 +102,7 @@ def material_from(text: str) -> str:
     m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
     return m.group(1).strip() if m else ""
-# --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g" -------------
 WEIGHT_VALUE_RE = re.compile(
     r"""(?ix)
     \bweight\b
@@ -127,9 +144,7 @@ def weight_from(text: str) -> str:
                 return _normalize_weight(m2.group(1))
     return ""
-# ---------------------------------------------------------------------------
-# ---------------------  AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type"  ---------------------
 _ALLOWED_PIECES = {
     "ribbon": "ribbon",
     "bottle": "bottle",
@@ -168,10 +183,6 @@ def _normalize_piece(s: str) -> str:
     return ""
 def piece_from(text: str, cls: str) -> str:
-    """
-    1) Cerca 'Packaging Component Type: <valore>' e normalizza al set richiesto.
-    2) Se non trovato, usa vecchi fallback (Class/Material Type).
-    """
     # 1) Packaging Component Type (linea dedicata)
     m = _PACK_COMP_TYPE_RE.search(text or "")
     if m:
@@ -283,7 +294,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
     gcas  = material_ref_gcas_from(full)
     mfam  = material_family_from(full)
-    # (AGGIUNTA) estrai peso
     wght  = weight_from(full)
     return {
@@ -309,7 +320,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
 # ======================================================================
 st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
 st.title("📄→📊 PDF → Table (OCR-ready)")
-st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR come fallback.")
 with st.sidebar:
     files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
@@ -337,13 +348,16 @@ for up in files:
     try:
         raw = up.read()
         pages = extract_text_pages(raw)
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
-        # Se Weight è vuoto, prova un pass OCR dedicato solo per il peso
         if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
-            ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
-            w_ocr = weight_from("\n".join(ocr_pages))
             if w_ocr:
                 rec["Weight"] = w_ocr
@@ -368,4 +382,3 @@ with c2:
     with pd.ExcelWriter(bio, engine="openpyxl") as xw:
         df.to_excel(xw, index=False, sheet_name="data")
     st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

         texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
     return texts
+# --- OCR rapido SOLO per il peso (prime pagine, DPI bassi, stop appena trovato)
+def run_ocr_for_weight(pdf_bytes: bytes, lang: str, tesseract_cmd: str | None, max_pages: int = 2, dpi_weight: int = 200) -> str:
+    if tesseract_cmd:
+        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
+    images = convert_from_bytes(pdf_bytes, dpi=dpi_weight, first_page=1, last_page=max_pages)
+    config = "--psm 6 -c preserve_interword_spaces=1"
+    acc = []
+    for img in images:
+        if not isinstance(img, Image.Image):
+            img = img.convert("RGB")
+        txt = pytesseract.image_to_string(img, lang=lang, config=config) or ""
+        w = weight_from(txt)  # definita sotto
+        if w:
+            return w
+        acc.append(txt)
+    return weight_from("\n".join(acc)) or ""
 # ======================================================================
 # PARSING DOMINIO (euristiche/regex leggere)
 # ======================================================================
     m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
     return m.group(1).strip() if m else ""
+# --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g"
 WEIGHT_VALUE_RE = re.compile(
     r"""(?ix)
     \bweight\b
                 return _normalize_weight(m2.group(1))
     return ""
+# ---------------------  PIECE da "Packaging Component Type"  ---------------------
 _ALLOWED_PIECES = {
     "ribbon": "ribbon",
     "bottle": "bottle",
     return ""
 def piece_from(text: str, cls: str) -> str:
     # 1) Packaging Component Type (linea dedicata)
     m = _PACK_COMP_TYPE_RE.search(text or "")
     if m:
     gcas  = material_ref_gcas_from(full)
     mfam  = material_family_from(full)
+    # estrai peso dal testo (se presente come testo digitale)
     wght  = weight_from(full)
     return {
 # ======================================================================
 st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
 st.title("📄→📊 PDF → Table (OCR-ready)")
+st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR mirato per il peso.")
 with st.sidebar:
     files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
     try:
         raw = up.read()
         pages = extract_text_pages(raw)
+        # Se il PDF non ha testo estraibile, fai OCR completo una sola volta
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
+        # Se Weight è vuoto, OCR rapido sulle prime pagine e stop appena trovato
         if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
+            w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
             if w_ocr:
                 rec["Weight"] = w_ocr
     with pd.ExcelWriter(bio, engine="openpyxl") as xw:
         df.to_excel(xw, index=False, sheet_name="data")
     st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")