Spaces:

TuttiQuantis
/

PPWR_APP

Sleeping

App Files Files Community

martinofumagalli committed on Oct 2, 2025

Commit

7146dfb

verified ·

1 Parent(s): 04bae7d

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -28

app.py CHANGED Viewed

@@ -1,21 +1,29 @@
 import io, os, re
-from typing import List, Dict, Tuple
 import streamlit as st
 import pandas as pd
-# PDF text
 import pdfplumber
 from pypdf import PdfReader
-# OCR
 from pdf2image import convert_from_bytes
 import pytesseract
 from PIL import Image
-SCHEMA = ["Piece","SKU","Title","Capacity","% Recycled","Weight","Color","Material / Resin","Class","Source File"]
-# ------------------ low-level extractors ------------------
 def extract_text_pages(pdf_bytes: bytes) -> List[str]:
     pages = []
     # 1) pdfplumber
@@ -46,13 +54,14 @@ def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) ->
         texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
     return texts
-# ------------------ domain parsing ------------------
-SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
-TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
-CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.I)
-def first(text: str, pattern: re.Pattern, group: int = 1) -> str:
     m = pattern.search(text or "")
     return m.group(group).strip() if m else ""
@@ -63,14 +72,13 @@ def capacity_from(text: str) -> str:
     return f"{m.group(1).replace(',', '.')} {unit}"
 def color_from(text: str) -> str:
-    # preferisci "Part Color" / "Color" oppure parole in MAIUSCOLO vicino a GREEN/TRANSPARENT ecc.
     m = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", text, re.I)
     if m: return m.group(1).strip()
     m = re.search(r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", text)
     return (m.group(1).strip() if m else "")
 def material_from(text: str) -> str:
-    # cattura righe con "RESIN" o "SERIOPLAST ... RESIN"
     for line in (text or "").splitlines():
         if re.search(r"\bRESIN\b", line, re.I):
             return line.strip()
@@ -78,30 +86,100 @@ def material_from(text: str) -> str:
     return m.group(1).strip() if m else ""
 def piece_from(text: str, cls: str) -> str:
-    # se trovi "Packaging Material Type Rigid- Bottle" → "Bottle"
     m = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text, re.I)
     if m:
         seg = m.group(1)
-        m2 = re.search(r"\b(Bottle|Cap|Container|Lid|Carton|Case)\b", seg, re.I)
         if m2: return m2.group(1).capitalize()
-    # dal Class "Bottles" → "Bottle", "Corrugated" → "Container"
     if cls:
         if "Bottle" in cls or "Bottles" in cls: return "Bottle"
         if "Cap" in cls or "Closures" in cls: return "Cap"
         if "Corrugated" in cls: return "Container"
     return ""
 def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
     full = "\n".join(pages or [""])
-    sku = first(full, SKU_RE)
-    title = first(full, TITLE_RE)
-    cls  = first(full, CLASS_RE)
-    cap  = capacity_from(title) or capacity_from(full)
     color = color_from(full)
     material = material_from(full)
     piece = piece_from(full, cls)
-    rec = {
         "Piece": piece or "",
         "SKU": sku or "",
         "Title": title or "",
@@ -112,14 +190,19 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
         "Material / Resin": material or "",
         "Class": cls or "",
         "Source File": source_name,
     }
-    return rec
-# ------------------ UI ------------------
-st.set_page_config(page_title="PDF → Table (OCR)", layout="wide")
 st.title("📄→📊 PDF → Table (OCR-ready)")
-st.caption("Carica PDF (anche scansioni). Per ogni file compilo: Piece, SKU, Title, Capacity, % Recycled, Weight, Color, Material / Resin, Class, Source File.")
 with st.sidebar:
     files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
@@ -150,7 +233,6 @@ for up in files:
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
-        # se nel futuro ci saranno più SKU in un PDF, qui potremmo generare più rec (lista)
         rows.append(rec)
     except Exception as e:
         errors.append((up.name, str(e)))

 import io, os, re
+from typing import List, Dict
 import streamlit as st
 import pandas as pd
+# --- PDF text
 import pdfplumber
 from pypdf import PdfReader
+# --- OCR
 from pdf2image import convert_from_bytes
 import pytesseract
 from PIL import Image
+# ======================================================================
+# SCHEMA TABELLA (colonne fisse)
+# ======================================================================
# Fixed output columns of the result table, in display order.
SCHEMA = [
    # original columns
    "Piece",
    "SKU",
    "Title",
    "Capacity",
    "% Recycled",
    "Weight",
    "Color",
    "Material / Resin",
    "Class",
    "Source File",
    # newly added columns
    "Component",
    "Function",
    "General description of the packaging",
    "Material Ref GCAS",
    "Material Family",
]
+# ======================================================================
+# ESTRATTORI LOW-LEVEL
+# ======================================================================
 def extract_text_pages(pdf_bytes: bytes) -> List[str]:
     pages = []
     # 1) pdfplumber
         texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
     return texts
+# ======================================================================
+# PARSING DOMINIO (euristiche/regex leggere)
+# ======================================================================
# Lightweight, case-insensitive field patterns; group 1 is the captured value.
# SKU_RE: a "Name" / "SKU" / "Part" / "Part No." label followed by a code of
# at least 5 characters drawn from letters, digits and - _ / .
SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
# TITLE_RE: "Title" followed by ':' or '-' and the rest of the line.
TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
# CLASS_RE: the standalone word "Class" (not "Classification", "Classes", ...)
# optionally followed by ':' or '-', then a run of letters and spaces.
CLASS_RE = re.compile(r"\bClass\b\s*[:\-]?\s*([A-Za-z ]+)", re.I)
+def _first(text: str, pattern: re.Pattern, group: int = 1) -> str:
     m = pattern.search(text or "")
     return m.group(group).strip() if m else ""
     return f"{m.group(1).replace(',', '.')} {unit}"
 def color_from(text: str) -> str:
     m = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", text, re.I)
     if m: return m.group(1).strip()
     m = re.search(r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", text)
     return (m.group(1).strip() if m else "")
 def material_from(text: str) -> str:
+    # cattura righe con "RESIN" o frasi simili
     for line in (text or "").splitlines():
         if re.search(r"\bRESIN\b", line, re.I):
             return line.strip()
     return m.group(1).strip() if m else ""
 def piece_from(text: str, cls: str) -> str:
     m = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text, re.I)
     if m:
         seg = m.group(1)
+        m2 = re.search(r"\b(Bottle|Cap|Container|Lid|Carton|Case|Label|Tape)\b", seg, re.I)
         if m2: return m2.group(1).capitalize()
     if cls:
         if "Bottle" in cls or "Bottles" in cls: return "Bottle"
         if "Cap" in cls or "Closures" in cls: return "Cap"
         if "Corrugated" in cls: return "Container"
+        if "Label" in cls: return "Label"
+    return ""
# --- New columns: basic heuristics (can be refined with real examples)
# Matches "Primary", "Secondary", "Secondary or Tertiary" or "Tertiary",
# case-insensitively, as whole words.
FUNCTION_RE = re.compile(
    r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b",
    flags=re.IGNORECASE,
)
def component_from(text: str, piece: str, cls: str) -> str:
    """Guess the packaging component described by a document.

    Explicit keywords in *text* take priority, checked in a fixed order;
    then the already-derived *piece* is used; finally the *cls* value is
    mapped to a component name.

    Keywords are matched case-insensitively as whole words (with optional
    plural "s"), so that words such as "tapered", "drinking" or "datasheet"
    do not trigger the "tape", "ink" or "sheet" rules.

    Args:
        text: Full extracted document text; ``None`` is treated as empty.
        piece: Piece name already derived for the record (may be empty).
        cls: Value of the "Class" field; ``None`` is treated as empty.

    Returns:
        The component name, or ``""`` when nothing matches.
    """
    txt = (text or "").lower()
    cls_l = (cls or "").lower()

    def has(*keywords: str) -> bool:
        """Return True if any keyword pattern occurs in txt as a whole word."""
        return any(re.search(rf"\b(?:{kw})\b", txt) for kw in keywords)

    # Priority to explicit keywords; order matters (first hit wins).
    if has("inks?") and has("cartridges?"):
        return "Ink cartridge"
    if has(r"ink\s+foils?"):
        return "Ink foil"
    if has("tapes?"):
        return "Tape"
    if has("labels?") and has("psl", r"wet\s+glue", "iml", "htl"):
        return "Labels"
    if has("adhesives?", r"hot[\s-]*melt"):
        return "Adhesive"
    if has("cartonboards?", "sheets?"):
        return "Cartonboard / Sheet"
    if has("corrugated", "cases?", "outercases?"):
        return "Corrugated box"
    if has("bundles?"):
        return "Bundle"

    # Fallback: reuse the piece, then derive from the class text.
    if piece:
        return piece
    class_rules = (
        (r"\bbottles?\b", "Bottle"),
        (r"\bcaps?\b", "Closure"),
        (r"\bcorrugated\b", "Corrugated box"),
        (r"\blabels?\b", "Labels"),
    )
    for pattern, component in class_rules:
        if re.search(pattern, cls_l):
            return component
    return ""
def function_from(text: str) -> str:
    """Return the packaging function keyword found in *text*, title-cased.

    Uses the first ``FUNCTION_RE`` match; a falsy *text* is searched as the
    empty string. ``str.title()`` capitalises every word of the match.
    Returns ``""`` when there is no match.
    """
    hit = FUNCTION_RE.search(text if text else "")
    if hit is None:
        return ""
    return hit.group(1).title()
def material_ref_gcas_from(text: str) -> str:
    """Extract up to three distinct material reference codes from *text*.

    Patterns are tried in order and the first one that yields any match
    decides the result:

    1. Stand-alone numbers of 7 to 9 digits (e.g. ``90082546``).
    2. Numbers of 5+ digits inside a ``(<digits> kg pack)`` parenthesis,
       case-insensitive.
       NOTE(review): this captures the number written before "kg pack";
       confirm against real documents that this is the intended code.

    Returns:
        The first three distinct codes in order of appearance, joined with
        ``", "``; ``""`` when nothing matches or *text* is falsy.
    """
    text = text or ""
    patterns = (
        (r"\b(\d{7,9})\b", 0),
        (r"\((\d{5,})\s*kg\s*pack\)", re.I),
    )
    for pattern, flags in patterns:
        codes = re.findall(pattern, text, flags)
        if codes:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            unique = list(dict.fromkeys(codes))
            return ", ".join(unique[:3])  # cap at 3 to keep the cell short
    return ""
def material_family_from(text: str) -> str:
    """Map free text to one of the known material family names.

    Known family names are searched first (case-insensitive, with dash
    variants and runs of whitespace normalised so that ``-`` in the text
    matches ``–`` in the family name). When several names match, list order
    decides, except that a name contained in another matching name is
    skipped: "Coated paper" wins over the generic "Paper".

    If no family name is present, a few keyword fallbacks are applied
    (HDPE, PP / Polypropylene, corrugated, paper).

    Returns:
        The canonical family name, or ``""`` when nothing matches or *text*
        is falsy.
    """
    families = [
        "Monolayer HDPE", "Polypropylene (PP)", "Paper", "Flexible Film – Mono non Metallized",
        "Flexible - Label PSL WGL IML HTL", "Rigid Paper – Corrugated Case",
        "Inks and solvents", "Hot melt adhesive", "Wet Glue Label",
        "Coated paper", "Wood", "Ink foil", "Fasson PE 85 TOP White",
    ]

    def _key(value: str) -> str:
        """Normalise dashes and whitespace, then lower-case, for matching."""
        value = re.sub(r"[\u2010-\u2015\u2212]", "-", value)
        return re.sub(r"\s+", " ", value).lower()

    raw = text or ""
    haystack = _key(raw)

    # All family names present in the text, in list (priority) order.
    hits = [(fam, key) for fam, key in ((f, _key(f)) for f in families) if key in haystack]
    for fam, key in hits:
        # Skip a generic name when a more specific matching name contains it.
        shadowed = any(key != other and key in other for _, other in hits)
        if not shadowed:
            return fam

    # Fallback on common keywords.
    if re.search(r"\bHDPE\b", raw):
        return "Monolayer HDPE"
    if re.search(r"\bPP\b|\bPolypropylene\b", raw, re.I):
        return "Polypropylene (PP)"
    lowered = raw.lower()
    if "corrugated" in lowered:
        return "Rigid Paper – Corrugated Case"
    if "paper" in lowered:
        return "Paper"
    return ""
 def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
     full = "\n".join(pages or [""])
+    sku   = _first(full, SKU_RE)
+    title = _first(full, TITLE_RE)
+    cls   = _first(full, CLASS_RE)
+    cap   = capacity_from(title) or capacity_from(full)
     color = color_from(full)
     material = material_from(full)
     piece = piece_from(full, cls)
+    # nuove colonne (euristiche leggere)
+    comp  = component_from(full, piece, cls)
+    func  = function_from(full)
+    gcas  = material_ref_gcas_from(full)
+    mfam  = material_family_from(full)
+    return {
         "Piece": piece or "",
         "SKU": sku or "",
         "Title": title or "",
         "Material / Resin": material or "",
         "Class": cls or "",
         "Source File": source_name,
+        "Component": comp or "",
+        "Function": func or "",
+        "General description of the packaging": "",  # da riempire con regole quando ci dai esempi strutturati
+        "Material Ref GCAS": gcas or "",
+        "Material Family": mfam or ""
     }
+# ======================================================================
+# UI STREAMLIT
+# ======================================================================
+st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
 st.title("📄→📊 PDF → Table (OCR-ready)")
+st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR come fallback.")
 with st.sidebar:
     files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
         rows.append(rec)
     except Exception as e:
         errors.append((up.name, str(e)))