Spaces:

TuttiQuantis
/

PPWR_APP

Sleeping

App Files Files Community

martinofumagalli committed on Nov 10, 2025

Commit

edee7ce

verified ·

1 Parent(s): 74b4e91

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -55

app.py CHANGED Viewed

@@ -95,7 +95,6 @@ def color_from(text: str) -> str:
     return (m.group(1).strip() if m else "")
 def material_from(text: str) -> str:
-    # cattura righe con "RESIN" o frasi simili
     for line in (text or "").splitlines():
         if re.search(r"\bRESIN\b", line, re.I):
             return line.strip()
@@ -103,53 +102,38 @@ def material_from(text: str) -> str:
     return m.group(1).strip() if m else ""
 # ======================================================================
-# WEIGHT PARSER → robusto su OCR (spazi tra cifre) e simboli ± varianti
-# Ritorna solo il valore es. "94±3g"
 # ======================================================================
-# numero con possibili spazi interni tra cifre (OCR): "9 4" -> 94
-NUM_SPACED = r"(?:\d(?:\s?\d){0,6}(?:[.,]\d+)?)"
-UNIT = r"(?:mg|g|kg)\b"
-PLUSMINUS = r"(?:±|\+\s*/?\s*-\s*|[＋﹢]\s*[－\-])"  # ±, +/-, +-, varianti
-WEIGHT_VALUE_RE = re.compile(
-    rf"(?is)\bweight\b[^\n\r]{{0,120}}?({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})"
-)
-def _normalize_weight(s: str) -> str:
     s = (s or "").strip()
-    # togli spazi solo tra cifre
     s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
-    # uniforma simboli ±
-    s = re.sub(r"\+\s*/?\s*-\s*", "±", s)
     s = s.replace("＋－", "±").replace("﹢", "+").replace("－", "-")
-    # togli spazi attorno a ± e prima dell'unità
     s = re.sub(r"\s*±\s*", "±", s)
-    s = re.sub(r"\s*(mg|g|kg)\b", r"\1", s, flags=re.I)
-    # virgole -> punti
     s = s.replace(",", ".")
     return s
 def weight_from(text: str) -> str:
     if not text:
         return ""
-    m = WEIGHT_VALUE_RE.search(text)
-    if m:
-        return _normalize_weight(m.group(1))
-    # Fallback riga-per-riga
-    for line in (text or "").splitlines():
-        if "weight" in line.lower():
-            m2 = WEIGHT_VALUE_RE.search(line)
-            if m2:
-                return _normalize_weight(m2.group(1))
-            # prova su substring dopo "weight"
-            try:
-                idx = line.lower().index("weight") + len("weight")
-                m3 = re.search(rf"({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})", line[idx:], re.I)
-                if m3:
-                    return _normalize_weight(m3.group(1))
-            except Exception:
-                pass
-    return ""
 # ---------------------  PIECE da "Packaging Component Type"  ---------------------
 _ALLOWED_PIECES = {
@@ -176,37 +160,29 @@ def _normalize_piece(s: str) -> str:
     s2 = s1.lower()
     s2 = s2.replace("–", "-").replace("—", "-")
     s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
-    # prova match diretto
     if s2 in _ALLOWED_PIECES:
         return _ALLOWED_PIECES[s2]
-    # prova alcune normalizzazioni
     s2 = s2.replace("  ", " ")
     if s2 in _ALLOWED_PIECES:
         return _ALLOWED_PIECES[s2]
-    # fallback per frasi lunghe: cerca la keyword migliore
     for key, canon in _ALLOWED_PIECES.items():
         if key in s2:
             return canon
     return ""
 def piece_from(text: str, cls: str) -> str:
-    # 1) Packaging Component Type (linea dedicata)
     m = _PACK_COMP_TYPE_RE.search(text or "")
     if m:
         val = m.group(1)
         normalized = _normalize_piece(val)
         if normalized:
             return normalized
-    # 2) fallback legacy: Packaging Material Type
     m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
     if m2:
         seg = m2.group(1)
         norm = _normalize_piece(seg)
         if norm:
             return norm
-    # 3) fallback da Class
     if cls:
         norm = _normalize_piece(cls)
         if norm:
@@ -218,16 +194,14 @@ def piece_from(text: str, cls: str) -> str:
         if "corrugated" in cls.lower():
             return "container"
         if "label" in cls.lower():
-            return "LABEL - BACK"  # scelta neutra se non specificato
     return ""
-# --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
 FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
 def component_from(text: str, piece: str, cls: str) -> str:
     txt = text.lower()
-    # priorità a keyword esplicite
     if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
     if "ink foil" in txt: return "Ink foil"
     if "tape" in txt: return "Tape"
@@ -236,7 +210,6 @@ def component_from(text: str, piece: str, cls: str) -> str:
     if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
     if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
     if "bundle" in txt: return "Bundle"
-    # fallback da piece/class
     if piece: return piece
     if cls:
         if "bottle" in cls.lower(): return "Bottle"
@@ -250,7 +223,6 @@ def function_from(text: str) -> str:
     return m.group(1).title() if m else ""
 def material_ref_gcas_from(text: str) -> str:
-    # codici tipo 8 cifre (es. 90082546) o due codici tra parentesi
     m = re.findall(r"\b(\d{7,9})\b", text or "")
     if m:
         seen = set(); out=[]
@@ -292,15 +264,15 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
     cap   = capacity_from(title) or capacity_from(full)
     color = color_from(full)
     material = material_from(full)
-    piece = piece_from(full, cls)  # <-- usa la nuova logica
-    # nuove colonne (euristiche leggere)
     comp  = component_from(full, piece, cls)
     func  = function_from(full)
     gcas  = material_ref_gcas_from(full)
     mfam  = material_family_from(full)
-    # estrai peso dal testo (se presente come testo digitale)
     wght  = weight_from(full)
     return {
@@ -355,13 +327,13 @@ for up in files:
         raw = up.read()
         pages = extract_text_pages(raw)
-        # Se il PDF non ha testo estraibile, fai OCR completo una sola volta
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
-        # Se Weight è vuoto, OCR rapido sulle prime pagine e stop appena trovato
         if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
             w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
             if w_ocr:

     return (m.group(1).strip() if m else "")
 def material_from(text: str) -> str:
     for line in (text or "").splitlines():
         if re.search(r"\bRESIN\b", line, re.I):
             return line.strip()
     return m.group(1).strip() if m else ""
 # ======================================================================
+# WEIGHT: prendi TUTTA la riga a partire da "Weight ..." e normalizza spazi/OCR
+# Esempio: "Weight 9 4  +/-  3  g" -> "Weight 94±3g"
 # ======================================================================
+WEIGHT_LINE_RE = re.compile(r"(?is)\bweight\b[^\n\r]*")
+def _normalize_weight_line(s: str) -> str:
     s = (s or "").strip()
+    # comprimi spazi ripetuti
+    s = re.sub(r"\s+", " ", s)
+    # togli spazi interni tra cifre (OCR: "9 4" -> "94")
     s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
+    # unifica simboli ±
+    s = re.sub(r"\+\s*/\s*-\s*|\+\s*-\s*", "±", s)
     s = s.replace("＋－", "±").replace("﹢", "+").replace("－", "-")
+    # rimuovi spazi attorno a ±
     s = re.sub(r"\s*±\s*", "±", s)
+    # rimuovi spazi prima dell'unità
+    s = re.sub(r"\s+(?=(?:mg|g|kg)\b)", "", s, flags=re.I)
+    # punti/virgole
     s = s.replace(",", ".")
     return s
 def weight_from(text: str) -> str:
     if not text:
         return ""
+    # preferisci la prima riga che contiene anche l'unità
+    lines = [m.group(0) for m in WEIGHT_LINE_RE.finditer(text)]
+    for ln in lines:
+        if re.search(r"\b(?:mg|g|kg)\b", ln, re.I):
+            return _normalize_weight_line(ln)
+    # se non trovata unità, restituisci comunque la prima occorrenza normalizzata
+    return _normalize_weight_line(lines[0]) if lines else ""
 # ---------------------  PIECE da "Packaging Component Type"  ---------------------
 _ALLOWED_PIECES = {
     s2 = s1.lower()
     s2 = s2.replace("–", "-").replace("—", "-")
     s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
     if s2 in _ALLOWED_PIECES:
         return _ALLOWED_PIECES[s2]
     s2 = s2.replace("  ", " ")
     if s2 in _ALLOWED_PIECES:
         return _ALLOWED_PIECES[s2]
     for key, canon in _ALLOWED_PIECES.items():
         if key in s2:
             return canon
     return ""
 def piece_from(text: str, cls: str) -> str:
     m = _PACK_COMP_TYPE_RE.search(text or "")
     if m:
         val = m.group(1)
         normalized = _normalize_piece(val)
         if normalized:
             return normalized
     m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
     if m2:
         seg = m2.group(1)
         norm = _normalize_piece(seg)
         if norm:
             return norm
     if cls:
         norm = _normalize_piece(cls)
         if norm:
         if "corrugated" in cls.lower():
             return "container"
         if "label" in cls.lower():
+            return "LABEL - BACK"
     return ""
+# --- Nuove colonne: euristiche base
 FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
 def component_from(text: str, piece: str, cls: str) -> str:
     txt = text.lower()
     if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
     if "ink foil" in txt: return "Ink foil"
     if "tape" in txt: return "Tape"
     if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
     if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
     if "bundle" in txt: return "Bundle"
     if piece: return piece
     if cls:
         if "bottle" in cls.lower(): return "Bottle"
     return m.group(1).title() if m else ""
 def material_ref_gcas_from(text: str) -> str:
     m = re.findall(r"\b(\d{7,9})\b", text or "")
     if m:
         seen = set(); out=[]
     cap   = capacity_from(title) or capacity_from(full)
     color = color_from(full)
     material = material_from(full)
+    piece = piece_from(full, cls)
+    # nuove colonne
     comp  = component_from(full, piece, cls)
     func  = function_from(full)
     gcas  = material_ref_gcas_from(full)
     mfam  = material_family_from(full)
+    # WEIGHT: prendi l'intera riga "Weight ..."
     wght  = weight_from(full)
     return {
         raw = up.read()
         pages = extract_text_pages(raw)
+        # Se il PDF non ha testo estraibile, OCR completo una sola volta
         if ocr_fallback and not any((p or "").strip() for p in pages):
             pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
         rec = parse_record(pages, up.name)
+        # Se Weight è vuoto, OCR rapido (prime pagine) e stop appena trovato
         if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
             w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
             if w_ocr: