Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -95,7 +95,6 @@ def color_from(text: str) -> str:
|
|
| 95 |
return (m.group(1).strip() if m else "")
|
| 96 |
|
| 97 |
def material_from(text: str) -> str:
|
| 98 |
-
# cattura righe con "RESIN" o frasi simili
|
| 99 |
for line in (text or "").splitlines():
|
| 100 |
if re.search(r"\bRESIN\b", line, re.I):
|
| 101 |
return line.strip()
|
|
@@ -103,53 +102,38 @@ def material_from(text: str) -> str:
|
|
| 103 |
return m.group(1).strip() if m else ""
|
| 104 |
|
| 105 |
# ======================================================================
|
| 106 |
-
# WEIGHT
|
| 107 |
-
#
|
| 108 |
# ======================================================================
|
| 109 |
-
|
| 110 |
-
NUM_SPACED = r"(?:\d(?:\s?\d){0,6}(?:[.,]\d+)?)"
|
| 111 |
-
UNIT = r"(?:mg|g|kg)\b"
|
| 112 |
-
PLUSMINUS = r"(?:±|\+\s*/?\s*-\s*|[+﹢]\s*[-\-])" # ±, +/-, +-, varianti
|
| 113 |
|
| 114 |
-
|
| 115 |
-
rf"(?is)\bweight\b[^\n\r]{{0,120}}?({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})"
|
| 116 |
-
)
|
| 117 |
-
|
| 118 |
-
def _normalize_weight(s: str) -> str:
|
| 119 |
s = (s or "").strip()
|
| 120 |
-
#
|
|
|
|
|
|
|
| 121 |
s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
|
| 122 |
-
#
|
| 123 |
-
s = re.sub(r"\+\s*/
|
| 124 |
s = s.replace("+-", "±").replace("﹢", "+").replace("-", "-")
|
| 125 |
-
#
|
| 126 |
s = re.sub(r"\s*±\s*", "±", s)
|
| 127 |
-
|
| 128 |
-
|
|
|
|
| 129 |
s = s.replace(",", ".")
|
| 130 |
return s
|
| 131 |
|
| 132 |
def weight_from(text: str) -> str:
|
| 133 |
if not text:
|
| 134 |
return ""
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
if m2:
|
| 143 |
-
return _normalize_weight(m2.group(1))
|
| 144 |
-
# prova su substring dopo "weight"
|
| 145 |
-
try:
|
| 146 |
-
idx = line.lower().index("weight") + len("weight")
|
| 147 |
-
m3 = re.search(rf"({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})", line[idx:], re.I)
|
| 148 |
-
if m3:
|
| 149 |
-
return _normalize_weight(m3.group(1))
|
| 150 |
-
except Exception:
|
| 151 |
-
pass
|
| 152 |
-
return ""
|
| 153 |
|
| 154 |
# --------------------- PIECE da "Packaging Component Type" ---------------------
|
| 155 |
_ALLOWED_PIECES = {
|
|
@@ -176,37 +160,29 @@ def _normalize_piece(s: str) -> str:
|
|
| 176 |
s2 = s1.lower()
|
| 177 |
s2 = s2.replace("–", "-").replace("—", "-")
|
| 178 |
s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
|
| 179 |
-
# prova match diretto
|
| 180 |
if s2 in _ALLOWED_PIECES:
|
| 181 |
return _ALLOWED_PIECES[s2]
|
| 182 |
-
# prova alcune normalizzazioni
|
| 183 |
s2 = s2.replace(" ", " ")
|
| 184 |
if s2 in _ALLOWED_PIECES:
|
| 185 |
return _ALLOWED_PIECES[s2]
|
| 186 |
-
# fallback per frasi lunghe: cerca la keyword migliore
|
| 187 |
for key, canon in _ALLOWED_PIECES.items():
|
| 188 |
if key in s2:
|
| 189 |
return canon
|
| 190 |
return ""
|
| 191 |
|
| 192 |
def piece_from(text: str, cls: str) -> str:
|
| 193 |
-
# 1) Packaging Component Type (linea dedicata)
|
| 194 |
m = _PACK_COMP_TYPE_RE.search(text or "")
|
| 195 |
if m:
|
| 196 |
val = m.group(1)
|
| 197 |
normalized = _normalize_piece(val)
|
| 198 |
if normalized:
|
| 199 |
return normalized
|
| 200 |
-
|
| 201 |
-
# 2) fallback legacy: Packaging Material Type
|
| 202 |
m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
|
| 203 |
if m2:
|
| 204 |
seg = m2.group(1)
|
| 205 |
norm = _normalize_piece(seg)
|
| 206 |
if norm:
|
| 207 |
return norm
|
| 208 |
-
|
| 209 |
-
# 3) fallback da Class
|
| 210 |
if cls:
|
| 211 |
norm = _normalize_piece(cls)
|
| 212 |
if norm:
|
|
@@ -218,16 +194,14 @@ def piece_from(text: str, cls: str) -> str:
|
|
| 218 |
if "corrugated" in cls.lower():
|
| 219 |
return "container"
|
| 220 |
if "label" in cls.lower():
|
| 221 |
-
return "LABEL - BACK"
|
| 222 |
-
|
| 223 |
return ""
|
| 224 |
|
| 225 |
-
# --- Nuove colonne: euristiche base
|
| 226 |
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
|
| 227 |
|
| 228 |
def component_from(text: str, piece: str, cls: str) -> str:
|
| 229 |
txt = text.lower()
|
| 230 |
-
# priorità a keyword esplicite
|
| 231 |
if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
|
| 232 |
if "ink foil" in txt: return "Ink foil"
|
| 233 |
if "tape" in txt: return "Tape"
|
|
@@ -236,7 +210,6 @@ def component_from(text: str, piece: str, cls: str) -> str:
|
|
| 236 |
if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
|
| 237 |
if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
|
| 238 |
if "bundle" in txt: return "Bundle"
|
| 239 |
-
# fallback da piece/class
|
| 240 |
if piece: return piece
|
| 241 |
if cls:
|
| 242 |
if "bottle" in cls.lower(): return "Bottle"
|
|
@@ -250,7 +223,6 @@ def function_from(text: str) -> str:
|
|
| 250 |
return m.group(1).title() if m else ""
|
| 251 |
|
| 252 |
def material_ref_gcas_from(text: str) -> str:
|
| 253 |
-
# codici tipo 8 cifre (es. 90082546) o due codici tra parentesi
|
| 254 |
m = re.findall(r"\b(\d{7,9})\b", text or "")
|
| 255 |
if m:
|
| 256 |
seen = set(); out=[]
|
|
@@ -292,15 +264,15 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
|
|
| 292 |
cap = capacity_from(title) or capacity_from(full)
|
| 293 |
color = color_from(full)
|
| 294 |
material = material_from(full)
|
| 295 |
-
piece = piece_from(full, cls)
|
| 296 |
|
| 297 |
-
# nuove colonne
|
| 298 |
comp = component_from(full, piece, cls)
|
| 299 |
func = function_from(full)
|
| 300 |
gcas = material_ref_gcas_from(full)
|
| 301 |
mfam = material_family_from(full)
|
| 302 |
|
| 303 |
-
#
|
| 304 |
wght = weight_from(full)
|
| 305 |
|
| 306 |
return {
|
|
@@ -355,13 +327,13 @@ for up in files:
|
|
| 355 |
raw = up.read()
|
| 356 |
pages = extract_text_pages(raw)
|
| 357 |
|
| 358 |
-
# Se il PDF non ha testo estraibile,
|
| 359 |
if ocr_fallback and not any((p or "").strip() for p in pages):
|
| 360 |
pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
|
| 361 |
|
| 362 |
rec = parse_record(pages, up.name)
|
| 363 |
|
| 364 |
-
# Se Weight è vuoto, OCR rapido
|
| 365 |
if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
|
| 366 |
w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
|
| 367 |
if w_ocr:
|
|
|
|
| 95 |
return (m.group(1).strip() if m else "")
|
| 96 |
|
| 97 |
def material_from(text: str) -> str:
|
|
|
|
| 98 |
for line in (text or "").splitlines():
|
| 99 |
if re.search(r"\bRESIN\b", line, re.I):
|
| 100 |
return line.strip()
|
|
|
|
| 102 |
return m.group(1).strip() if m else ""
|
| 103 |
|
| 104 |
# ======================================================================
|
| 105 |
+
# WEIGHT: prendi TUTTA la riga a partire da "Weight ..." e normalizza spazi/OCR
|
| 106 |
+
# Esempio: "Weight 9 4 +/- 3 g" -> "Weight 94±3g"
|
| 107 |
# ======================================================================
|
| 108 |
+
# Matches, case-insensitively, from the whole word "weight" to the end of that
# line: the match stops at the first \n or \r. The DOTALL flag ("s") has no
# effect here because the pattern contains no ".".
WEIGHT_LINE_RE = re.compile(r"(?is)\bweight\b[^\n\r]*")
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
def _normalize_weight_line(s: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
s = (s or "").strip()
|
| 112 |
+
# comprimi spazi ripetuti
|
| 113 |
+
s = re.sub(r"\s+", " ", s)
|
| 114 |
+
# togli spazi interni tra cifre (OCR: "9 4" -> "94")
|
| 115 |
s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
|
| 116 |
+
# unifica simboli ±
|
| 117 |
+
s = re.sub(r"\+\s*/\s*-\s*|\+\s*-\s*", "±", s)
|
| 118 |
s = s.replace("+-", "±").replace("﹢", "+").replace("-", "-")
|
| 119 |
+
# rimuovi spazi attorno a ±
|
| 120 |
s = re.sub(r"\s*±\s*", "±", s)
|
| 121 |
+
# rimuovi spazi prima dell'unità
|
| 122 |
+
s = re.sub(r"\s+(?=(?:mg|g|kg)\b)", "", s, flags=re.I)
|
| 123 |
+
# punti/virgole
|
| 124 |
s = s.replace(",", ".")
|
| 125 |
return s
|
| 126 |
|
| 127 |
def weight_from(text: str) -> str:
    """Return the normalized "Weight ..." line found in *text*.

    Every match of ``WEIGHT_LINE_RE`` is examined in order. The first one
    that also contains a unit token (``mg``, ``g`` or ``kg`` as a whole word,
    case-insensitive) is returned. If no match carries a unit, the very first
    match is returned instead. The chosen line is passed through
    ``_normalize_weight_line``. An empty *text* or a text without any match
    yields ``""``.
    """
    if not text:
        return ""

    first_hit = None
    for match in WEIGHT_LINE_RE.finditer(text):
        candidate = match.group(0)
        # Prefer the first line that also carries a unit.
        if re.search(r"\b(?:mg|g|kg)\b", candidate, re.I) is not None:
            return _normalize_weight_line(candidate)
        if first_hit is None:
            first_hit = candidate

    # No line with a unit: fall back to the first occurrence, if there is one.
    if first_hit is None:
        return ""
    return _normalize_weight_line(first_hit)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# --------------------- PIECE da "Packaging Component Type" ---------------------
|
| 139 |
_ALLOWED_PIECES = {
|
|
|
|
| 160 |
s2 = s1.lower()
|
| 161 |
s2 = s2.replace("–", "-").replace("—", "-")
|
| 162 |
s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
|
|
|
|
| 163 |
if s2 in _ALLOWED_PIECES:
|
| 164 |
return _ALLOWED_PIECES[s2]
|
|
|
|
| 165 |
s2 = s2.replace(" ", " ")
|
| 166 |
if s2 in _ALLOWED_PIECES:
|
| 167 |
return _ALLOWED_PIECES[s2]
|
|
|
|
| 168 |
for key, canon in _ALLOWED_PIECES.items():
|
| 169 |
if key in s2:
|
| 170 |
return canon
|
| 171 |
return ""
|
| 172 |
|
| 173 |
def piece_from(text: str, cls: str) -> str:
|
|
|
|
| 174 |
m = _PACK_COMP_TYPE_RE.search(text or "")
|
| 175 |
if m:
|
| 176 |
val = m.group(1)
|
| 177 |
normalized = _normalize_piece(val)
|
| 178 |
if normalized:
|
| 179 |
return normalized
|
|
|
|
|
|
|
| 180 |
m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
|
| 181 |
if m2:
|
| 182 |
seg = m2.group(1)
|
| 183 |
norm = _normalize_piece(seg)
|
| 184 |
if norm:
|
| 185 |
return norm
|
|
|
|
|
|
|
| 186 |
if cls:
|
| 187 |
norm = _normalize_piece(cls)
|
| 188 |
if norm:
|
|
|
|
| 194 |
if "corrugated" in cls.lower():
|
| 195 |
return "container"
|
| 196 |
if "label" in cls.lower():
|
| 197 |
+
return "LABEL - BACK"
|
|
|
|
| 198 |
return ""
|
| 199 |
|
| 200 |
+
# --- Nuove colonne: euristiche base
|
| 201 |
# Case-insensitive whole-word match for "Primary", "Secondary" (optionally
# followed by "or Tertiary") or "Tertiary"; the matched text is capture group 1.
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
|
| 202 |
|
| 203 |
def component_from(text: str, piece: str, cls: str) -> str:
|
| 204 |
txt = text.lower()
|
|
|
|
| 205 |
if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
|
| 206 |
if "ink foil" in txt: return "Ink foil"
|
| 207 |
if "tape" in txt: return "Tape"
|
|
|
|
| 210 |
if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
|
| 211 |
if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
|
| 212 |
if "bundle" in txt: return "Bundle"
|
|
|
|
| 213 |
if piece: return piece
|
| 214 |
if cls:
|
| 215 |
if "bottle" in cls.lower(): return "Bottle"
|
|
|
|
| 223 |
return m.group(1).title() if m else ""
|
| 224 |
|
| 225 |
def material_ref_gcas_from(text: str) -> str:
|
|
|
|
| 226 |
m = re.findall(r"\b(\d{7,9})\b", text or "")
|
| 227 |
if m:
|
| 228 |
seen = set(); out=[]
|
|
|
|
| 264 |
cap = capacity_from(title) or capacity_from(full)
|
| 265 |
color = color_from(full)
|
| 266 |
material = material_from(full)
|
| 267 |
+
piece = piece_from(full, cls)
|
| 268 |
|
| 269 |
+
# nuove colonne
|
| 270 |
comp = component_from(full, piece, cls)
|
| 271 |
func = function_from(full)
|
| 272 |
gcas = material_ref_gcas_from(full)
|
| 273 |
mfam = material_family_from(full)
|
| 274 |
|
| 275 |
+
# WEIGHT: prendi l'intera riga "Weight ..."
|
| 276 |
wght = weight_from(full)
|
| 277 |
|
| 278 |
return {
|
|
|
|
| 327 |
raw = up.read()
|
| 328 |
pages = extract_text_pages(raw)
|
| 329 |
|
| 330 |
+
# Se il PDF non ha testo estraibile, OCR completo una sola volta
|
| 331 |
if ocr_fallback and not any((p or "").strip() for p in pages):
|
| 332 |
pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
|
| 333 |
|
| 334 |
rec = parse_record(pages, up.name)
|
| 335 |
|
| 336 |
+
# Se Weight è vuoto, OCR rapido (prime pagine) e stop appena trovato
|
| 337 |
if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
|
| 338 |
w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
|
| 339 |
if w_ocr:
|