martinofumagalli committed on
Commit
edee7ce
·
verified ·
1 Parent(s): 74b4e91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -55
app.py CHANGED
@@ -95,7 +95,6 @@ def color_from(text: str) -> str:
95
  return (m.group(1).strip() if m else "")
96
 
97
  def material_from(text: str) -> str:
98
- # cattura righe con "RESIN" o frasi simili
99
  for line in (text or "").splitlines():
100
  if re.search(r"\bRESIN\b", line, re.I):
101
  return line.strip()
@@ -103,53 +102,38 @@ def material_from(text: str) -> str:
103
  return m.group(1).strip() if m else ""
104
 
105
  # ======================================================================
106
- # WEIGHT PARSER robusto su OCR (spazi tra cifre) e simboli ± varianti
107
- # Ritorna solo il valore es. "94±3g"
108
  # ======================================================================
109
- # numero con possibili spazi interni tra cifre (OCR): "9 4" -> 94
110
- NUM_SPACED = r"(?:\d(?:\s?\d){0,6}(?:[.,]\d+)?)"
111
- UNIT = r"(?:mg|g|kg)\b"
112
- PLUSMINUS = r"(?:±|\+\s*/?\s*-\s*|[+﹢]\s*[-\-])" # ±, +/-, +-, varianti
113
 
114
- WEIGHT_VALUE_RE = re.compile(
115
- rf"(?is)\bweight\b[^\n\r]{{0,120}}?({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})"
116
- )
117
-
118
- def _normalize_weight(s: str) -> str:
119
  s = (s or "").strip()
120
- # togli spazi solo tra cifre
 
 
121
  s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
122
- # uniforma simboli ±
123
- s = re.sub(r"\+\s*/?\s*-\s*", "±", s)
124
  s = s.replace("+-", "±").replace("﹢", "+").replace("-", "-")
125
- # togli spazi attorno a ± e prima dell'unità
126
  s = re.sub(r"\s*±\s*", "±", s)
127
- s = re.sub(r"\s*(mg|g|kg)\b", r"\1", s, flags=re.I)
128
- # virgole -> punti
 
129
  s = s.replace(",", ".")
130
  return s
131
 
132
  def weight_from(text: str) -> str:
133
  if not text:
134
  return ""
135
- m = WEIGHT_VALUE_RE.search(text)
136
- if m:
137
- return _normalize_weight(m.group(1))
138
- # Fallback riga-per-riga
139
- for line in (text or "").splitlines():
140
- if "weight" in line.lower():
141
- m2 = WEIGHT_VALUE_RE.search(line)
142
- if m2:
143
- return _normalize_weight(m2.group(1))
144
- # prova su substring dopo "weight"
145
- try:
146
- idx = line.lower().index("weight") + len("weight")
147
- m3 = re.search(rf"({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})", line[idx:], re.I)
148
- if m3:
149
- return _normalize_weight(m3.group(1))
150
- except Exception:
151
- pass
152
- return ""
153
 
154
  # --------------------- PIECE da "Packaging Component Type" ---------------------
155
  _ALLOWED_PIECES = {
@@ -176,37 +160,29 @@ def _normalize_piece(s: str) -> str:
176
  s2 = s1.lower()
177
  s2 = s2.replace("–", "-").replace("—", "-")
178
  s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
179
- # prova match diretto
180
  if s2 in _ALLOWED_PIECES:
181
  return _ALLOWED_PIECES[s2]
182
- # prova alcune normalizzazioni
183
  s2 = s2.replace(" ", " ")
184
  if s2 in _ALLOWED_PIECES:
185
  return _ALLOWED_PIECES[s2]
186
- # fallback per frasi lunghe: cerca la keyword migliore
187
  for key, canon in _ALLOWED_PIECES.items():
188
  if key in s2:
189
  return canon
190
  return ""
191
 
192
  def piece_from(text: str, cls: str) -> str:
193
- # 1) Packaging Component Type (linea dedicata)
194
  m = _PACK_COMP_TYPE_RE.search(text or "")
195
  if m:
196
  val = m.group(1)
197
  normalized = _normalize_piece(val)
198
  if normalized:
199
  return normalized
200
-
201
- # 2) fallback legacy: Packaging Material Type
202
  m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
203
  if m2:
204
  seg = m2.group(1)
205
  norm = _normalize_piece(seg)
206
  if norm:
207
  return norm
208
-
209
- # 3) fallback da Class
210
  if cls:
211
  norm = _normalize_piece(cls)
212
  if norm:
@@ -218,16 +194,14 @@ def piece_from(text: str, cls: str) -> str:
218
  if "corrugated" in cls.lower():
219
  return "container"
220
  if "label" in cls.lower():
221
- return "LABEL - BACK" # scelta neutra se non specificato
222
-
223
  return ""
224
 
225
- # --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
226
  FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
227
 
228
  def component_from(text: str, piece: str, cls: str) -> str:
229
  txt = text.lower()
230
- # priorità a keyword esplicite
231
  if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
232
  if "ink foil" in txt: return "Ink foil"
233
  if "tape" in txt: return "Tape"
@@ -236,7 +210,6 @@ def component_from(text: str, piece: str, cls: str) -> str:
236
  if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
237
  if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
238
  if "bundle" in txt: return "Bundle"
239
- # fallback da piece/class
240
  if piece: return piece
241
  if cls:
242
  if "bottle" in cls.lower(): return "Bottle"
@@ -250,7 +223,6 @@ def function_from(text: str) -> str:
250
  return m.group(1).title() if m else ""
251
 
252
  def material_ref_gcas_from(text: str) -> str:
253
- # codici tipo 8 cifre (es. 90082546) o due codici tra parentesi
254
  m = re.findall(r"\b(\d{7,9})\b", text or "")
255
  if m:
256
  seen = set(); out=[]
@@ -292,15 +264,15 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
292
  cap = capacity_from(title) or capacity_from(full)
293
  color = color_from(full)
294
  material = material_from(full)
295
- piece = piece_from(full, cls) # <-- usa la nuova logica
296
 
297
- # nuove colonne (euristiche leggere)
298
  comp = component_from(full, piece, cls)
299
  func = function_from(full)
300
  gcas = material_ref_gcas_from(full)
301
  mfam = material_family_from(full)
302
 
303
- # estrai peso dal testo (se presente come testo digitale)
304
  wght = weight_from(full)
305
 
306
  return {
@@ -355,13 +327,13 @@ for up in files:
355
  raw = up.read()
356
  pages = extract_text_pages(raw)
357
 
358
- # Se il PDF non ha testo estraibile, fai OCR completo una sola volta
359
  if ocr_fallback and not any((p or "").strip() for p in pages):
360
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
361
 
362
  rec = parse_record(pages, up.name)
363
 
364
- # Se Weight è vuoto, OCR rapido sulle prime pagine e stop appena trovato
365
  if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
366
  w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
367
  if w_ocr:
 
95
  return (m.group(1).strip() if m else "")
96
 
97
  def material_from(text: str) -> str:
 
98
  for line in (text or "").splitlines():
99
  if re.search(r"\bRESIN\b", line, re.I):
100
  return line.strip()
 
102
  return m.group(1).strip() if m else ""
103
 
104
  # ======================================================================
105
+ # WEIGHT: prendi TUTTA la riga a partire da "Weight ..." e normalizza spazi/OCR
106
+ # Esempio: "Weight 9 4 +/- 3 g" -> "Weight 94±3g"
107
  # ======================================================================
108
+ WEIGHT_LINE_RE = re.compile(r"(?is)\bweight\b[^\n\r]*")
 
 
 
109
 
110
+ def _normalize_weight_line(s: str) -> str:
 
 
 
 
111
  s = (s or "").strip()
112
+ # comprimi spazi ripetuti
113
+ s = re.sub(r"\s+", " ", s)
114
+ # togli spazi interni tra cifre (OCR: "9 4" -> "94")
115
  s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
116
+ # unifica simboli ±
117
+ s = re.sub(r"\+\s*/\s*-\s*|\+\s*-\s*", "±", s)
118
  s = s.replace("+-", "±").replace("﹢", "+").replace("-", "-")
119
+ # rimuovi spazi attorno a ±
120
  s = re.sub(r"\s*±\s*", "±", s)
121
+ # rimuovi spazi prima dell'unità
122
+ s = re.sub(r"\s+(?=(?:mg|g|kg)\b)", "", s, flags=re.I)
123
+ # punti/virgole
124
  s = s.replace(",", ".")
125
  return s
126
 
127
  def weight_from(text: str) -> str:
128
  if not text:
129
  return ""
130
+ # preferisci la prima riga che contiene anche l'unità
131
+ lines = [m.group(0) for m in WEIGHT_LINE_RE.finditer(text)]
132
+ for ln in lines:
133
+ if re.search(r"\b(?:mg|g|kg)\b", ln, re.I):
134
+ return _normalize_weight_line(ln)
135
+ # se non trovata unità, restituisci comunque la prima occorrenza normalizzata
136
+ return _normalize_weight_line(lines[0]) if lines else ""
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  # --------------------- PIECE da "Packaging Component Type" ---------------------
139
  _ALLOWED_PIECES = {
 
160
  s2 = s1.lower()
161
  s2 = s2.replace("–", "-").replace("—", "-")
162
  s2 = s2.replace("label- ", "label ").replace(" -", " - ").strip()
 
163
  if s2 in _ALLOWED_PIECES:
164
  return _ALLOWED_PIECES[s2]
 
165
  s2 = s2.replace(" ", " ")
166
  if s2 in _ALLOWED_PIECES:
167
  return _ALLOWED_PIECES[s2]
 
168
  for key, canon in _ALLOWED_PIECES.items():
169
  if key in s2:
170
  return canon
171
  return ""
172
 
173
  def piece_from(text: str, cls: str) -> str:
 
174
  m = _PACK_COMP_TYPE_RE.search(text or "")
175
  if m:
176
  val = m.group(1)
177
  normalized = _normalize_piece(val)
178
  if normalized:
179
  return normalized
 
 
180
  m2 = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text or "", re.I)
181
  if m2:
182
  seg = m2.group(1)
183
  norm = _normalize_piece(seg)
184
  if norm:
185
  return norm
 
 
186
  if cls:
187
  norm = _normalize_piece(cls)
188
  if norm:
 
194
  if "corrugated" in cls.lower():
195
  return "container"
196
  if "label" in cls.lower():
197
+ return "LABEL - BACK"
 
198
  return ""
199
 
200
+ # --- Nuove colonne: euristiche base
201
  FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
202
 
203
  def component_from(text: str, piece: str, cls: str) -> str:
204
  txt = text.lower()
 
205
  if "ink" in txt and "cartridge" in txt: return "Ink cartridge"
206
  if "ink foil" in txt: return "Ink foil"
207
  if "tape" in txt: return "Tape"
 
210
  if "cartonboard" in txt or "sheet" in txt: return "Cartonboard / Sheet"
211
  if "corrugated" in txt or "case" in txt or "outercase" in txt: return "Corrugated box"
212
  if "bundle" in txt: return "Bundle"
 
213
  if piece: return piece
214
  if cls:
215
  if "bottle" in cls.lower(): return "Bottle"
 
223
  return m.group(1).title() if m else ""
224
 
225
  def material_ref_gcas_from(text: str) -> str:
 
226
  m = re.findall(r"\b(\d{7,9})\b", text or "")
227
  if m:
228
  seen = set(); out=[]
 
264
  cap = capacity_from(title) or capacity_from(full)
265
  color = color_from(full)
266
  material = material_from(full)
267
+ piece = piece_from(full, cls)
268
 
269
+ # nuove colonne
270
  comp = component_from(full, piece, cls)
271
  func = function_from(full)
272
  gcas = material_ref_gcas_from(full)
273
  mfam = material_family_from(full)
274
 
275
+ # WEIGHT: prendi l'intera riga "Weight ..."
276
  wght = weight_from(full)
277
 
278
  return {
 
327
  raw = up.read()
328
  pages = extract_text_pages(raw)
329
 
330
+ # Se il PDF non ha testo estraibile, OCR completo una sola volta
331
  if ocr_fallback and not any((p or "").strip() for p in pages):
332
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
333
 
334
  rec = parse_record(pages, up.name)
335
 
336
+ # Se Weight è vuoto, OCR rapido (prime pagine) e stop appena trovato
337
  if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
338
  w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
339
  if w_ocr: