martinofumagalli committed on
Commit
97d6f99
·
verified ·
1 Parent(s): 7e449bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -32
app.py CHANGED
@@ -85,47 +85,48 @@ def material_from(text: str) -> str:
85
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
86
  return m.group(1).strip() if m else ""
87
 
88
- # --- (AGGIUNTA) WEIGHT PARSER ----------------------------------------------
89
- WEIGHT_TOL_RE = re.compile(
90
- r"\bWeight\b[^\n\r]{0,15}?([0-9]+(?:[.,][0-9]+)?)\s*(?:±|\+/?-|\+-)\s*([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)?",
91
- re.I,
92
- )
93
- WEIGHT_SIMPLE_RE = re.compile(
94
- r"\bWeight\b[^\n\r]{0,15}?([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)\b",
95
- re.I,
96
- )
97
- WEIGHT_INLINE_RE = re.compile(
98
- r"\b([0-9]+(?:[.,][0-9]+)?)\s*(?:±|\+/?-|\+-)\s*([0-9]+(?:[.,][0-9]+)?)\s*(mg|g|kg)\b",
99
- re.I,
 
 
100
  )
101
 
102
- def _norm_num(s: str) -> str:
103
- return (s or "").replace(",", ".").strip().rstrip(".")
 
 
 
 
 
104
 
105
  def weight_from(text: str) -> str:
106
- # 1) match con tolleranza
107
- m = WEIGHT_TOL_RE.search(text or "")
108
- if m:
109
- val = _norm_num(m.group(1))
110
- tol = _norm_num(m.group(2))
111
- unit = (m.group(3) or "g").lower()
112
- return f"{val} ± {tol} {unit}"
113
- # 2) match semplice con unità
114
- m = WEIGHT_SIMPLE_RE.search(text or "")
115
  if m:
116
- val = _norm_num(m.group(1))
117
- unit = (m.group(2) or "g").lower()
118
- return f"{val} {unit}"
119
- # 3) riga per riga per casi OCR
120
  for line in (text or "").splitlines():
121
  if "weight" in line.lower():
122
- m2 = WEIGHT_INLINE_RE.search(line)
 
 
 
123
  if m2:
124
- val = _norm_num(m2.group(1))
125
- tol = _norm_num(m2.group(2))
126
- unit = (m2.group(3) or "g").lower()
127
- return f"{val} ± {tol} {unit}"
128
  return ""
 
129
  # ---------------------------------------------------------------------------
130
 
131
  # --------------------- AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type" ---------------------
@@ -339,6 +340,13 @@ for up in files:
339
  if ocr_fallback and not any((p or "").strip() for p in pages):
340
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
341
  rec = parse_record(pages, up.name)
 
 
 
 
 
 
 
342
  rows.append(rec)
343
  except Exception as e:
344
  errors.append((up.name, str(e)))
 
85
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
86
  return m.group(1).strip() if m else ""
87
 
88
+ # --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g" -------------
89
+ WEIGHT_VALUE_RE = re.compile(
90
+ r"""(?ix)
91
+ \bweight\b
92
+ [^\n\r]{0,80}? # pochi caratteri sulla stessa riga
93
+ (
94
+ (?:\d+(?:[.,]\d+)?\s* # valore principale (94 o 94,5)
95
+ (?:±|\+/?-|\+-)\s* # simbolo tolleranza (±, +/- o +-)
96
+ \d+(?:[.,]\d+)?\s* # tolleranza (3 o 3,0)
97
+ (?:mg|g|kg)) # unità
98
+ |
99
+ (?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)) # fallback: solo "94 g"
100
+ )
101
+ """,
102
  )
103
 
104
+ def _normalize_weight(s: str) -> str:
105
+ # compatta spazi, uniforma simboli/virgole → "94±3g"
106
+ s = (s or "").strip()
107
+ s = s.replace(" ", "")
108
+ s = s.replace("+/-", "±").replace("+-", "±")
109
+ s = s.replace(",", ".")
110
+ return s
111
 
112
  def weight_from(text: str) -> str:
113
+ if not text:
114
+ return ""
115
+ # 1) match diretto nel testo complessivo
116
+ m = WEIGHT_VALUE_RE.search(text)
 
 
 
 
 
117
  if m:
118
+ return _normalize_weight(m.group(1))
119
+ # 2) fallback riga-per-riga per OCR “sporco”
 
 
120
  for line in (text or "").splitlines():
121
  if "weight" in line.lower():
122
+ m2 = re.search(
123
+ r"(?ix)\bweight\b[^\n\r]*?((?:\d+(?:[.,]\d+)?\s*(?:±|\+/?-|\+-)\s*\d+(?:[.,]\d+)?\s*(?:mg|g|kg))|(?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)))",
124
+ line,
125
+ )
126
  if m2:
127
+ return _normalize_weight(m2.group(1))
 
 
 
128
  return ""
129
+
130
  # ---------------------------------------------------------------------------
131
 
132
  # --------------------- AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type" ---------------------
 
340
  if ocr_fallback and not any((p or "").strip() for p in pages):
341
  pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
342
  rec = parse_record(pages, up.name)
343
+ # Se Weight è vuoto, prova un pass OCR dedicato solo per il peso
344
+ if (not rec.get("Weight") or rec["Weight"] == "–") and ocr_fallback:
345
+ ocr_pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
346
+ w_ocr = weight_from("\n".join(ocr_pages))
347
+ if w_ocr:
348
+ rec["Weight"] = w_ocr
349
+
350
  rows.append(rec)
351
  except Exception as e:
352
  errors.append((up.name, str(e)))