martinofumagalli committed on
Commit
74b4e91
·
verified ·
1 Parent(s): 36cf4fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -23
app.py CHANGED
@@ -102,46 +102,53 @@ def material_from(text: str) -> str:
102
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
103
  return m.group(1).strip() if m else ""
104
 
105
- # --- WEIGHT PARSER → restituisce solo il valore, es. "94±3g"
 
 
 
 
 
 
 
 
106
  WEIGHT_VALUE_RE = re.compile(
107
- r"""(?ix)
108
- \bweight\b
109
- [^\n\r]{0,80}? # pochi caratteri sulla stessa riga
110
- (
111
- (?:\d+(?:[.,]\d+)?\s* # valore principale (94 o 94,5)
112
- (?:±|\+/?-|\+-)\s* # simbolo tolleranza (±, +/- o +-)
113
- \d+(?:[.,]\d+)?\s* # tolleranza (3 o 3,0)
114
- (?:mg|g|kg)) # unità
115
- |
116
- (?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)) # fallback: solo "94 g"
117
- )
118
- """,
119
  )
120
 
121
  def _normalize_weight(s: str) -> str:
122
- # compatta spazi, uniforma simboli/virgole → "94±3g"
123
  s = (s or "").strip()
124
- s = s.replace(" ", "")
125
- s = s.replace("+/-", "±").replace("+-", "±")
 
 
 
 
 
 
 
126
  s = s.replace(",", ".")
127
  return s
128
 
129
  def weight_from(text: str) -> str:
130
  if not text:
131
  return ""
132
- # 1) match diretto nel testo complessivo
133
  m = WEIGHT_VALUE_RE.search(text)
134
  if m:
135
  return _normalize_weight(m.group(1))
136
- # 2) fallback riga-per-riga per OCR “sporco”
137
  for line in (text or "").splitlines():
138
  if "weight" in line.lower():
139
- m2 = re.search(
140
- r"(?ix)\bweight\b[^\n\r]*?((?:\d+(?:[.,]\d+)?\s*(?:±|\+/?-|\+-)\s*\d+(?:[.,]\d+)?\s*(?:mg|g|kg))|(?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)))",
141
- line,
142
- )
143
  if m2:
144
  return _normalize_weight(m2.group(1))
 
 
 
 
 
 
 
 
145
  return ""
146
 
147
  # --------------------- PIECE da "Packaging Component Type" ---------------------
@@ -214,7 +221,6 @@ def piece_from(text: str, cls: str) -> str:
214
  return "LABEL - BACK" # scelta neutra se non specificato
215
 
216
  return ""
217
- # ----------------------------------------------------------------------------------------------------
218
 
219
  # --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
220
  FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
 
102
  m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
103
  return m.group(1).strip() if m else ""
104
 
105
# ======================================================================
# WEIGHT PARSER - tolerant of OCR noise (spaces between digits) and of
# the different spellings of the plus-minus sign.
# Returns only the value, e.g. "94±3g".
# ======================================================================
# Number whose digits may be separated by a single whitespace character
# (OCR artefact, "9 4" -> 94), with an optional decimal part.
NUM_SPACED = r"(?:\d(?:\s?\d){0,6}(?:[.,]\d+)?)"
UNIT = r"(?:mg|g|kg)\b"
# Tolerance sign: "±", "+/-" or "+-", where the plus may also be the small
# (U+FE62) or full-width (U+FF0B) form and the minus may be an ASCII hyphen,
# a real minus sign (U+2212), an en dash (U+2013) or a hyphen (U+2010).
PLUSMINUS = r"(?:±|[+\uFE62\uFF0B]\s*/?\s*[\-\u2212\u2013\u2010])"

# A weight value: "<num> ± <num> <unit>" preferred, plain "<num> <unit>" otherwise.
_WEIGHT_VALUE = (
    rf"(?:{NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}"
    rf"|{NUM_SPACED}\s*{UNIT})"
)

# Primary pattern: the word "weight" followed, within 120 characters on the
# same line, by a weight value (captured in group 1).
WEIGHT_VALUE_RE = re.compile(
    rf"\bweight\b[^\n\r]{{0,120}}?({_WEIGHT_VALUE})",
    re.IGNORECASE,
)

# Fallback helpers, compiled once instead of on every loop iteration.
_WEIGHT_LABEL_RE = re.compile(r"weight", re.IGNORECASE)
_WEIGHT_TAIL_RE = re.compile(rf"({_WEIGHT_VALUE})", re.IGNORECASE)
_PLUSMINUS_RE = re.compile(PLUSMINUS)


def _normalize_weight(s: str) -> str:
    """Return a raw weight match in compact form, e.g. ``"9 4 +/- 3 g"`` -> ``"94±3g"``.

    Args:
        s: Text captured by one of the weight patterns; ``None`` or an empty
            string is treated as ``""``.

    Returns:
        The value with OCR spaces between digits removed, every tolerance
        spelling replaced by ``±``, no space before the unit and decimal
        commas turned into points.
    """
    s = (s or "").strip()
    # Drop whitespace only where it sits between two digits.
    s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
    # Map every accepted tolerance spelling to "±" in one pass, so that
    # Unicode plus/minus variants are converted as well.
    s = _PLUSMINUS_RE.sub("±", s)
    # Remove whitespace around "±" and in front of the unit.
    s = re.sub(r"\s*±\s*", "±", s)
    s = re.sub(r"\s*(mg|g|kg)\b", r"\1", s, flags=re.I)
    # Decimal comma -> decimal point.
    return s.replace(",", ".")


def weight_from(text: str) -> str:
    """Extract the weight value that follows the word "weight" in *text*.

    Args:
        text: Text to scan (may be empty or ``None``).

    Returns:
        The normalized value (e.g. ``"94±3g"`` or ``"94.5g"``), or ``""`` when
        no weight value is found.
    """
    if not text:
        return ""

    m = WEIGHT_VALUE_RE.search(text)
    if m:
        return _normalize_weight(m.group(1))

    # Line-by-line fallback: accept "weight" without word boundaries (e.g.
    # glued to a neighbouring word) and without the 120-character window.
    # The label is located on the line itself rather than on a lowercased
    # copy, so the offset is always valid for the slice below.
    for line in text.splitlines():
        label = _WEIGHT_LABEL_RE.search(line)
        if label is None:
            continue
        tail = _WEIGHT_TAIL_RE.search(line[label.end():])
        if tail:
            return _normalize_weight(tail.group(1))
    return ""
153
 
154
  # --------------------- PIECE da "Packaging Component Type" ---------------------
 
221
  return "LABEL - BACK" # scelta neutra se non specificato
222
 
223
  return ""
 
224
 
225
  # --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
226
  FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)