Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -102,46 +102,53 @@ def material_from(text: str) -> str:
|
|
| 102 |
m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
|
| 103 |
return m.group(1).strip() if m else ""
|
| 104 |
|
| 105 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
WEIGHT_VALUE_RE = re.compile(
|
| 107 |
-
|
| 108 |
-
\bweight\b
|
| 109 |
-
[^\n\r]{0,80}? # pochi caratteri sulla stessa riga
|
| 110 |
-
(
|
| 111 |
-
(?:\d+(?:[.,]\d+)?\s* # valore principale (94 o 94,5)
|
| 112 |
-
(?:±|\+/?-|\+-)\s* # simbolo tolleranza (±, +/- o +-)
|
| 113 |
-
\d+(?:[.,]\d+)?\s* # tolleranza (3 o 3,0)
|
| 114 |
-
(?:mg|g|kg)) # unità
|
| 115 |
-
|
|
| 116 |
-
(?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)) # fallback: solo "94 g"
|
| 117 |
-
)
|
| 118 |
-
""",
|
| 119 |
)
|
| 120 |
|
| 121 |
def _normalize_weight(s: str) -> str:
|
| 122 |
-
# compatta spazi, uniforma simboli/virgole → "94±3g"
|
| 123 |
s = (s or "").strip()
|
| 124 |
-
|
| 125 |
-
s =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
s = s.replace(",", ".")
|
| 127 |
return s
|
| 128 |
|
| 129 |
def weight_from(text: str) -> str:
|
| 130 |
if not text:
|
| 131 |
return ""
|
| 132 |
-
# 1) match diretto nel testo complessivo
|
| 133 |
m = WEIGHT_VALUE_RE.search(text)
|
| 134 |
if m:
|
| 135 |
return _normalize_weight(m.group(1))
|
| 136 |
-
#
|
| 137 |
for line in (text or "").splitlines():
|
| 138 |
if "weight" in line.lower():
|
| 139 |
-
m2 =
|
| 140 |
-
r"(?ix)\bweight\b[^\n\r]*?((?:\d+(?:[.,]\d+)?\s*(?:±|\+/?-|\+-)\s*\d+(?:[.,]\d+)?\s*(?:mg|g|kg))|(?:\d+(?:[.,]\d+)?\s*(?:mg|g|kg)))",
|
| 141 |
-
line,
|
| 142 |
-
)
|
| 143 |
if m2:
|
| 144 |
return _normalize_weight(m2.group(1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
return ""
|
| 146 |
|
| 147 |
# --------------------- PIECE da "Packaging Component Type" ---------------------
|
|
@@ -214,7 +221,6 @@ def piece_from(text: str, cls: str) -> str:
|
|
| 214 |
return "LABEL - BACK" # scelta neutra se non specificato
|
| 215 |
|
| 216 |
return ""
|
| 217 |
-
# ----------------------------------------------------------------------------------------------------
|
| 218 |
|
| 219 |
# --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
|
| 220 |
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
|
|
|
|
| 102 |
m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
|
| 103 |
return m.group(1).strip() if m else ""
|
| 104 |
|
| 105 |
+
# ======================================================================
# WEIGHT PARSER -> tolerant of OCR noise (whitespace between digits) and
# of the different spellings of the plus/minus sign.
# The capture group holds only the raw value text, e.g. "9 4 +/- 3 g".
# ======================================================================

# A number whose digits may be separated by single whitespace characters
# (OCR: "9 4" -> 94): 1 to 7 digits, optionally followed by a decimal part
# introduced by "." or ",".
NUM_SPACED = r"(?:\d(?:\s?\d){0,6}(?:[.,]\d+)?)"

# Mass unit; the trailing \b keeps "g" from matching the start of a longer word.
UNIT = r"(?:mg|g|kg)\b"

# Tolerance sign: "±", or a plus ("+", U+FE62 small plus, U+FF0B full-width
# plus) followed by an optional "/" and a minus ("-" or U+2212 minus sign),
# with optional whitespace between the parts.
PLUSMINUS = r"(?:±|[+\uFE62\uFF0B]\s*/?\s*[-\u2212]\s*)"

# The word "weight", then at most 120 characters on the same line (lazy, so
# the value closest to the keyword wins), then the value.  The form with a
# tolerance is tried first; a bare "<number> <unit>" is the fallback.
WEIGHT_VALUE_RE = re.compile(
    rf"(?i)\bweight\b[^\n\r]{{0,120}}?"
    rf"({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})"
)
|
| 117 |
|
| 118 |
def _normalize_weight(s: str) -> str:
|
|
|
|
| 119 |
s = (s or "").strip()
|
| 120 |
+
# togli spazi solo tra cifre
|
| 121 |
+
s = re.sub(r"(?<=\d)\s+(?=\d)", "", s)
|
| 122 |
+
# uniforma simboli ±
|
| 123 |
+
s = re.sub(r"\+\s*/?\s*-\s*", "±", s)
|
| 124 |
+
s = s.replace("+-", "±").replace("﹢", "+").replace("-", "-")
|
| 125 |
+
# togli spazi attorno a ± e prima dell'unità
|
| 126 |
+
s = re.sub(r"\s*±\s*", "±", s)
|
| 127 |
+
s = re.sub(r"\s*(mg|g|kg)\b", r"\1", s, flags=re.I)
|
| 128 |
+
# virgole -> punti
|
| 129 |
s = s.replace(",", ".")
|
| 130 |
return s
|
| 131 |
|
| 132 |
# Locates the keyword inside a line without lowercasing the line, so the match
# offset always refers to the original string.
_WEIGHT_KEYWORD_RE = re.compile(r"weight", re.I)

# Value (with or without tolerance) looked up in the remainder of a line once
# the keyword has been found; compiled once instead of on every line.
_WEIGHT_TAIL_RE = re.compile(
    rf"({NUM_SPACED}\s*{PLUSMINUS}\s*{NUM_SPACED}\s*{UNIT}|{NUM_SPACED}\s*{UNIT})",
    re.I,
)


def weight_from(text: str) -> str:
    """Extract the weight value from *text* and return it normalised.

    Args:
        text: Text to scan; may be empty or ``None``.

    Returns:
        The value as produced by ``_normalize_weight``, or ``""`` when no
        weight is found.
    """
    if not text:
        return ""

    # 1) Strict search: whole word "weight" followed, within the bounded
    #    same-line window defined by WEIGHT_VALUE_RE, by the value.
    m = WEIGHT_VALUE_RE.search(text)
    if m:
        return _normalize_weight(m.group(1))

    # 2) Line-by-line fallback: the keyword may be part of a longer word and
    #    the value may sit anywhere after its first occurrence on the line.
    #    (Re-running WEIGHT_VALUE_RE per line would be pointless: anything it
    #    matches inside a single line it also matches in the whole text.)
    for line in text.splitlines():
        keyword = _WEIGHT_KEYWORD_RE.search(line)
        if keyword is None:
            continue
        tail = _WEIGHT_TAIL_RE.search(line[keyword.end():])
        if tail:
            return _normalize_weight(tail.group(1))
    return ""
|
| 153 |
|
| 154 |
# --------------------- PIECE da "Packaging Component Type" ---------------------
|
|
|
|
| 221 |
return "LABEL - BACK" # scelta neutra se non specificato
|
| 222 |
|
| 223 |
return ""
|
|
|
|
| 224 |
|
| 225 |
# --- Nuove colonne: euristiche base (si possono migliorare con esempi reali)
|
| 226 |
# Matches, case-insensitively and on word boundaries, one of "Primary",
# "Secondary" (optionally followed by "or Tertiary"), or "Tertiary";
# group 1 holds the matched text.
FUNCTION_RE = re.compile(r"\b(Primary|Secondary(?:\s*or\s*Tertiary)?|Tertiary)\b", re.I)
|