Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,6 +54,23 @@ def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) ->
|
|
| 54 |
texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
|
| 55 |
return texts
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# ======================================================================
|
| 58 |
# PARSING DOMINIO (euristiche/regex leggere)
|
| 59 |
# ======================================================================
|
|
@@ -85,7 +102,7 @@ def material_from(text: str) -> str:
|
|
| 85 |
m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
|
| 86 |
return m.group(1).strip() if m else ""
|
| 87 |
|
| 88 |
-
# --- WEIGHT PARSER — restituisce solo il valore, es. "94±3g"
|
| 89 |
WEIGHT_VALUE_RE = re.compile(
|
| 90 |
r"""(?ix)
|
| 91 |
\bweight\b
|
|
@@ -127,9 +144,7 @@ def weight_from(text: str) -> str:
|
|
| 127 |
return _normalize_weight(m2.group(1))
|
| 128 |
return ""
|
| 129 |
|
| 130 |
-
# ------------------------------------------
|
| 131 |
-
|
| 132 |
-
# --------------------- AGGIUNTA RICHIESTA: PIECE da "Packaging Component Type" ---------------------
|
| 133 |
_ALLOWED_PIECES = {
|
| 134 |
"ribbon": "ribbon",
|
| 135 |
"bottle": "bottle",
|
|
@@ -168,10 +183,6 @@ def _normalize_piece(s: str) -> str:
|
|
| 168 |
return ""
|
| 169 |
|
| 170 |
def piece_from(text: str, cls: str) -> str:
|
| 171 |
-
"""
|
| 172 |
-
1) Cerca 'Packaging Component Type: <valore>' e normalizza al set richiesto.
|
| 173 |
-
2) Se non trovato, usa vecchi fallback (Class/Material Type).
|
| 174 |
-
"""
|
| 175 |
# 1) Packaging Component Type (linea dedicata)
|
| 176 |
m = _PACK_COMP_TYPE_RE.search(text or "")
|
| 177 |
if m:
|
|
@@ -283,7 +294,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
|
|
| 283 |
gcas = material_ref_gcas_from(full)
|
| 284 |
mfam = material_family_from(full)
|
| 285 |
|
| 286 |
-
#
|
| 287 |
wght = weight_from(full)
|
| 288 |
|
| 289 |
return {
|
|
@@ -309,7 +320,7 @@ def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
|
|
| 309 |
# ======================================================================
|
| 310 |
st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
|
| 311 |
st.title("πβπ PDF β Table (OCR-ready)")
|
| 312 |
-
st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR
|
| 313 |
|
| 314 |
with st.sidebar:
|
| 315 |
files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
|
|
@@ -337,13 +348,16 @@ for up in files:
|
|
| 337 |
try:
|
| 338 |
raw = up.read()
|
| 339 |
pages = extract_text_pages(raw)
|
|
|
|
|
|
|
| 340 |
if ocr_fallback and not any((p or "").strip() for p in pages):
|
| 341 |
pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
|
|
|
|
| 342 |
rec = parse_record(pages, up.name)
|
| 343 |
-
|
|
|
|
| 344 |
if (not rec.get("Weight") or rec["Weight"] == "β") and ocr_fallback:
|
| 345 |
-
|
| 346 |
-
w_ocr = weight_from("\n".join(ocr_pages))
|
| 347 |
if w_ocr:
|
| 348 |
rec["Weight"] = w_ocr
|
| 349 |
|
|
@@ -368,4 +382,3 @@ with c2:
|
|
| 368 |
with pd.ExcelWriter(bio, engine="openpyxl") as xw:
|
| 369 |
df.to_excel(xw, index=False, sheet_name="data")
|
| 370 |
st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
| 371 |
-
|
|
|
|
| 54 |
texts.append(pytesseract.image_to_string(img, lang=lang, config=config) or "")
|
| 55 |
return texts
|
| 56 |
|
| 57 |
+
# --- OCR rapido SOLO per il peso (prime pagine, DPI bassi, stop appena trovato)
|
| 58 |
+
def run_ocr_for_weight(pdf_bytes: bytes, lang: str, tesseract_cmd: str | None, max_pages: int = 2, dpi_weight: int = 200) -> str:
|
| 59 |
+
if tesseract_cmd:
|
| 60 |
+
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
|
| 61 |
+
images = convert_from_bytes(pdf_bytes, dpi=dpi_weight, first_page=1, last_page=max_pages)
|
| 62 |
+
config = "--psm 6 -c preserve_interword_spaces=1"
|
| 63 |
+
acc = []
|
| 64 |
+
for img in images:
|
| 65 |
+
if not isinstance(img, Image.Image):
|
| 66 |
+
img = img.convert("RGB")
|
| 67 |
+
txt = pytesseract.image_to_string(img, lang=lang, config=config) or ""
|
| 68 |
+
w = weight_from(txt) # definita sotto
|
| 69 |
+
if w:
|
| 70 |
+
return w
|
| 71 |
+
acc.append(txt)
|
| 72 |
+
return weight_from("\n".join(acc)) or ""
|
| 73 |
+
|
| 74 |
# ======================================================================
|
| 75 |
# PARSING DOMINIO (euristiche/regex leggere)
|
| 76 |
# ======================================================================
|
|
|
|
| 102 |
m = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
|
| 103 |
return m.group(1).strip() if m else ""
|
| 104 |
|
| 105 |
+
# --- WEIGHT PARSER — restituisce solo il valore, es. "94±3g"
|
| 106 |
WEIGHT_VALUE_RE = re.compile(
|
| 107 |
r"""(?ix)
|
| 108 |
\bweight\b
|
|
|
|
| 144 |
return _normalize_weight(m2.group(1))
|
| 145 |
return ""
|
| 146 |
|
| 147 |
+
# --------------------- PIECE da "Packaging Component Type" ---------------------
|
|
|
|
|
|
|
| 148 |
_ALLOWED_PIECES = {
|
| 149 |
"ribbon": "ribbon",
|
| 150 |
"bottle": "bottle",
|
|
|
|
| 183 |
return ""
|
| 184 |
|
| 185 |
def piece_from(text: str, cls: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# 1) Packaging Component Type (linea dedicata)
|
| 187 |
m = _PACK_COMP_TYPE_RE.search(text or "")
|
| 188 |
if m:
|
|
|
|
| 294 |
gcas = material_ref_gcas_from(full)
|
| 295 |
mfam = material_family_from(full)
|
| 296 |
|
| 297 |
+
# estrai peso dal testo (se presente come testo digitale)
|
| 298 |
wght = weight_from(full)
|
| 299 |
|
| 300 |
return {
|
|
|
|
| 320 |
# ======================================================================
|
| 321 |
st.set_page_config(page_title="PDF → Table (OCR-ready)", layout="wide")
|
| 322 |
st.title("πβπ PDF β Table (OCR-ready)")
|
| 323 |
+
st.caption("Carica PDF (anche scansioni). Compilo la tabella con i campi richiesti; OCR mirato per il peso.")
|
| 324 |
|
| 325 |
with st.sidebar:
|
| 326 |
files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
|
|
|
|
| 348 |
try:
|
| 349 |
raw = up.read()
|
| 350 |
pages = extract_text_pages(raw)
|
| 351 |
+
|
| 352 |
+
# Se il PDF non ha testo estraibile, fai OCR completo una sola volta
|
| 353 |
if ocr_fallback and not any((p or "").strip() for p in pages):
|
| 354 |
pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
|
| 355 |
+
|
| 356 |
rec = parse_record(pages, up.name)
|
| 357 |
+
|
| 358 |
+
# Se Weight è vuoto, OCR rapido sulle prime pagine e stop appena trovato
|
| 359 |
if (not rec.get("Weight") or rec["Weight"] == "β") and ocr_fallback:
|
| 360 |
+
w_ocr = run_ocr_for_weight(raw, lang=lang, tesseract_cmd=tess_cmd, max_pages=2, dpi_weight=200)
|
|
|
|
| 361 |
if w_ocr:
|
| 362 |
rec["Weight"] = w_ocr
|
| 363 |
|
|
|
|
| 382 |
with pd.ExcelWriter(bio, engine="openpyxl") as xw:
|
| 383 |
df.to_excel(xw, index=False, sheet_name="data")
|
| 384 |
st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
|
|