Spaces:

TuttiQuantis
/

PPWR_APP

Sleeping

App Files Files Community

martinofumagalli committed on Oct 2, 2025

Commit

f527521

verified ·

1 Parent(s): b686911

Upload app.py

Browse files

Files changed (1) hide show

app.py +175 -0

app.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import io, os, re
+from typing import List, Dict, Tuple
+import streamlit as st
+import pandas as pd
+# PDF text
+import pdfplumber
+from pypdf import PdfReader
+# OCR
+from pdf2image import convert_from_bytes
+import pytesseract
+from PIL import Image
# Column names, in display order, used to build the result table.
SCHEMA = [
    "Piece",
    "SKU",
    "Title",
    "Capacity",
    "% Recycled",
    "Weight",
    "Color",
    "Material / Resin",
    "Class",
    "Source File",
]
+# ------------------ low-level extractors ------------------
def extract_text_pages(pdf_bytes: bytes) -> List[str]:
    """Return the embedded text of each PDF page, one string per page.

    pdfplumber is tried first. If it fails, or if every page it returns is
    empty or whitespace-only, pypdf is tried instead. Any exception raised by
    either library is swallowed on purpose: the function is best-effort and
    returns an empty list when nothing could be read.
    """
    # First attempt: pdfplumber.
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            collected = [page.extract_text() or "" for page in document.pages]
    except Exception:
        collected = []

    if any((chunk or "").strip() for chunk in collected):
        return collected

    # Second attempt: pypdf, used only when pdfplumber produced no text at all.
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        return [(page.extract_text() or "") for page in reader.pages]
    except Exception:
        return []
def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
    """Rasterize a PDF and OCR every page, returning one string per page.

    Args:
        pdf_bytes: Raw content of the PDF file.
        lang: Language argument forwarded to ``pytesseract.image_to_string``.
        dpi: Resolution forwarded to ``convert_from_bytes``.
        tesseract_cmd: Optional path to the Tesseract executable. When given,
            it is stored on ``pytesseract.pytesseract.tesseract_cmd``, which
            is a module-level setting and therefore persists after the call.
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    tesseract_options = "--psm 6 -c preserve_interword_spaces=1"

    def _recognize(page_image) -> str:
        # Anything that is not already a PIL image is converted to RGB first.
        if not isinstance(page_image, Image.Image):
            page_image = page_image.convert("RGB")
        return pytesseract.image_to_string(page_image, lang=lang, config=tesseract_options) or ""

    rendered_pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    return [_recognize(page_image) for page_image in rendered_pages]
+# ------------------ domain parsing ------------------
# Identifier following a "Name", "SKU", "Part" or "Part No." label, with an
# optional ":" or "#" separator; the identifier must be at least 5 characters.
SKU_RE = re.compile(r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})", re.I)
# Rest of the line after a "Title:" or "Title -" label.
TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.I)
# Letters/spaces following the standalone word "Class". The trailing \b keeps
# words such as "Classification" from matching, and the optional ":"/"-"
# separator accepts "Class: Bottles" as well as "Class Bottles".
CLASS_RE = re.compile(r"\bClass\b\s*[:\-]?\s*([A-Za-z ]+)", re.I)
def first(text: str, pattern: re.Pattern, group: int = 1) -> str:
    """Return the stripped capture *group* of the first match of *pattern*.

    A falsy *text* (``None`` or ``""``) is searched as the empty string.
    Returns ``""`` when the pattern does not match.
    """
    match = pattern.search(text or "")
    if match is None:
        return ""
    return match.group(group).strip()
# Number (with "." or "," as decimal mark) followed by a volume unit.
# "ml" and the long spellings are listed before the bare "l" so the longest
# unit wins; the trailing \b rejects things like "5 lb".
_CAPACITY_RE = re.compile(
    r"([0-9]+(?:[.,][0-9]+)?)\s*(ml|lit(?:er|re)s?|l)\b",
    re.I,
)


def capacity_from(text: str) -> str:
    """Extract the first volume found in *text*, normalized as ``"<n> L"`` or ``"<n> ml"``.

    Recognized units (case-insensitive): ``ml``, ``l``, ``liter``, ``liters``,
    ``litre``, ``litres``. A decimal comma is rewritten as a decimal point.

    Args:
        text: Text to search; ``None`` and ``""`` are both accepted.

    Returns:
        The normalized capacity, or ``""`` when no volume is found.
    """
    match = _CAPACITY_RE.search(text or "")
    if match is None:
        return ""
    amount = match.group(1).replace(",", ".")
    # Every unit other than millilitres is a spelling of "litre".
    unit = "ml" if match.group(2).lower() == "ml" else "L"
    return f"{amount} {unit}"
# Value following a standalone "Part Color" / "Color" label. The word
# boundaries keep "Colored", "Colors", "Multicolor" etc. from being treated
# as a label.
_LABELLED_COLOR_RE = re.compile(
    r"\b(?:Part\s*Color|Color)\b\s*[:\-]?\s*([A-Z ]{3,})",
    re.I,
)
# Upper-case phrase that contains one of the known color words
# (deliberately case-sensitive).
_UPPERCASE_COLOR_RE = re.compile(
    r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b"
)


def color_from(text: str) -> str:
    """Extract a color description from *text*.

    The value after a "Part Color" or "Color" label is preferred. Failing
    that, an upper-case phrase containing a known color word (GREEN,
    TRANSPARENT, WHITE, BLACK, BLUE, RED, CLEAR) is used.

    Args:
        text: Text to search; ``None`` and ``""`` are both accepted.

    Returns:
        The stripped color string, or ``""`` when nothing is found.
    """
    body = text or ""
    labelled = _LABELLED_COLOR_RE.search(body)
    if labelled:
        return labelled.group(1).strip()
    phrase = _UPPERCASE_COLOR_RE.search(body)
    return phrase.group(1).strip() if phrase else ""
_RESIN_WORD_RE = re.compile(r"\bRESIN\b", re.I)
_SERIOPLAST_RESIN_RE = re.compile(r"(SERIOPLAST.*?RESIN)", re.I)


def material_from(text: str) -> str:
    """Extract the material / resin description from *text*.

    The first line containing the standalone word "RESIN" (any case) is
    returned stripped. If no such line exists, the shortest span running from
    "SERIOPLAST" to "RESIN" on a single line is used instead; this covers
    cases such as "RESINS", where "RESIN" is not a standalone word.

    Args:
        text: Text to search; ``None`` and ``""`` are both accepted.

    Returns:
        The material description, or ``""`` when nothing is found.
    """
    # Normalize once so that both lookups tolerate None.
    body = text or ""
    for line in body.splitlines():
        if _RESIN_WORD_RE.search(line):
            return line.strip()
    match = _SERIOPLAST_RESIN_RE.search(body)
    return match.group(1).strip() if match else ""
_PACKAGING_TYPE_RE = re.compile(r"Packaging\s*Material\s*Type\s*([^\n]+)", re.I)
_PIECE_KEYWORD_RE = re.compile(r"\b(Bottle|Cap|Container|Lid|Carton|Case)\b", re.I)
# (substring looked for in the class, resulting piece), checked in order.
_CLASS_TO_PIECE = (
    ("Bottle", "Bottle"),
    ("Cap", "Cap"),
    ("Closures", "Cap"),
    ("Corrugated", "Container"),
)


def piece_from(text: str, cls: str) -> str:
    """Determine the kind of piece (Bottle, Cap, Container, ...).

    The "Packaging Material Type" line is inspected first, e.g.
    "Packaging Material Type Rigid- Bottle" gives "Bottle". If it yields no
    known keyword, the piece is derived from *cls* by case-sensitive
    substring lookup ("Bottles" -> "Bottle", "Closures" -> "Cap",
    "Corrugated" -> "Container").

    Args:
        text: Full document text; ``None`` and ``""`` are both accepted.
        cls: Previously extracted class value; may be empty or ``None``.

    Returns:
        The piece name, or ``""`` when it cannot be determined.
    """
    type_line = _PACKAGING_TYPE_RE.search(text or "")
    if type_line:
        keyword = _PIECE_KEYWORD_RE.search(type_line.group(1))
        if keyword:
            return keyword.group(1).capitalize()
    if cls:
        for needle, piece in _CLASS_TO_PIECE:
            if needle in cls:
                return piece
    return ""
def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
    """Build one table row from the page texts of a single PDF.

    The pages are joined with newlines and searched as one document. The
    capacity is taken from the title when the title contains one, otherwise
    from the whole document. "% Recycled" and "Weight" are not extracted and
    are always filled with an en dash placeholder.

    Args:
        pages: Text of each page; an empty or missing list is treated as a
            single empty page.
        source_name: File name stored in the "Source File" column.

    Returns:
        A dict with the row's column names as keys and strings as values.
    """
    document = "\n".join(pages or [""])
    title = first(document, TITLE_RE)
    cls = first(document, CLASS_RE)
    capacity = capacity_from(title) or capacity_from(document)
    return {
        "Piece": piece_from(document, cls) or "",
        "SKU": first(document, SKU_RE) or "",
        "Title": title or "",
        "Capacity": capacity or "",
        "% Recycled": "–",
        "Weight": "–",
        "Color": color_from(document) or "",
        "Material / Resin": material_from(document) or "",
        "Class": cls or "",
        "Source File": source_name,
    }
+# ------------------ UI ------------------
# Page header.
st.set_page_config(page_title="PDF → Table (OCR)", layout="wide")
st.title("📄→📊 PDF → Table (OCR-ready)")
st.caption(
    "Carica PDF (anche scansioni). Per ogni file compilo: "
    "Piece, SKU, Title, Capacity, % Recycled, Weight, Color, "
    "Material / Resin, Class, Source File."
)

# Sidebar: file upload, OCR settings and the button that starts extraction.
with st.sidebar:
    files = st.file_uploader(
        "Seleziona PDF",
        type=["pdf"],
        accept_multiple_files=True,
    )
    st.markdown("---")
    st.subheader("OCR")
    ocr_fallback = st.checkbox("Usa OCR se non c'è testo", value=True)
    ocr_lang = st.text_input("Lingue OCR (comma)", value="eng,ita")
    ocr_dpi = st.number_input(
        "DPI OCR",
        min_value=200,
        max_value=600,
        value=300,
        step=50,
    )
    tess_path = st.text_input("Percorso Tesseract (se non nel PATH)", value="")
    run_btn = st.button("▶️ Estrai")
# Stop the script run until the button has been pressed...
if not run_btn:
    st.info("Carica i PDF e premi **Estrai**.")
    st.stop()

# ...and at least one PDF has been uploaded.
if not files:
    st.warning("Nessun PDF caricato.")
    st.stop()

# Turn the comma-separated language list into "+"-joined form,
# falling back to "eng" when the field holds no usable entry.
lang_codes = [code for code in (part.strip() for part in ocr_lang.split(",")) if code]
lang = "+".join(lang_codes) if lang_codes else "eng"

# A blank path means "use whatever Tesseract is on the PATH".
tess_cmd = tess_path.strip()
if not tess_cmd:
    tess_cmd = None
# Process each upload independently so one bad file does not abort the batch.
rows = []
errors = []
for uploaded in files:
    try:
        pdf_bytes = uploaded.read()
        page_texts = extract_text_pages(pdf_bytes)
        has_text = any((chunk or "").strip() for chunk in page_texts)
        if ocr_fallback and not has_text:
            page_texts = run_ocr(
                pdf_bytes,
                lang=lang,
                dpi=int(ocr_dpi),
                tesseract_cmd=tess_cmd,
            )
        # If a PDF ever holds several SKUs, this is where a list of records
        # could be produced instead of a single one.
        rows.append(parse_record(page_texts, uploaded.name))
    except Exception as exc:
        errors.append((uploaded.name, str(exc)))

# Report per-file failures inside a collapsible section.
if errors:
    with st.expander("Errori"):
        for failed_name, message in errors:
            st.error(f"{failed_name}: {message}")
# Show the collected rows as a table.
df = pd.DataFrame(rows, columns=SCHEMA)
row_count = len(df)
plural_suffix = "a" if row_count == 1 else "e"
st.success(f"Creat{plural_suffix} {row_count} riga/e.")
st.dataframe(df, use_container_width=True)

# Offer the same table as CSV and as an Excel workbook, side by side.
csv_col, xlsx_col = st.columns(2)
with csv_col:
    csv_payload = df.to_csv(index=False).encode("utf-8")
    st.download_button("⬇️ CSV", csv_payload, file_name="table.csv", mime="text/csv")
with xlsx_col:
    workbook_buffer = io.BytesIO()
    with pd.ExcelWriter(workbook_buffer, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="data")
    st.download_button(
        "⬇️ Excel",
        workbook_buffer.getvalue(),
        file_name="table.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )