martinofumagalli committed on
Commit
f527521
·
verified ·
1 Parent(s): b686911

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -0
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, os, re
2
+ from typing import List, Dict, Tuple
3
+ import streamlit as st
4
+ import pandas as pd
5
+
6
+ # PDF text
7
+ import pdfplumber
8
+ from pypdf import PdfReader
9
+
10
+ # OCR
11
+ from pdf2image import convert_from_bytes
12
+ import pytesseract
13
+ from PIL import Image
14
+
15
# Column names, in display order, of the extracted table.
SCHEMA = [
    "Piece",
    "SKU",
    "Title",
    "Capacity",
    "% Recycled",
    "Weight",
    "Color",
    "Material / Resin",
    "Class",
    "Source File",
]
16
+
17
+ # ------------------ low-level extractors ------------------
18
+
19
def extract_text_pages(pdf_bytes: bytes) -> List[str]:
    """Return the embedded text of a PDF, one string per page.

    pdfplumber is tried first. When it raises, yields no pages, or yields
    only blank pages, pypdf is tried instead. Extraction is best effort:
    any exception from either library is swallowed, and an empty list is
    returned when the pypdf attempt fails as well.
    """

    def _has_text(texts: List[str]) -> bool:
        # True as soon as one page holds something other than whitespace.
        return any((t or "").strip() for t in texts)

    # Attempt 1: pdfplumber.
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            page_texts = [page.extract_text() or "" for page in document.pages]
    except Exception:
        page_texts = []

    if _has_text(page_texts):
        return page_texts

    # Attempt 2: pypdf, used only when pdfplumber produced nothing usable.
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_texts = [(page.extract_text() or "") for page in reader.pages]
    except Exception:
        page_texts = []
    return page_texts
36
+
37
def run_ocr(pdf_bytes: bytes, lang: str, dpi: int, tesseract_cmd: str | None) -> List[str]:
    """OCR every page of a PDF and return the recognised text per page.

    Args:
        pdf_bytes: Raw PDF content.
        lang: Language string passed to Tesseract (e.g. ``"eng+ita"``).
        dpi: Resolution used when rasterising the pages.
        tesseract_cmd: Path of the Tesseract executable; when truthy it
            overrides pytesseract's module-level command for the process.

    Returns:
        One string per rasterised page ("" when OCR yields nothing).
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    tesseract_config = "--psm 6 -c preserve_interword_spaces=1"
    page_texts = []
    for page_image in convert_from_bytes(pdf_bytes, dpi=dpi):
        # NOTE(review): pdf2image presumably always yields PIL images, in
        # which case this branch never runs - confirm before relying on it.
        if not isinstance(page_image, Image.Image):
            page_image = page_image.convert("RGB")
        recognised = pytesseract.image_to_string(page_image, lang=lang, config=tesseract_config)
        page_texts.append(recognised or "")
    return page_texts
48
+
49
+ # ------------------ domain parsing ------------------
50
+
51
# Identifier following a "Name", "SKU" or "Part"/"Part No." label: at least
# five characters drawn from letters, digits and - _ / .
SKU_RE = re.compile(
    r"\b(?:Name|SKU|Part(?:\s*No\.?)?)\s*[:#]?\s*([A-Z0-9\-_/\.]{5,})",
    re.IGNORECASE,
)
# Rest of the line after "Title:" or "Title-".
TITLE_RE = re.compile(r"\bTitle\s*[:\-]\s*(.+)", re.IGNORECASE)
# Run of letters and spaces following the word "Class".
CLASS_RE = re.compile(r"\bClass\s*([A-Za-z ]+)", re.IGNORECASE)
54
+
55
def first(text: str, pattern: re.Pattern, group: int = 1) -> str:
    """Return the stripped ``group`` of the first ``pattern`` match in ``text``.

    A falsy ``text`` is treated as the empty string; "" is returned when the
    pattern does not match.
    """
    match = pattern.search(text or "")
    if match is None:
        return ""
    return match.group(group).strip()
58
+
59
def capacity_from(text: str) -> str:
    """Extract the first capacity in ``text`` as ``"<number> <unit>"``.

    Recognised units are L / Liter / ml in any letter case. A decimal comma
    becomes a dot and the unit is normalised to ``L`` or ``ml``. Returns ""
    when ``text`` is falsy or contains no capacity.
    """
    found = re.search(r"([0-9]+(?:[.,][0-9]+)?)\s*(L|Liter|ml|mL)\b", text or "", re.I)
    if found is None:
        return ""
    amount, raw_unit = found.groups()
    # The pattern only admits L, Liter and ml, so anything that is not
    # millilitres is litres.
    unit = "ml" if raw_unit.upper() == "ML" else "L"
    return "{} {}".format(amount.replace(",", "."), unit)
64
+
65
def color_from(text: str) -> str:
    """Extract a colour description from ``text``.

    Two strategies are tried in order:

    1. a value following a "Part Color" / "Color" label (case-insensitive);
    2. an upper-case phrase that contains a known colour word (GREEN,
       TRANSPARENT, WHITE, BLACK, BLUE, RED, CLEAR) preceded by at least
       four upper-case letters or spaces.

    Returns the stripped match, or "" when ``text`` is falsy or neither
    strategy finds anything.
    """
    # Tolerate None/empty input, as the other extractors do.
    text = text or ""

    labelled = re.search(r"(?:Part\s*Color|Color)\s*[:\-]?\s*([A-Z ]{3,})", text, re.I)
    if labelled:
        value = labelled.group(1).strip()
        # The capture may consist of spaces only; fall through to the
        # second strategy rather than reporting an empty colour.
        if value:
            return value

    phrase = re.search(r"\b([A-Z ]{4,}(?:GREEN|TRANSPARENT|WHITE|BLACK|BLUE|RED|CLEAR)[A-Z ]*)\b", text)
    return phrase.group(1).strip() if phrase else ""
71
+
72
def material_from(text: str) -> str:
    """Extract the material / resin description from ``text``.

    The first line containing the standalone word "RESIN" (any case) is
    returned stripped. Failing that, the shortest single-line span from
    "SERIOPLAST" up to "RESIN" is returned. Returns "" when ``text`` is
    falsy or nothing matches.
    """
    # Normalise once so the fallback search below is as None-safe as the
    # line scan.
    text = text or ""

    for line in text.splitlines():
        if re.search(r"\bRESIN\b", line, re.I):
            return line.strip()

    span = re.search(r"(SERIOPLAST.*?RESIN)", text, re.I)
    return span.group(1).strip() if span else ""
79
+
80
def piece_from(text: str, cls: str) -> str:
    """Determine the kind of piece (Bottle, Cap, Container, ...).

    Args:
        text: Full document text. A "Packaging Material Type ..." line is
            searched for one of Bottle / Cap / Container / Lid / Carton /
            Case, e.g. "Packaging Material Type Rigid- Bottle" -> "Bottle".
        cls: Value of the document's Class field, used as a fallback:
            "Bottles" -> "Bottle", "Caps"/"Closures" -> "Cap",
            "Corrugated" -> "Container".

    Returns:
        The capitalised piece name, or "" when nothing can be determined.
        Falsy ``text`` / ``cls`` are treated as empty strings.
    """
    text = text or ""

    type_line = re.search(r"Packaging\s*Material\s*Type\s*([^\n]+)", text, re.I)
    if type_line:
        kind = re.search(r"\b(Bottle|Cap|Container|Lid|Carton|Case)\b", type_line.group(1), re.I)
        if kind:
            return kind.group(1).capitalize()

    # The Class value is captured case-insensitively upstream, so compare it
    # case-insensitively here too ("BOTTLES" must map like "Bottles").
    cls_lower = (cls or "").lower()
    if "bottle" in cls_lower:
        return "Bottle"
    if "cap" in cls_lower or "closures" in cls_lower:
        return "Cap"
    if "corrugated" in cls_lower:
        return "Container"
    return ""
93
+
94
def parse_record(pages: List[str], source_name: str) -> Dict[str, str]:
    """Build one table row from the page texts of a single PDF.

    The pages are joined with newlines and each field is extracted from the
    combined text. Capacity is looked up in the title first and in the whole
    text second. "% Recycled" and "Weight" are not extracted and always hold
    the "–" placeholder. Fields that cannot be found are empty strings.

    Args:
        pages: Text of each page; an empty or missing list is treated as a
            single empty page.
        source_name: Stored unchanged under "Source File".
    """
    document = "\n".join(pages or [""])

    title_value = first(document, TITLE_RE)
    class_value = first(document, CLASS_RE)
    capacity_value = capacity_from(title_value) or capacity_from(document)

    return {
        "Piece": piece_from(document, class_value) or "",
        "SKU": first(document, SKU_RE) or "",
        "Title": title_value or "",
        "Capacity": capacity_value or "",
        "% Recycled": "–",
        "Weight": "–",
        "Color": color_from(document) or "",
        "Material / Resin": material_from(document) or "",
        "Class": class_value or "",
        "Source File": source_name,
    }
117
+
118
+ # ------------------ UI ------------------
119
+
120
st.set_page_config(page_title="PDF → Table (OCR)", layout="wide")
st.title("📄→📊 PDF → Table (OCR-ready)")
st.caption("Carica PDF (anche scansioni). Per ogni file compilo: Piece, SKU, Title, Capacity, % Recycled, Weight, Color, Material / Resin, Class, Source File.")

with st.sidebar:
    files = st.file_uploader("Seleziona PDF", type=["pdf"], accept_multiple_files=True)
    st.markdown("---")
    st.subheader("OCR")
    ocr_fallback = st.checkbox("Usa OCR se non c'è testo", value=True)
    ocr_lang = st.text_input("Lingue OCR (comma)", value="eng,ita")
    ocr_dpi = st.number_input("DPI OCR", 200, 600, 300, 50)
    tess_path = st.text_input("Percorso Tesseract (se non nel PATH)", value="")
    run_btn = st.button("▶️ Estrai")

# Session-state key under which the last extraction is kept. st.button is
# True only on the rerun caused by its own click, so without persistence the
# rerun triggered by a download button would discard the table.
_RESULT_KEY = "extraction_result"

if run_btn:
    if not files:
        st.warning("Nessun PDF caricato.")
        st.stop()

    # Tesseract expects languages joined with "+"; the UI takes them
    # comma-separated.
    lang = "+".join([p.strip() for p in ocr_lang.split(",") if p.strip()]) or "eng"
    tess_cmd = tess_path.strip() or None

    rows, errors = [], []
    for up in files:
        try:
            # getvalue() returns the whole upload regardless of the current
            # read position of the underlying buffer.
            raw = up.getvalue()
            pages = extract_text_pages(raw)
            if ocr_fallback and not any((p or "").strip() for p in pages):
                pages = run_ocr(raw, lang=lang, dpi=int(ocr_dpi), tesseract_cmd=tess_cmd)
            rec = parse_record(pages, up.name)
            # If a PDF ever holds several SKUs, this is where multiple
            # records (a list) could be produced instead of one.
            rows.append(rec)
        except Exception as e:
            # Keep going with the remaining files; failures are listed below.
            errors.append((up.name, str(e)))

    st.session_state[_RESULT_KEY] = (rows, errors)

if _RESULT_KEY not in st.session_state:
    st.info("Carica i PDF e premi **Estrai**.")
    st.stop()

rows, errors = st.session_state[_RESULT_KEY]

if errors:
    with st.expander("Errori"):
        for name, err in errors:
            st.error(f"{name}: {err}")

df = pd.DataFrame(rows, columns=SCHEMA)
st.success(f"Creat{ 'e' if len(df)!=1 else 'a' } {len(df)} riga/e.")
st.dataframe(df, use_container_width=True)

c1, c2 = st.columns(2)
with c1:
    st.download_button("⬇️ CSV", df.to_csv(index=False).encode("utf-8"), "table.csv", "text/csv")
with c2:
    # Build the workbook in memory; the writer must be closed (end of the
    # with-block) before the bytes are read.
    bio = io.BytesIO()
    with pd.ExcelWriter(bio, engine="openpyxl") as xw:
        df.to_excel(xw, index=False, sheet_name="data")
    st.download_button("⬇️ Excel", bio.getvalue(), "table.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
175
+