AnthoneoJ committed on
Commit
03e0267
Β·
verified Β·
1 Parent(s): eb8acc1

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +405 -37
src/streamlit_app.py CHANGED
@@ -1,40 +1,408 @@
1
- import altair as alt
2
- import numpy as np
 
 
3
  import pandas as pd
4
  import streamlit as st
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ import re
5
  import pandas as pd
6
  import streamlit as st
7
+ import camelot
8
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
9
+ from rapidfuzz import fuzz
10
 
11
# Shown in the sidebar so users can tell which build they are running.
APP_VERSION = "v0.0.1 (2025-09-03)" # <- update this when you ship

# Page-wide configuration must be the first st.* call in the script.
# NOTE: the arrows were mojibake ("β†’") — restored to the intended "→".
st.set_page_config(page_title="PDF → Tables Cleaner", layout="wide")
st.title("PDF Table Merge & Cleanup (Camelot → Ag-Grid)")
16
+ # ---------------- Helpers ----------------
17
def parse_excel_to_all_dfs(
    file_bytes: bytes,
    sheet: str | int | None,
    first_row_is_header: bool,
    skiprows: int = 0,
    skipcols: int = 0,
    last_row: int = 0,  # 1-based, 0 = till end
    last_col: int = 0,  # 1-based, 0 = till end
):
    """
    Return (list[pd.DataFrame], sheet_names) from an Excel file, cropped to a rectangle.

    Cropping logic:
      - Drop the first `skiprows` rows and first `skipcols` columns
      - If last_row > 0, keep rows up to `last_row` (1-based) counted from the sheet start
      - If last_col > 0, keep cols up to `last_col` (1-based) counted from the sheet start

    If `sheet` is None, every sheet in the workbook is parsed; otherwise only
    the given sheet (name or index) is.
    """
    header = 0 if first_row_is_header else None
    all_dfs = []

    # Parse the workbook ONCE and reuse the handle for each sheet read.
    # The previous implementation re-read the entire byte blob per sheet.
    xls = pd.ExcelFile(io.BytesIO(file_bytes))
    sheet_names = xls.sheet_names
    targets = sheet_names if sheet is None else [sheet]

    for s in targets:
        # Read after skipping top rows; dtype=str keeps cell values verbatim.
        df = pd.read_excel(xls, sheet_name=s, header=header, dtype=str, skiprows=skiprows)

        # Apply last_row (1-based from the sheet start, so subtract skiprows)
        if last_row and last_row > 0:
            nrows_after_skip = max(last_row - skiprows, 0)
            df = df.iloc[:nrows_after_skip, :]

        # Cut left columns, then apply last_col relative to the sheet start
        df = df.iloc[:, skipcols:]
        if last_col and last_col > 0:
            ncols_after_skip = max(last_col - skipcols, 0)
            df = df.iloc[:, :ncols_after_skip]

        df.columns = [str(c) for c in df.columns]
        all_dfs.append(df)

    return all_dfs, sheet_names
61
+
62
def parse_csv_to_all_dfs(
    file_bytes: bytes,
    first_row_is_header: bool,
    sep: str = ",",
    skiprows: int = 0,
    skipcols: int = 0,
    last_row: int = 0,  # 1-based, 0 = till end
    last_col: int = 0,  # 1-based, 0 = till end
):
    """
    Parse CSV bytes into a single cropped table, returned as a one-element list
    (same shape as the Excel/PDF parsers so callers can treat them uniformly).
    """
    table = pd.read_csv(
        io.BytesIO(file_bytes),
        header=0 if first_row_is_header else None,
        sep=sep,
        dtype=str,
        encoding="utf-8-sig",
        skiprows=skiprows,
    )

    # Row window: `last_row` is 1-based, counted before `skiprows` were removed.
    if last_row > 0:
        table = table.iloc[: max(last_row - skiprows, 0), :]

    # Column window, applied after trimming the left edge.
    table = table.iloc[:, skipcols:]
    if last_col > 0:
        table = table.iloc[:, : max(last_col - skipcols, 0)]

    table.columns = [str(c) for c in table.columns]
    return [table]
92
+
93
def prepend_header_as_row(df: pd.DataFrame) -> pd.DataFrame:
    """Insert the current column names as the first data row of the frame."""
    names = [str(c) for c in df.columns]
    first = pd.DataFrame([names], columns=names)
    body = df.reset_index(drop=True)
    return pd.concat([first, body], ignore_index=True)
97
+
98
def normalize_and_concat(dfs, fill_value=""):
    """Prepend each table's header as a data row, pad/trim every table to the
    widest one, rename columns to col_1..col_N, then stack them vertically.

    Appends a stable integer `_rid` column used later for row deletion.
    """
    if not dfs:
        return pd.DataFrame()

    with_headers = [prepend_header_as_row(d) for d in dfs]
    width = max(d.shape[1] for d in with_headers)

    aligned = []
    for d in with_headers:
        part = d.copy()
        part.columns = [str(c) for c in part.columns]
        n = part.shape[1]
        if n < width:
            # Pad narrow tables on the right with the fill value.
            for extra in range(n, width):
                part[f"__pad_{extra + 1}"] = fill_value
        elif n > width:
            part = part.iloc[:, :width]
        part.columns = [f"col_{j + 1}" for j in range(width)]
        aligned.append(part.reset_index(drop=True))

    merged = pd.concat(aligned, ignore_index=True)
    # Stable row ids so the grid's delete checkboxes can target rows.
    merged["_rid"] = range(len(merged))
    return merged
123
+
124
def apply_header_row(df: pd.DataFrame, header_idx: int, ensure_unique: bool = False):
    """
    Promote the row at `header_idx` to be the header *as-is* (no lowercasing,
    no regex). Returns (df_with_header, header_vals) where header_vals are the
    raw values from that row.

    With ensure_unique=True duplicate names are suffixed _2, _3, ...; otherwise
    duplicate column names are kept verbatim (pandas tolerates them, but be
    careful downstream).
    """
    # Keep the internal row-id column out of the header logic.
    keeps_rid = "_rid" in df.columns
    data_cols = [c for c in df.columns if c != "_rid"]

    # Raw header values, exactly as the user sees them in the grid.
    header_vals = df.loc[header_idx, data_cols].tolist()

    if ensure_unique:
        # Suffix repeats without altering first occurrences.
        counts = {}
        names = []
        for raw in header_vals:
            label = "" if raw is None else str(raw)
            if label in counts:
                counts[label] += 1
                names.append(f"{label}_{counts[label]}")
            else:
                counts[label] = 1
                names.append(label)
    else:
        names = header_vals

    # Remove the promoted row; no cell values are transformed.
    body = df.drop(index=header_idx).reset_index(drop=True)

    # Body columns first, then _rid (if present) last.
    order = data_cols + (["_rid"] if keeps_rid else [])
    body = body[order]
    body.columns = names + (["_rid"] if keeps_rid else [])

    return body, header_vals  # header_vals are the raw originals
166
+
167
def is_header_like(row_vals, header_vals, min_ratio=90):
    """Return True when the mean fuzzy similarity between a row's cells and
    the header cells reaches `min_ratio` (0-100)."""
    scores = []
    for cell, head in zip(row_vals, header_vals):
        cell = str(cell or "").strip()
        head = str(head or "").strip()
        # A pair of empty cells counts as a perfect match.
        score = 100 if not (cell or head) else fuzz.token_set_ratio(cell, head)
        scores.append(score)
    average = sum(scores) / max(len(scores), 1)
    return average >= min_ratio
173
+
174
def drop_header_like_rows(df: pd.DataFrame, header_vals, min_ratio=90):
    """Drop rows that fuzzily match the header (e.g. per-page repeated headers
    from a multi-page PDF), then rebuild `_rid` so the ids stay dense.

    `min_ratio` is the 0-100 similarity threshold passed to is_header_like.
    """
    body_cols = [c for c in df.columns if c != "_rid"]
    # Boolean keep-mask built in one pass instead of the append-True/False loop.
    keep = [
        not is_header_like([row[c] for c in body_cols], header_vals, min_ratio)
        for _, row in df.iterrows()
    ]
    out = df.loc[keep].reset_index(drop=True)
    out["_rid"] = range(len(out))
    return out
185
+
186
def parse_pdf_to_all_dfs(pdf_bytes: bytes):
    """Extract every table from a PDF with Camelot; returns list[pd.DataFrame]."""
    # Camelot wants a path on disk, so spill the bytes into a temp file.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        pdf_path = tmp.name

    results = []

    try:
        # Lattice first: best for ruled tables. Fall back to stream, which
        # handles borderless tables, when lattice finds nothing.
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
        if len(tables) == 0:
            tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")

        for table in tables:
            frame = table.df
            # Promote the first extracted row to the header, then drop it.
            if frame.shape[0] > 0:
                frame.columns = frame.iloc[0]
                frame = frame.drop(0).reset_index(drop=True)
            # Standardize column names to strings
            frame.columns = [str(c) for c in frame.columns]
            results.append(frame)

    finally:
        # Best-effort cleanup of the temp file (delete=False above).
        try:
            os.remove(pdf_path)
        except Exception:
            pass

    return results
219
+
220
# ---------------- Sidebar: Upload & Parse ----------------
with st.sidebar:
    st.divider()
    st.caption(f"App {APP_VERSION}")

    st.header("1) Upload file")
    upl = st.file_uploader("Choose a PDF / Excel / CSV", type=["pdf", "xlsx", "xls", "csv"])

    # Infer the parser to use from the uploaded file's extension.
    filetype = None
    if upl is not None:
        name = upl.name.lower()
        if name.endswith(".pdf"):
            filetype = "pdf"
        elif name.endswith(".xlsx") or name.endswith(".xls"):
            filetype = "excel"
        elif name.endswith(".csv"):
            filetype = "csv"

    # Common options for tabular files (Excel/CSV)
    first_row_is_header = st.checkbox("First row contains headers", value=True, help="For Excel/CSV only")

    # Excel/CSV region cropping (all 0 = use the whole sheet)
    skiprows = st.number_input("Skip N rows (top)", min_value=0, value=0, step=1)
    skipcols = st.number_input("Skip N columns (left)", min_value=0, value=0, step=1)
    last_row = st.number_input("Last data row (1-based, 0 = until end)", min_value=0, value=0, step=1)
    last_col = st.number_input("Last data column (1-based, 0 = until end)", min_value=0, value=0, step=1)

    # Excel sheet selection UI (shown only when an Excel is uploaded)
    selected_sheet = None
    parse_all_sheets = False
    excel_sheet_names = []
    if filetype == "excel" and upl is not None:
        # Peek ONLY the sheet names. The previous code fully parsed every
        # sheet just to list them; pd.ExcelFile reads the workbook metadata.
        excel_sheet_names = pd.ExcelFile(io.BytesIO(upl.read())).sheet_names
        # rewind so the later "Parse file" read sees the whole stream
        upl.seek(0)

        if len(excel_sheet_names) > 1:
            mode = st.radio("Sheet mode", ["Select one sheet", "Parse all sheets"], index=0, horizontal=True)
            if mode == "Parse all sheets":
                parse_all_sheets = True
            else:
                selected_sheet = st.selectbox("Select sheet", excel_sheet_names, index=0)
        else:
            st.caption(f"Sheet: {excel_sheet_names[0]}")
            selected_sheet = excel_sheet_names[0]

    run_parse = st.button("Parse file")
268
+
269
# The merged table persists in session state across Streamlit reruns.
if "concat_df" not in st.session_state:
    st.session_state.concat_df = pd.DataFrame()

# Dispatch on file type when the sidebar "Parse file" button was pressed.
if run_parse:
    if not upl:
        st.warning("Please upload a file first.")
    else:
        file_bytes = upl.read()
        all_dfs = []

        if filetype == "pdf":
            # PDF path: Camelot extraction (lattice with stream fallback)
            all_dfs = parse_pdf_to_all_dfs(file_bytes)

        elif filetype == "excel":
            if parse_all_sheets:
                # sheet=None means: parse every sheet in the workbook
                all_dfs, _ = parse_excel_to_all_dfs(
                    file_bytes,
                    sheet=None,
                    first_row_is_header=first_row_is_header,
                    skiprows=skiprows,
                    skipcols=skipcols,
                    last_row=last_row,
                    last_col=last_col,
                )
            else:
                # If workbook has only one sheet, selected_sheet is set above
                all_dfs, _ = parse_excel_to_all_dfs(
                    file_bytes,
                    sheet=selected_sheet,
                    first_row_is_header=first_row_is_header,
                    skiprows=skiprows,
                    skipcols=skipcols,
                    last_row=last_row,
                    last_col=last_col,
                )

        elif filetype == "csv":
            all_dfs = parse_csv_to_all_dfs(
                file_bytes,
                first_row_is_header=first_row_is_header,
                sep=",",
                skiprows=skiprows,
                skipcols=skipcols,
                last_row=last_row,
                last_col=last_col,
            )

        else:
            st.error("Unsupported file type.")
            all_dfs = []

        if not all_dfs:
            st.error("No tables detected or file is empty.")
        else:
            st.success(f"Parsed {len(all_dfs)} table(s).")
            # Merge every extracted table into one grid-editable DataFrame
            # (headers become data rows; columns renamed col_1..col_N; _rid added)
            concat_df = normalize_and_concat(all_dfs)
            st.session_state.concat_df = concat_df
327
+
328
# ---------------- 2) Editable Grid ----------------
st.subheader("2) Edit merged rows (Ag-Grid)")
if st.session_state.concat_df.empty:
    st.info("Upload and parse a PDF to begin. The merged grid will appear here.")
else:
    # Work on a copy so we can add a delete flag without mutating the original yet
    df = st.session_state.concat_df.copy()

    # 1) Ensure a boolean "delete" column exists (users tick this to mark rows for removal)
    if "delete" not in df.columns:
        df["delete"] = False

    # 2) Build grid options: everything editable, internal row id hidden.
    #    NOTE: the emoji labels below were mojibake ("πŸ—‘", "πŸ”„") — restored.
    gb = GridOptionsBuilder.from_dataframe(df)
    gb.configure_default_column(editable=True, resizable=True)
    gb.configure_column("_rid", hide=True)
    gb.configure_column("delete", header_name="🗑 Delete?", editable=True)
    grid_options = gb.build()

    # 3) Render editable grid and capture edits
    grid_resp = AgGrid(
        df,
        gridOptions=grid_options,
        update_mode=GridUpdateMode.MODEL_CHANGED,  # edits flow back on change
        fit_columns_on_grid_load=True,
        height=420,
        enable_enterprise_modules=False,
    )

    edited_df = pd.DataFrame(grid_resp["data"])
    # Persist all edits (including delete ticks) to session state
    st.session_state.concat_df = edited_df

    # 4) Delete rows that are checked
    colA, colB = st.columns([1, 1])

    with colA:
        to_delete = edited_df.loc[edited_df.get("delete", False) == True, "_rid"].tolist()
        st.caption(f"Checked for deletion: {len(to_delete)} row(s)")
        if st.button("Delete checked rows", type="primary", disabled=(len(to_delete) == 0)):
            kept = edited_df[~edited_df["_rid"].isin(to_delete)].drop(columns=["delete"], errors="ignore").reset_index(drop=True)
            kept["_rid"] = range(len(kept))
            st.session_state.concat_df = kept
            st.success(f"Deleted {len(to_delete)} row(s).")

    with colB:
        # Refresh button — forces a rerun so the grid reflects the deletion
        if st.button("🔄 Refresh table"):
            try:
                st.rerun()  # Streamlit >= 1.30
            except Exception:
                st.experimental_rerun()  # fallback for older versions
380
+
381
+
382
# ---------------- 3) Pick Header + Clean ----------------
st.subheader("3) Pick header row & remove header-like duplicates")
if st.session_state.concat_df.empty:
    st.info("Header tools will show after parsing a PDF.")
else:
    df = st.session_state.concat_df
    header_idx = st.number_input("Header row index (0-based)", min_value=0, max_value=len(df)-1, value=0, step=1)
    # NOTE(review): everything below lives inside the button branch, so it only
    # renders on the rerun where the button was clicked — moving the slider
    # re-runs the script and collapses this section. Confirm intended UX.
    if st.button("Apply header"):
        df_with_header, header_vals = apply_header_row(df, int(header_idx))
        st.success("Header applied.")
        st.dataframe(df_with_header.head(15), use_container_width=True)

        st.write("Remove rows similar to header:")
        min_ratio = st.slider("Similarity threshold", 70, 100, 90, 1)
        cleaned = drop_header_like_rows(df_with_header, header_vals, min_ratio=min_ratio)
        st.caption(f"Rows after cleaning: {len(cleaned)}")
        st.dataframe(cleaned.head(60), use_container_width=True)

        st.download_button(
            "Download cleaned CSV",
            # Encode explicitly: DataFrame.to_csv ignores `encoding` when it
            # returns a string, so the UTF-8 BOM (for Excel) must be added here.
            data=cleaned.drop(columns=["_rid"]).to_csv(index=False).encode("utf-8-sig"),
            file_name="cleaned_tables.csv",
            mime="text/csv",
        )

st.caption("Tip: Camelot works best on digital PDFs. For scanned PDFs, consider OCR then table detection.")