| |
| import os |
| import sys |
| import json |
| import math |
| import queue |
| import shutil |
| import logging |
| import tempfile |
| import threading |
| import subprocess |
| import multiprocessing |
| from pathlib import Path |
| from multiprocessing import Pool |
|
|
| |
| import fitz |
| import tkinter as tk |
| from tkinter import filedialog, messagebox |
| from joblib import cpu_count, Parallel, delayed |
|
|
| |
# PDFs with at most this many pages are processed page by page in one process;
# longer ones are handed to the page-parallel runner.
PARALLEL_THRESHOLD = 16
# Default tolerance used when snapping detected line positions into clusters.
LINE_TOLERANCE = 1
# Grid rectangles with a smaller area than this are discarded by table detection.
MIN_RECT_AREA = 10_000.0
|
|
| |
def cluster_list(xs, tol):
    """Split values into clusters of neighbours that lie at most `tol` apart.

    The values are sorted first.  A value joins the current cluster when it
    is no more than `tol` above that cluster's last (largest) member;
    otherwise it opens a new cluster.

    Returns:
        A list of clusters (each a list of values) in ascending order.
    """
    clusters = []
    for value in sorted(xs):
        # Compare against the most recent member, so a chain of small steps
        # stays in one cluster even if its ends are further than `tol` apart.
        if clusters and value - clusters[-1][-1] <= tol:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
|
|
def make_cluster_dict(vals, tol):
    """Return a dict mapping every distinct value to its cluster index.

    Distinct values are clustered with `cluster_list` using `tol`; clusters
    are numbered 0, 1, 2, ... in ascending order of their values.
    """
    distinct = sorted(set(vals))
    return {
        member: cluster_id
        for cluster_id, members in enumerate(cluster_list(distinct, tol))
        for member in members
    }
|
|
| |
def clean_cell_text(text):
    """Normalise the text of one cell.

    Non-string input yields "".  For strings, a hyphen directly followed by a
    newline is removed (re-joining words split across lines) and every run of
    whitespace, newlines included, collapses to a single space with no
    leading or trailing blanks.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        # split() with no argument already treats "\n" as whitespace, so the
        # remaining newlines turn into single spaces here.
        return " ".join(dehyphenated.split())
    return ""
|
|
def safe_join(row):
    """Return the cells of `row` as cleaned strings.

    `None` cells become ""; every other cell is converted with `str()` and
    passed through `clean_cell_text`.
    """
    cleaned = []
    for cell in row:
        cleaned.append("" if cell is None else clean_cell_text(str(cell)))
    return cleaned
|
|
def clamp_bbox(bbox, page_rect):
    """Clip a bounding box to the extents of `page_rect`.

    Args:
        bbox: Four coordinates (x0, y0, x1, y1).
        page_rect: Object exposing `x0`, `y0`, `x1` and `y1` attributes.

    Returns:
        A 4-tuple (x0, y0, x1, y1) with every coordinate limited to the
        matching axis range of `page_rect`.
    """
    left, top, right, bottom = bbox

    def _clip(value, low, high):
        # Same expression as max(low, min(value, high)) applied per coordinate.
        return max(low, min(value, high))

    return (
        _clip(left, page_rect.x0, page_rect.x1),
        _clip(top, page_rect.y0, page_rect.y1),
        _clip(right, page_rect.x0, page_rect.x1),
        _clip(bottom, page_rect.y0, page_rect.y1),
    )
|
|
| |
def detect_table_bboxes(page: fitz.Page, tol=LINE_TOLERANCE):
    """Detect grid rectangles formed by thin horizontal and vertical rules.

    Steps:
      1. Collect drawings whose bounding box is less than 2 units tall
         (horizontal rule) or, failing that, less than 2 units wide
         (vertical rule).
      2. Snap the rule positions into clusters using `tol`.
      3. Build a grid from the mean position of every row/column cluster.
      4. Keep each grid rectangle whose area is at least MIN_RECT_AREA and
         that neither contains nor is contained in a rectangle already kept.

    Args:
        page: Page whose vector drawings are inspected.
        tol: Maximum gap between neighbouring positions of one cluster.

    Returns:
        A list of fitz.Rect; empty when the page has no horizontal or no
        vertical rules.
    """
    # Page.get_drawings() reports the path type as a string ("s" stroke,
    # "f" fill, "fs" both) and the covered area under the key "rect".  The
    # previous test `d["type"] != 1` never matched such a string, so every
    # drawing was skipped and no table could be found.  The value 1 and the
    # key "bbox" stay accepted as fallbacks for backward compatibility.
    accepted_types = (1, "s", "f", "fs")

    horiz_y, vert_x = [], []
    for drawing in page.get_drawings():
        if drawing.get("type") not in accepted_types:
            continue
        bbox = drawing.get("rect")
        if bbox is None:
            bbox = drawing.get("bbox")
        if bbox is None:
            continue
        x0, y0, x1, y1 = bbox
        if abs(y1 - y0) < 2:
            horiz_y.append((y0 + y1) / 2)
        elif abs(x1 - x0) < 2:
            vert_x.append((x0 + x1) / 2)

    if not horiz_y or not vert_x:
        return []

    def _snap(positions):
        """Return the sorted mean position of each tolerance cluster."""
        cluster_of = make_cluster_dict(positions, tol)
        members = {}
        for pos in positions:
            members.setdefault(cluster_of[pos], []).append(pos)
        return sorted(sum(group) / len(group) for group in members.values())

    row_pos = _snap(horiz_y)
    col_pos = _snap(vert_x)

    # Every pair of neighbouring rows x neighbouring columns is a candidate.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            rect = fitz.Rect(c0, r0, c1, r1)
            if rect.get_area() >= MIN_RECT_AREA:
                rects.append(rect)

    # Drop rectangles nested in (or enclosing) one that was already accepted.
    unique = []
    for rect in rects:
        if not any(u.contains(rect) or rect.contains(u) for u in unique):
            unique.append(rect)

    return unique
|
|
| |
def extract_table(page: fitz.Page, table_rect: fitz.Rect):
    """Turn the words inside `table_rect` into a list of row dicts.

    A word belongs to the table when its first two coordinates (w[0], w[1])
    fall inside `table_rect`.  Words are ordered by (w[1], w[0]) and grouped
    into text lines: a new line starts whenever a word's w[1] differs by more
    than 5 from the w[1] of the word that opened the current line.  The first
    line is used as header, every following line as one row.

    Returns:
        A list with one dict per row, or [] when no word lies in the
        rectangle.

    NOTE(review): header and row are each passed to `safe_join` as a
    one-element list, so every dict has exactly one key (the whole header
    line) mapped to the whole row line; no per-column split happens here.
    """
    inside = sorted(
        (
            w for w in page.get_text("words")
            if table_rect.x0 <= w[0] <= table_rect.x1
            and table_rect.y0 <= w[1] <= table_rect.y1
        ),
        key=lambda w: (w[1], w[0]),
    )

    lines = []
    anchor_y = None
    for word in inside:
        if anchor_y is None or abs(word[1] - anchor_y) > 5:
            # Open a new line and remember its reference y.
            lines.append([word])
            anchor_y = word[1]
        else:
            lines[-1].append(word)

    if not lines:
        return []

    header_text, *body_texts = (" ".join(w[4] for w in ln) for ln in lines)
    headers = safe_join([header_text])
    return [dict(zip(headers, safe_join([body]))) for body in body_texts]
|
|
| |
def process_page(args):
    """Extract the text and detected tables of one PDF page.

    Args:
        args: A `(page_number, pdf_path)` tuple; `page_number` is 0-based.

    Returns:
        `(page_number, text)`.  `text` starts with a "Page N" line, followed
        by the words lying outside all detected table rectangles (grouped
        into lines) and then one JSON dump per non-empty table.  On failure
        `text` is an "[ERROR] ..." message instead; no exception is raised.
    """
    page_number, pdf_path = args
    try:
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_number)
            page_rect = page.rect

            table_rects = detect_table_bboxes(page)
            table_jsons = []
            for rect in table_rects:
                table = extract_table(page, rect)
                if table:
                    table_jsons.append(json.dumps(table, indent=1, ensure_ascii=False))

            tbl_boxes = [clamp_bbox(rect, page_rect) for rect in table_rects]

            def _in_any_table(word):
                # Same test as table extraction: the word's first two coordinates.
                for box in tbl_boxes:
                    if box[0] <= word[0] <= box[2] and box[1] <= word[1] <= box[3]:
                        return True
                return False

            outside = [w for w in page.get_text("words") if not _in_any_table(w)]
            outside.sort(key=lambda w: (w[1], w[0]))

            # A new text line starts when w[1] moves more than 10 away from
            # the w[1] of the word that opened the current line.
            text_lines = []
            anchor_y = None
            for word in outside:
                if anchor_y is None or abs(word[1] - anchor_y) > 10:
                    text_lines.append([word[4]])
                    anchor_y = word[1]
                else:
                    text_lines[-1].append(word[4])
            body = "\n".join(" ".join(tokens) for tokens in text_lines).strip()

            parts = [f"Page {page_number + 1}\n", body + "\n"]
            parts.extend(
                f'"table {idx}":\n{tbl}\n' for idx, tbl in enumerate(table_jsons, 1)
            )
            return page_number, "".join(parts)

    except fitz.FileDataError as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): encrypted / unreadable β {e}"
    except Exception as e:
        return page_number, f"[ERROR] Page {page_number+1} ({pdf_path}): {e}"
|
|
| |
def process_pdf(pdf_path):
    """Process every page of one PDF and write the result next to it.

    The output is written to `<pdf directory>/<pdf base name>.txt` in UTF-8.
    PDFs with at most PARALLEL_THRESHOLD pages go through `run_serial`,
    longer ones through `run_parallel`.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A status string in every case: "[INFO] Processing complete: ..." on
        success, an "[ERROR] ..." message on failure, or an "[INFO] ..."
        message when interrupted.  Previously the success path returned
        None, which callers printed as the literal text "None".
    """
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        # NOTE(review): the "β" in the two messages below looks like a
        # mis-encoded dash; left unchanged here, confirm the intended character.
        try:
            with fitz.open(pdf_path) as doc:
                num_pages = doc.page_count
        except fitz.FileDataError as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} β {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} β {e}"

        pages = [(i, pdf_path) for i in range(num_pages)]
        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)

        # Restore page order before concatenating the per-page texts.
        results.sort(key=lambda item: item[0])
        final_output = "\n".join(text for _, text in results)

        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_path = os.path.join(os.path.dirname(pdf_path), f"{base}.txt")
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)
        # Returned (not printed) so the caller reports it exactly once.
        return f"[INFO] Processing complete: {out_path}"
    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
|
|
def run_serial(pages):
    """Run `process_page` on every entry of `pages` in order, in this process.

    Returns:
        The list of `process_page` results, in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
|
|
def run_parallel(pages):
    """Run `process_page` over `pages` in a multiprocessing pool.

    The pool size is `cpu_count() - 2`, but at least 1 and at most
    `len(pages)`.

    Args:
        pages: Sized sequence of `process_page` argument tuples.

    Returns:
        The list of `process_page` results in input order; [] for empty
        input.
    """
    if not pages:
        # With no pages the pool size would be 0, which Pool() rejects.
        return []
    cores = min(max(1, cpu_count() - 2), len(pages))
    print(f"Starting parallel processing with {cores} coresβ¦")
    with Pool(cores) as pool:
        return pool.map(process_page, pages)
|
|
| |
def process_pdfs_main():
    """Command-line entry point: process every PDF named in `sys.argv[1:]`.

    Files are split by page count.  PDFs with at most PARALLEL_THRESHOLD
    pages ("small") are processed concurrently, one PDF per joblib job.  The
    remaining ("large") PDFs are processed one after another through
    `process_pdf`.  Missing or unopenable files are reported and skipped.

    Status strings returned by `process_pdf` are printed; a `None` result is
    skipped rather than printed as the text "None".
    """
    pdfs = sys.argv[1:]
    if not pdfs:
        print("No PDF files provided.")
        return

    small, large = [], []
    for path in pdfs:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with fitz.open(path) as doc:
                bucket = small if doc.page_count <= PARALLEL_THRESHOLD else large
                bucket.append(path)
        except fitz.FileDataError:
            print(f"[ERROR] Password-protected PDF skipped: {path}")
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {e}")

    if small:
        cores = min(max(1, cpu_count() - 2), len(small))
        print(f"\n[Phase 1] Parallel processing of {len(small)} small PDFs with {cores} cores β¦")
        for status in Parallel(n_jobs=cores)(delayed(process_pdf)(p) for p in small):
            if status is not None:
                print(status)

    for path in large:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        status = process_pdf(path)
        if status is not None:
            print(status)
|
|
| |
class FileManager:
    """Tk front end for choosing PDFs, running the parser and viewing results.

    The window holds a list of selected PDF paths, a text pane showing the
    `.txt` file that belongs to the selected PDF, and a read-only progress
    pane fed by the parser subprocess.  The parser is this same script,
    started with the selected files as arguments.
    """

    def __init__(self, master):
        """Build all widgets inside `master` (the Tk root or a toplevel)."""
        self.master = master
        master.title("Parser-Sevenof9 β PyMuPDF")

        # self.files mirrors the listbox content, index for index.
        self.files, self.last_selected = [], None
        tk.Label(master, text="Selected PDF files:").pack(pady=5)

        list_frame = tk.Frame(master)
        list_frame.pack(pady=5)
        sb_list = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(
            list_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=sb_list.set,
        )
        sb_list.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_list.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.bind("<<ListboxSelect>>", self.show_text)
        self.listbox.bind("<Button-1>", self.on_click)
        self.listbox.bind("<Shift-Button-1>", self.on_shift_click)

        # Context menu, shown on right click only when something is selected.
        self.ctx = tk.Menu(master, tearoff=0)
        self.ctx.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind(
            "<Button-3>",
            lambda e: self.ctx.tk_popup(e.x_root, e.y_root) if self.listbox.curselection() else None,
        )

        btn_frame = tk.Frame(master)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        tx_frame = tk.Frame(master)
        tx_frame.pack(padx=10, pady=5)
        sb_text = tk.Scrollbar(tx_frame)
        self.text = tk.Text(tx_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=sb_text.set)
        sb_text.config(command=self.text.yview)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()
        prog_frame = tk.Frame(master)
        prog_frame.pack(padx=10, pady=5)
        sb_prog = tk.Scrollbar(prog_frame)
        self.prog = tk.Text(prog_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=sb_prog.set)
        sb_prog.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb_prog.pack(side=tk.RIGHT, fill=tk.Y)

        self.parser_proc = None       # running subprocess.Popen, or None
        self._parser_thread = None    # worker thread that drives the subprocess
        self._stop_requested = False  # set when the user pressed "Stop"

    # ---- list selection -------------------------------------------------

    def on_click(self, e):
        """Select only the clicked entry and show its text file."""
        idx = self.listbox.nearest(e.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(idx)
        self.last_selected = idx
        self.show_text(None)
        return "break"  # suppress the default MULTIPLE toggle behaviour

    def on_shift_click(self, e):
        """Select the range between the last plain click and this entry."""
        idx = self.listbox.nearest(e.y)
        if self.last_selected is None:
            self.last_selected = idx
        lo, hi = sorted((self.last_selected, idx))
        self.listbox.selection_clear(0, tk.END)
        for i in range(lo, hi + 1):
            self.listbox.selection_set(i)
        return "break"

    # ---- file list management -------------------------------------------

    def add_folder(self):
        """Ask for a folder and add every `.pdf` below it (recursively)."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, names in os.walk(folder):
            for name in names:
                if name.lower().endswith(".pdf"):
                    path = os.path.join(root, name)
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.insert(tk.END, path)

    def add_file(self):
        """Ask for PDF files and add those not yet in the list."""
        for path in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]):
            if path not in self.files:
                self.files.append(path)
                self.listbox.insert(tk.END, path)

    def remove_file(self):
        """Remove the selected entries from the listbox and from `files`."""
        sel = self.listbox.curselection()
        if not sel:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]
        self.text.delete("1.0", tk.END)

    def remove_all(self):
        """Clear the list, the path store and the text pane."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text.delete("1.0", tk.END)

    # ---- parser control -------------------------------------------------

    def _parser_running(self):
        """Return True while a parser worker thread or subprocess is alive."""
        worker = getattr(self, "_parser_thread", None)
        if worker is not None and worker.is_alive():
            return True
        proc = self.parser_proc
        return proc is not None and proc.poll() is None

    def start_parser(self):
        """Start the parser subprocess in a worker thread.

        Does nothing but inform the user when no file is selected or when a
        parser run is still active (a second run would overwrite
        `parser_proc` and interleave its output with the first).
        """
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        if self._parser_running():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.prog.config(state=tk.NORMAL)
        self.prog.delete("1.0", tk.END)
        self.prog.insert(tk.END, "Starting parserβ¦\n")
        self.prog.config(state=tk.DISABLED)
        self._stop_requested = False
        # Hand the worker a snapshot so later list edits cannot affect the run.
        self._parser_thread = threading.Thread(target=self.run_parser, args=(list(self.files),))
        self._parser_thread.start()

    def stop_parser(self):
        """Terminate the parser subprocess if one is running."""
        # Local reference: the worker resets self.parser_proc to None when it
        # finishes, which could otherwise happen between poll() and terminate().
        proc = self.parser_proc
        if proc is not None and proc.poll() is None:
            self._stop_requested = True
            proc.terminate()
            self.append_prog("Parser process was stopped.\n")
        else:
            self.append_prog("No active parser process to stop.\n")

    def run_parser(self, files=None):
        """Run the parser subprocess and stream its output to the progress pane.

        Args:
            files: PDF paths to pass to the subprocess; defaults to a copy of
                `self.files`.

        Intended to run in a worker thread.  Reports success or failure via
        the progress pane and a message box; after a user-requested stop no
        success/failure message is shown.
        """
        file_args = list(self.files) if files is None else list(files)
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__] + file_args,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                text=True, encoding="utf-8", errors="ignore", bufsize=4096,
            )
            self.parser_proc = proc
            for line in proc.stdout:
                self.append_prog(line)
            proc.stdout.close()
            proc.wait()
            if self._stop_requested:
                # stop_parser() already told the user; a terminated process
                # has a non-zero return code but this is not an error.
                pass
            elif proc.returncode == 0:
                self.append_prog("\nParser finished successfully.\n")
                self.shell_msg("Parser Done", "The parser was executed successfully.")
            else:
                self.append_prog("\nError while running the parser.\n")
                self.shell_msg("Error", "Error while running the parser.")
        except Exception as e:
            self.append_prog(f"Error: {e}\n")
            self.shell_msg("Error", f"Execution error:\n{e}")
        finally:
            self.parser_proc = None

    # ---- UI helpers -----------------------------------------------------

    def append_prog(self, txt):
        """Schedule `txt` to be appended to the progress pane via `after`."""
        # NOTE(review): this is also called from the worker thread; confirm
        # that scheduling with after() from a non-main thread is safe on the
        # targeted Tk builds.
        self.prog.after(0, lambda: self._ins(txt))

    def _ins(self, txt):
        """Append `txt` to the read-only progress pane and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def shell_msg(self, title, msg):
        """Schedule an info message box with `title` and `msg` via `after`."""
        self.master.after(0, lambda: messagebox.showinfo(title, msg))

    def show_text(self, _):
        """Show the `.txt` file belonging to the first selected PDF."""
        sel = self.listbox.curselection()
        if not sel:
            return
        path = self.files[sel[0]]
        txt = os.path.splitext(path)[0] + ".txt"
        self.text.delete("1.0", tk.END)
        if os.path.exists(txt):
            try:
                with open(txt, "r", encoding="utf-8", errors="ignore") as f:
                    self.text.insert(tk.END, f.read())
            except Exception as e:
                self.text.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text.insert(tk.END, "[No corresponding .txt file found]")
|
|
| |
# Script entry point: with command-line arguments act as the headless parser
# (this is how the GUI launches its worker subprocess); without arguments
# open the Tk file manager.
if __name__ == "__main__":
    multiprocessing.freeze_support()
    if sys.argv[1:]:
        process_pdfs_main()
    else:
        root = tk.Tk()
        FileManager(root)
        root.mainloop()
|
|