kalle07 commited on
Commit
ccefb19
·
verified ·
1 Parent(s): b1488da

Upload 2 files

Browse files
PDF Parser - Sevenof9_v7d.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65bb12ae8222d93ebe828597374b6c07c606050b2c5073c12478ceff9e0a024f
3
+ size 42943471
PDF Parser - Sevenof9_v7d.py ADDED
@@ -0,0 +1,873 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import json
5
+ import wx
6
+ import re
7
+ import platform
8
+ import subprocess
9
+ import threading
10
+ import concurrent.futures
11
+ import multiprocessing
12
+ from concurrent.futures import ProcessPoolExecutor
13
+ import pdfplumber
14
+ import psutil
15
+ import logging
16
+ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
17
+ from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
18
+ from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
19
+ from pdfminer.pdfinterp import PDFResourceManager
20
+
21
+
22
# -------------------- Configuration --------------------
# PDFs with more pages than this are processed page-by-page in parallel;
# PDFs at or below it are handled as one unit of work each.
PARALLEL_THRESHOLD = 16

# Keyword arguments passed to the page's extract_words() call.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1.5,
    y_tolerance=2.5,
    keep_blank_chars=False,
    use_text_flow=False,
)
31
+
32
+
33
+
34
# GUI update interval
def throttle_callback(callback, interval_ms=1):
    """Return a wrapper that forwards to *callback* at most once per interval.

    Args:
        callback: One-argument callable that receives the status object.
        interval_ms: Minimum time in milliseconds between two forwarded calls.

    Returns:
        A one-argument function. The first call is always forwarded; calls
        that arrive sooner than *interval_ms* after the last forwarded call
        are dropped (they are not queued or replayed later).
    """
    # Monotonic timestamp (ms) of the last forwarded call; None until the
    # first call so that the first call can never be throttled.
    last_called = None

    def wrapper(status):
        nonlocal last_called
        # A monotonic clock keeps the throttle working even if the system
        # wall clock is adjusted backwards while the parser is running.
        now = time.monotonic() * 1000
        if last_called is None or now - last_called >= interval_ms:
            last_called = now
            callback(status)

    return wrapper
45
+
46
+
47
+
48
# Reduce pdfminer's verbosity by raising its loggers' thresholds.
def suppress_pdfminer_logging():
    """Set the level of the pdfminer loggers listed below to ERROR."""
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        # Anything below ERROR is no longer emitted by this logger.
        logging.getLogger(name).setLevel(logging.ERROR)
61
+
62
+
63
# Matches any single character above U+FFFF (outside the Basic Multilingual Plane).
EUROPEAN_PRINTABLES_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
# Matches literal "(cid:<digits>)" markers
# (presumably pdfminer's placeholder for unmapped glyphs - TODO confirm).
CID_PATTERN = re.compile(r"\(cid:\d+\)")


def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input (e.g. ``None``) yields an empty string. For strings,
    a hyphen followed by a line break is removed so the word is re-joined,
    remaining line breaks become spaces, ``(cid:N)`` markers are removed and
    characters above U+FFFF are stripped.
    """
    if isinstance(text, str):
        single_line = text.replace("-\n", "").replace("\n", " ")
        without_cid = CID_PATTERN.sub("", single_line)
        return EUROPEAN_PRINTABLES_PATTERN.sub("", without_cid)
    return ""
72
+
73
def clamp_bbox(bbox, page_width, page_height, p=3):
    """Clamp a bounding box to the page rectangle and round its coordinates.

    Args:
        bbox: ``(x0, top, x1, bottom)``.
        page_width: Upper limit for the two x coordinates (lower limit is 0).
        page_height: Upper limit for the two y coordinates (lower limit is 0).
        p: Number of decimal places passed to ``round``.

    Returns:
        A 4-tuple ``(x0, top, x1, bottom)`` of clamped, rounded values.
    """

    def _within(value, upper):
        # Restrict value to the closed range [0, upper].
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    clamped = (
        _within(left, page_width),
        _within(upper_edge, page_height),
        _within(right, page_width),
        _within(lower_edge, page_height),
    )
    return tuple(round(coord, p) for coord in clamped)
80
+
81
def get_physical_cores():
    """Return the physical CPU core count reported by psutil, at least 1.

    Falls back to 1 when psutil reports nothing usable (``None`` or 0).
    """
    detected = psutil.cpu_count(logical=False)
    if not detected:
        return 1
    return max(1, detected)


# Computed once when the module is imported.
cores = get_physical_cores()
85
+
86
+
87
def is_valid_cell(cell):
    """Return True if the cell holds more than one character after stripping.

    ``None`` is never valid; any other value is converted with ``str`` and
    stripped of surrounding whitespace before its length is checked.
    """
    return cell is not None and len(str(cell).strip()) > 1
93
+
94
+
95
def block_area(block):
    """Return the area of the smallest rectangle enclosing all words in *block*.

    Each word is a mapping with ``x0``, ``x1``, ``top`` and ``bottom`` keys.
    """
    span_x = max(word["x1"] for word in block) - min(word["x0"] for word in block)
    span_y = max(word["bottom"] for word in block) - min(word["top"] for word in block)
    return span_x * span_y
101
+
102
+
103
+ suppress_pdfminer_logging()
104
+
105
# -------------------- Status tracking --------------------
class StatusTracker:
    """Count processed pages and derive throughput and time estimates."""

    def __init__(self, total_pages):
        """Start the clock for a job of *total_pages* pages."""
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record that *n* more pages have been processed."""
        self.processed_pages += n

    def get_status(self):
        """Return a snapshot of the current progress.

        Returns:
            A dict with the keys ``processed_pages``, ``total_pages``,
            ``pages_per_sec`` (rounded to one decimal), ``elapsed_time``
            (minutes, one decimal) and ``est_time`` (estimated remaining
            minutes, one decimal; ``inf`` while no rate is known yet).
        """
        elapsed = time.time() - self.start_time
        # Keep the unrounded rate for the estimate: rounding it to a whole
        # number first turned every rate below 0.5 pages/s into 0 and the
        # estimate into infinity.
        rate = self.processed_pages / elapsed if elapsed > 0 else 0.0
        remaining_pages = max(0, self.total_pages - self.processed_pages)

        if remaining_pages == 0:
            est_time = 0.0
        elif rate > 0:
            est_time = (remaining_pages / rate) / 60
        else:
            est_time = float("inf")

        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": round(rate, 1),
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1),
        }
127
+
128
+
129
# -------------------- PDF processing --------------------
# Fractions of the page size cut away before extraction:
# 6 % on the left and right, 6 % at the top, 4 % at the bottom.
_MARGIN_X_FRACTION = 0.06
_TOP_MARGIN_FRACTION = 0.06
_BOTTOM_MARGIN_FRACTION = 0.04

# Largest horizontal / vertical gap at which two words still count as
# belonging to the same text block.
_MAX_DIST_X = 12
_MAX_DIST_Y = 10

# Grid to which block origins are snapped before sorting.
_SORT_TOLERANCE = 1


def _table_to_json(raw_table):
    """Convert one extracted table to a JSON string, or None if it is rejected.

    A table is rejected when it has fewer than two rows, when no row has at
    least two columns, when no cell passes ``is_valid_cell``, or when its
    header row is empty.
    """
    if not raw_table or len(raw_table) < 2:
        return None
    if all(len(row) < 2 for row in raw_table if row):
        return None
    if not any(is_valid_cell(cell) for row in raw_table for cell in (row or [])):
        return None

    cleaned = [[clean_cell_text(cell) for cell in (row or [])] for row in raw_table]
    header_row = cleaned[0]
    if not header_row:
        # Previously this raised IndexError and discarded the whole page.
        return None

    if header_row[0].strip() == "":
        # Empty top-left corner: the first column holds row headers, so the
        # table becomes {row header: {column header: cell}}.
        col_headers = header_row[1:]
        table_data = {}
        for row in cleaned[1:]:
            if not row:
                continue  # nothing to use as a row header
            table_data[row[0]] = dict(zip(col_headers, row[1:]))
    else:
        # Plain header row: a list of {header: cell} dicts. Rows whose length
        # differs from the header's are left out.
        table_data = [
            dict(zip(header_row, row))
            for row in cleaned[1:]
            if len(row) == len(header_row)
        ]

    return json.dumps(table_data, indent=1, ensure_ascii=False)


def _is_bold(fontname):
    """Guess from the font name whether the font is a bold face."""
    lowered = fontname.lower()
    return "bold" in lowered or "bd" in lowered or "black" in lowered


def _collect_word_info(cropped_page, table_bboxes):
    """Return one info dict per word that lies outside all table boxes.

    Each dict carries the word text, its rounded position, the largest font
    size among its characters, the first character's font name and a bold
    flag.
    """
    # Convert the character coordinates once per page rather than once per word.
    chars = [(float(c["x0"]), float(c["top"]), c) for c in cropped_page.chars]

    word_info = []
    for w in cropped_page.extract_words(**TEXT_EXTRACTION_SETTINGS):
        x0, top = float(w["x0"]), float(w["top"])
        # Words whose top-left corner is inside a table box are left to the
        # table output.
        # NOTE(review): the boxes come from find_tables(), so this also drops
        # words of tables that _table_to_json() later rejects.
        if any(
            bx0 <= x0 <= bx1 and btop <= top <= bbottom
            for bx0, btop, bx1, bbottom in table_bboxes
        ):
            continue
        if EUROPEAN_PRINTABLES_PATTERN.search(w["text"]):
            continue

        x1, bottom = float(w["x1"]), float(w["bottom"])
        word_chars = [
            c for cx0, ctop, c in chars
            if x0 <= cx0 <= x1 and top <= ctop <= bottom
        ]
        sizes = [float(c.get("size", 0)) for c in word_chars if c.get("text", "").strip()]
        fonts = [c.get("fontname", "") for c in word_chars]

        word_info.append({
            "text": w["text"],
            "top": round(top, 1),
            "bottom": round(bottom, 1),
            "font_size": max(sizes) if sizes else 0,
            "font_name": fonts[0] if fonts else "Unknown",
            "bold_flag": any(_is_bold(name) for name in fonts),
            "x0": round(x0, 1),
            "x1": round(x1, 1),
        })
    return word_info


def _are_words_close(w1, w2):
    """Return True if the gap between two word boxes is within both limits."""
    dx = max(0, max(w1["x0"], w2["x0"]) - min(w1["x1"], w2["x1"]))
    dy = max(0, max(w1["top"], w2["top"]) - min(w1["bottom"], w2["bottom"]))
    return dx <= _MAX_DIST_X and dy <= _MAX_DIST_Y


def _group_into_blocks(words):
    """Group words into blocks by transitively joining words that are close."""
    blocks = []
    unvisited = set(range(len(words)))
    while unvisited:
        seed = unvisited.pop()
        members = {seed}
        frontier = {seed}
        while frontier:
            current = frontier.pop()
            # Iterate over a copy because close neighbours are removed below.
            for other in list(unvisited):
                if _are_words_close(words[current], words[other]):
                    members.add(other)
                    frontier.add(other)
                    unvisited.remove(other)
        blocks.append([words[i] for i in members])
    return blocks


def _group_block_into_lines(block, line_tolerance=2.5):
    """Split a block into lines by ``top`` and sort each line left to right."""
    sorted_words = sorted(block, key=lambda w: w["top"])
    lines = []
    current_line = [sorted_words[0]]
    current_top = sorted_words[0]["top"]

    for word in sorted_words[1:]:
        if abs(word["top"] - current_top) <= line_tolerance:
            current_line.append(word)
        else:
            lines.append(sorted(current_line, key=lambda w: w["x0"]))
            current_line = [word]
            current_top = word["top"]
    if current_line:
        lines.append(sorted(current_line, key=lambda w: w["x0"]))
    return lines


def _block_sort_key(block):
    """Sort key: the block's snapped left edge first, then its snapped top."""
    min_x0 = min(w["x0"] for w in block)
    min_top = min(w["top"] for w in block)
    return (
        round(min_x0 / _SORT_TOLERANCE) * _SORT_TOLERANCE,
        round(min_top / _SORT_TOLERANCE) * _SORT_TOLERANCE,
    )


def _label_block(block, avg_fontsize):
    """Return "CHAPTER", "IMPORTANT" or None for a block of words."""
    chapter_hits = 0
    important_hits = 0
    for w in block:
        text = w["text"]
        # Rule 1: words of five characters or fewer and purely numeric words
        # are ignored for labelling.
        if len(text) <= 5 or text.isdigit():
            continue

        size_ratio = w["font_size"] / avg_fontsize if avg_fontsize else 0
        if size_ratio >= 1.15:
            # Rule 2 (takes precedence): clearly larger than the page average.
            chapter_hits += 1
        elif w["bold_flag"] and size_ratio >= 1:
            # Rule 3: bold and at least average size.
            important_hits += 1

    # Rule 4: decide by the number of hits.
    total_hits = chapter_hits + important_hits
    if total_hits > 1:
        return "IMPORTANT"
    if total_hits == 1:
        return "CHAPTER" if chapter_hits == 1 else "IMPORTANT"
    return None


def process_page_worker(args):
    """Extract the text blocks and tables of a single PDF page.

    Args:
        args: Tuple ``(page_number, path)`` with a zero-based page index and
            the path of the PDF file.

    Returns:
        ``(page_number, text)``. On any exception the text is an
        ``[ERROR] ...`` line instead of the page content, so one bad page
        does not abort the whole file.
    """
    suppress_pdfminer_logging()
    try:
        page_number, path = args
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            margin_x = width * _MARGIN_X_FRACTION
            top_margin = height * _TOP_MARGIN_FRACTION
            bottom_margin = height * _BOTTOM_MARGIN_FRACTION

            # crop((left, top, right, bottom))
            cropped_page = page.crop((
                margin_x,
                top_margin,
                width - margin_x,
                height - bottom_margin,
            ))

            table_bboxes = [
                clamp_bbox(t.bbox, width, height)
                for t in cropped_page.find_tables()
            ]

            tables_json = []
            for raw_table in cropped_page.extract_tables({"text_x_tolerance": 1.5}):
                encoded = _table_to_json(raw_table)
                if encoded is not None:
                    tables_json.append(encoded)

            word_info = _collect_word_info(cropped_page, table_bboxes)

        # From here on only plain Python data is used; the PDF is closed.
        font_sizes = [w["font_size"] for w in word_info]
        avg_fontsize = sum(font_sizes) / len(font_sizes) if font_sizes else 0

        sorted_blocks = sorted(_group_into_blocks(word_info), key=_block_sort_key)

        output_lines = []
        for block in sorted_blocks:
            block_label = _label_block(block, avg_fontsize)

            output_lines.append("")  # blank line before each block
            for line_idx, line in enumerate(_group_block_into_lines(block)):
                line_text = " ".join(w["text"] for w in line)
                if line_idx == 0 and block_label:
                    line_text = f"[{block_label}] {line_text}"
                output_lines.append(line_text)

        # Tables follow the text of the page.
        for idx, tbl in enumerate(tables_json, 1):
            output_lines.append(f'"table {idx}":\n{tbl}')

        return page_number, "\n".join(output_lines)

    except Exception as e:
        msg = str(e).strip() or f"{type(e).__name__} (no message)"
        return args[0], f"[ERROR] Seite {args[0]+1}: {msg}"
414
+
415
+
416
+
417
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF one after another in this process.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0 .. page_number-1).
        tracker: Optional object whose ``update()`` is called after each page.
        progress_callback: Optional callable given to ``report_status`` after
            each page; only used when a tracker is supplied.
        stop_flag: Optional event; when set, processing ends before the next
            page is started.

    Returns:
        The list of ``process_page_worker`` results gathered so far.
    """
    collected = []
    have_tracker = tracker is not None
    for page_index in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        collected.append(process_page_worker((page_index, path)))
        if have_tracker:
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)
    return collected
429
+
430
+
431
+
432
+
433
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF in a pool of worker processes.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0 .. page_number-1).
        tracker: Optional object whose ``update()`` is called per finished page.
        progress_callback: Optional callable given to ``report_status`` after
            each finished page; only used when a tracker is supplied.
        stop_flag: Optional event, checked in the calling thread only (it is
            not sent to the workers). When set, pages that have not started
            yet are cancelled and result collection ends.

    Returns:
        The results of the pages that were collected, in page order.
    """

    def stop_requested():
        return stop_flag is not None and stop_flag.is_set()

    # Nothing to do. This also avoids creating a pool with max_workers=0,
    # which ProcessPoolExecutor rejects with ValueError.
    if page_number <= 0 or stop_requested():
        return []

    results = [None] * page_number

    def record(result):
        if result is None:
            return
        page, _ = result
        results[page] = result
        if tracker is not None:
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)

    with concurrent.futures.ProcessPoolExecutor(
        max_workers=min(page_number, get_physical_cores())
    ) as executor:
        futures = [
            executor.submit(process_page_worker, (page_index, path))
            for page_index in range(page_number)
        ]
        for future in concurrent.futures.as_completed(futures):
            if stop_requested():
                # cancel() only affects futures that have not started; leaving
                # the with-block then waits for the pages already running.
                for pending in futures:
                    pending.cancel()
                break
            record(future.result())

    return [r for r in results if r]
456
+
457
+
458
def report_status(tracker, progress_callback=None):
    """Fetch the tracker's status and pass it on, or print it to the console.

    Args:
        tracker: Object with a ``get_status()`` method returning a dict with
            the keys ``processed_pages``, ``total_pages``, ``pages_per_sec``,
            ``elapsed_time`` and ``est_time``.
        progress_callback: Optional callable that receives the status dict.
            Without it, a one-line summary is printed instead.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
        return

    # StatusTracker in this file reports elapsed_time and est_time in minutes,
    # so they are labelled "min" (the message used to say "Sek." and ran the
    # two fields together).
    print(
        f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
        f"({status['pages_per_sec']} Seiten/s, "
        f"Elapsed: {status['elapsed_time']} min, "
        f"Est Time: {status['est_time']} min)"
    )
467
+
468
+
469
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Convert one PDF to text and write it next to the PDF as ``<name>.txt``.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages in the PDF.
        tracker: Optional progress tracker handed to the page runner.
        parallel: Use ``run_parallel`` if true, otherwise ``run_serial``.
        progress_callback: Optional status callback handed to the page runner.
        stop_flag: Optional event that requests cancellation.

    Returns:
        ``page_number`` when the text file was written, ``0`` when the job
        was stopped before it started or before all pages were processed.
    """

    def stop_requested():
        return stop_flag is not None and stop_flag.is_set()

    if stop_requested():
        return 0

    runner = run_parallel if parallel else run_serial
    results = runner(path, page_number, tracker, progress_callback, stop_flag)
    results = [r for r in results if r]  # drop missing pages

    # A stop in the middle of a file leaves an incomplete page list. Writing
    # it would overwrite an earlier complete .txt with truncated text and
    # report the file as done, so report "stopped" instead.
    if stop_requested() and len(results) < page_number:
        return 0

    results.sort(key=lambda item: item[0])
    text_output = "\n".join(text for _, text in results)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(text_output)

    return page_number
488
+
489
+
490
+
491
def _process_single_pdf(path):
    """Open one PDF with pdfminer and count its pages.

    Returns:
        ``(path, page_count, None)`` on success, or ``(path, 0, message)``
        where *message* is an ``[ERROR] ...`` line describing why the file
        could not be read.
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as handle:
            document = PDFDocument(PDFParser(handle))

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")

            # The pages are enumerated while the file is still open.
            page_count = len(list(PDFPage.create_pages(document)))
            return (path, page_count, None)

    except (PDFEncryptionError, PDFPasswordIncorrect) as exc:
        failure = f"[ERROR] Datei passwortgeschützt: {path} ({type(exc).__name__}: {exc})\n"
    except PDFSyntaxError as exc:
        failure = f"[ERROR] Ungültige PDF-Syntax: {path} ({type(exc).__name__}: {exc})\n"
    except PDFTextExtractionNotAllowed as exc:
        failure = f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(exc).__name__}: {exc})\n"
    except Exception as exc:
        failure = f"[ERROR] Fehler bei Datei {path}: {type(exc).__name__}: {exc}\n"
    return (path, 0, failure)
512
+
513
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Count the pages of every PDF in *pdf_files*.

    Args:
        pdf_files: Sequence of PDF paths.
        error_callback: Optional callable receiving the error line of a file
            that could not be read; without it the line is printed.
        progress_callback: Optional callable receiving the running page total
            after each readable file.

    Returns:
        ``(page_info, total)`` where *page_info* is a list of
        ``(path, page_count)`` for the readable files and *total* is the sum
        of their page counts.
    """
    suppress_pdfminer_logging()
    page_info = []
    total = 0

    def handle_result(path, count, error):
        nonlocal total
        if error:
            if error_callback:
                error_callback(error)
            else:
                print(error, end="")
            return
        page_info.append((path, count))
        total += count
        if progress_callback:
            progress_callback(total)  # report the running total to the caller

    if len(pdf_files) > 14:
        # Many files: count them in a process pool.
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
            for outcome in executor.map(_process_single_pdf, pdf_files):
                handle_result(*outcome)
    else:
        for outcome in map(_process_single_pdf, pdf_files):
            handle_result(*outcome)

    return page_info, total
542
+
543
+
544
+
545
+
546
# -------------------- GUI --------------------
class FileManager(wx.Frame):
    """Main window: manage a list of PDFs, run the parser, show the results."""

    def __init__(self, parent):
        super().__init__(parent, title="PDF Parser - Sevenof9_v7d", size=(1000, 800))
        self.files = []
        # Created before the widgets so every handler can rely on it.
        self.stop_flag = threading.Event()
        self.InitUI()

    def InitUI(self):
        """Build all widgets and bind their event handlers."""
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)

        lbl1 = wx.StaticText(panel, label="Filed PDF files: (with right mouse you can remove and open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)

        hbox_lbl1.AddStretchSpacer()  # pushes the help button to the far right

        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)

        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser),
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # kept so it can be disabled while parsing
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)

        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status line
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")

        for lbl in [
            self.lbl_processed_pages,
            self.lbl_total_pages,
            self.lbl_pages_per_sec,
            self.lbl_est_time,
            self.lbl_elapsed_time,
        ]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)

        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        """Show the built-in help text in the text frame."""
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also selected\n"
            "If:\n"
            "[INFO] File completed: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "• Adds the label “Page X” at the beginning of every page (absdlute number)\n"
            "• Adds the label “Chapter” for large font and/or “important” for bold font\n"
            "\n"
            "Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages:\n"
            "Then, all small PDFs are processed in parallel:\n"
            "Then, each large PDF is processed page by page in parallel:\n"
        )
        self.text_ctrl.SetValue(help_text)

    def _add_path(self, path):
        """Append a normalised path to the file list unless it is already there."""
        path = os.path.normpath(path)
        if path not in self.files:
            self.files.append(path)
            self.listbox.Append(path)

    def _first_selected_path(self):
        """Return the path of the first selected list entry, or None."""
        selection = self.listbox.GetSelections()
        return self.files[selection[0]] if selection else None

    def AddFolder(self, event):
        """Add every *.pdf below a folder chosen by the user (recursively)."""
        with wx.DirDialog(self, "Select Folder") as dlg:
            if dlg.ShowModal() != wx.ID_OK:
                return
            for root, _, files in os.walk(dlg.GetPath()):
                for f in files:
                    if f.lower().endswith(".pdf"):
                        self._add_path(os.path.join(root, f))

    def AddFile(self, event):
        """Add PDF files chosen in a file dialog."""
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    # Normalised like AddFolder's paths so the same file
                    # cannot be listed twice under two spellings.
                    self._add_path(path)

    def RemoveFile(self, event):
        """Remove the selected entries from the list."""
        # Highest index first so the remaining indices stay valid.
        for i in reversed(self.listbox.GetSelections()):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        """Remove every entry from the list."""
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        """Open the first selected PDF with the system's default application."""
        path = self._first_selected_path()
        if path is None:
            return
        if platform.system() == "Windows":
            os.startfile(path)
        elif platform.system() == "Darwin":
            subprocess.call(["open", path])
        else:
            subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        """Copy the path of the first selected PDF to the clipboard."""
        path = self._first_selected_path()
        if path is not None and wx.TheClipboard.Open():
            wx.TheClipboard.SetData(wx.TextDataObject(path))
            wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        """Open the folder of the first selected PDF in the file manager."""
        path = self._first_selected_path()
        if path is None:
            return
        folder = os.path.dirname(path)
        if platform.system() == "Windows":
            # Argument list instead of a hand-quoted command string.
            subprocess.Popen(["explorer", folder])
        elif platform.system() == "Darwin":
            subprocess.call(["open", folder])
        else:
            subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        """Show the context menu when at least one entry is selected."""
        if self.listbox.GetSelections():
            # The event position is relative to the list box, not to this
            # frame, so the menu is shown at the default position (the current
            # mouse cursor position) instead.
            self.PopupMenu(self.popup_menu)

    def StartParser(self, event):
        """Start converting all listed PDFs in a background thread."""
        if not self.files:
            wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
            wx.CallAfter(self.start_btn.Enable)
            return

        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()

        # Snapshot so later edits of the list do not affect the running job.
        files = list(self.files)

        def background():
            try:
                self._parse_files(files)
            except Exception as e:
                wx.CallAfter(self.AppendProg, f"[ERROR] {type(e).__name__}: {e}\n")
            finally:
                # Always re-enable the button, even after an unexpected error.
                wx.CallAfter(self.start_btn.Enable)
                self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def _parse_files(self, files):
        """Count pages, then convert small and large PDFs (worker thread).

        Runs outside the GUI thread, so every widget access goes through
        ``wx.CallAfter``.
        """

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        # Counting opens every PDF once; doing it here keeps the window
        # responsive while it runs.
        page_info, total_pages = get_total_pages(
            files,
            error_callback=error_callback,
            progress_callback=update_total_pages_live,
        )

        if total_pages == 0:
            wx.CallAfter(self.AppendProg, "[INFO] No pages found.\n")
            return

        tracker = StatusTracker(total_pages)

        def gui_progress_callback(status):
            wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
            wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']:}")
            wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']:}")
            wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
        large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]

        # Small files: each one is converted as a whole in its own process.
        if small:
            max_workers = max(1, min(len(small), get_physical_cores()))
            with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                futures = {}
                for path, count in small:
                    if self.stop_flag.is_set():
                        break
                    future = executor.submit(save_pdf, path, count, None, False, None)
                    futures[future] = (path, count)

                for future in concurrent.futures.as_completed(futures):
                    if self.stop_flag.is_set():
                        # Drop files that are still queued; files already
                        # running finish when the with-block exits.
                        for pending in futures:
                            pending.cancel()
                        break
                    path, count = futures[future]
                    try:
                        pages_processed = future.result()
                        tracker.update(pages_processed)
                        throttled_gui_callback(tracker.get_status())
                        wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                    except Exception as e:
                        wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

        # Large files: one file at a time, its pages in parallel.
        for path, count in large:
            if self.stop_flag.is_set():
                break

            try:
                pages_processed = save_pdf(
                    path,
                    count,
                    tracker,
                    parallel=True,
                    progress_callback=throttled_gui_callback,
                    stop_flag=self.stop_flag,
                )
                if pages_processed:
                    wx.CallAfter(
                        self.AppendProg,
                        f"[INFO] File ready: {path} ({pages_processed} Seiten)\n",
                    )
                else:
                    wx.CallAfter(
                        self.AppendProg,
                        f"[INFO] Stopped: {path}\n",
                    )
            except Exception as e:
                wx.CallAfter(
                    self.AppendProg,
                    f"[ERROR] File {path}: {str(e)}\n",
                )

        # The throttle may have dropped the last update; push the final
        # numbers once more without throttling.
        gui_progress_callback(tracker.get_status())
        wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")

    def StopParser(self, event):
        """Ask the running job to stop."""
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        """Show the .txt that belongs to the first selected PDF, if any."""
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        """Append text to the progress log."""
        self.prog_ctrl.AppendText(text)
850
+
851
+
852
# -------------------- Entry point --------------------
def main():
    """Run as a command-line converter when paths are given, else start the GUI.

    With arguments, every argument is treated as a PDF path; progress is
    printed as one JSON object per update. Without arguments the wx window
    is opened.
    """
    pdf_files = sys.argv[1:]
    if not pdf_files:
        app = wx.App(False)
        frame = FileManager(None)
        frame.Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(pdf_files)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        print(json.dumps(status))

    for path, count in page_info:
        use_pool = count > PARALLEL_THRESHOLD
        save_pdf(path, count, tracker, parallel=use_pool, progress_callback=cli_callback)


if __name__ == "__main__":
    # Needed so a frozen Windows executable can start worker processes.
    multiprocessing.freeze_support()
    main()