Akshajzclap committed on
Commit
4fe0a0b
·
verified ·
1 Parent(s): daf7565

Delete tools/preprocess_pdfs.py

Browse files
Files changed (1) hide show
  1. tools/preprocess_pdfs.py +0 -57
tools/preprocess_pdfs.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- import fitz # PyMuPDF
3
- from pathlib import Path
4
-
5
# --- Configuration ---
# Base path of the knowledge base directory. Defaults to the original
# development location; set the KB_BASE_PATH environment variable to point
# the script at a different location without editing this file.
KB_BASE_PATH = Path(
    os.environ.get("KB_BASE_PATH", "/home/zclap/research/wurth/Chatbot/kb")
)
# Input: one sub-directory of PDFs per machine.
PDF_SOURCE_PATH = KB_BASE_PATH / "pdfs"
# Output: rendered page images, one sub-directory per machine.
IMAGE_OUTPUT_PATH = KB_BASE_PATH / "processed_images"
10
-
11
def convert_pdfs_to_images():
    """Render every page of every knowledge-base PDF to a PNG image.

    Each sub-directory of ``PDF_SOURCE_PATH`` is treated as one machine.
    For every ``*.pdf`` file directly inside it, each page is rendered at
    200 DPI and saved as ``<pdf stem>_page_<n>.png`` (``n`` starts at 1) in
    a directory of the same name under ``IMAGE_OUTPUT_PATH``.

    A failure while converting one PDF is reported on stdout and does not
    stop the remaining PDFs from being processed.

    Raises:
        OSError: If ``PDF_SOURCE_PATH`` cannot be listed (for example
            ``FileNotFoundError`` when it does not exist) or an output
            directory cannot be created.
    """
    print("Starting PDF to image conversion process...")

    # List the source first so a missing source directory fails before any
    # output directory is created. Sorting gives a reproducible order.
    machine_dirs = sorted(
        entry for entry in PDF_SOURCE_PATH.iterdir() if entry.is_dir()
    )

    # parents=True: the knowledge-base root itself may not exist yet.
    IMAGE_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

    for machine_dir in machine_dirs:
        machine_name = machine_dir.name
        print(f"\nProcessing machine: {machine_name}")

        # Mirror the machine's folder name in the output tree.
        machine_output_dir = IMAGE_OUTPUT_PATH / machine_name
        machine_output_dir.mkdir(exist_ok=True)

        for pdf_path in sorted(machine_dir.glob("*.pdf")):
            print(f" - Converting PDF: {pdf_path.name}")
            try:
                page_count = _render_pdf_pages(pdf_path, machine_output_dir)
            except Exception as e:
                # Best-effort batch job: report the broken PDF and carry on.
                print(f" ! Error processing {pdf_path.name}: {e}")
            else:
                print(f" > Converted {page_count} pages successfully.")

    print("\nConversion process finished.")


def _render_pdf_pages(pdf_path, output_dir, dpi=200):
    """Save each page of *pdf_path* as a PNG in *output_dir*; return the page count."""
    # The context manager closes the document even if rendering fails,
    # so a long batch does not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)  # Higher DPI for better quality
            pix.save(output_dir / f"{pdf_path.stem}_page_{page_num}.png")
        return len(doc)
55
-
56
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    convert_pdfs_to_images()