Spaces:

zclap
/

Wurth_LouisAssistant

Sleeping

App Files Community

Akshajzclap committed on Oct 23, 2025

Commit

4fe0a0b

verified ·

1 Parent(s): daf7565

Delete tools/preprocess_pdfs.py

Browse files

Files changed (1) hide show

tools/preprocess_pdfs.py +0 -57

tools/preprocess_pdfs.py DELETED Viewed

@@ -1,57 +0,0 @@
-import os
-import fitz  # PyMuPDF
-from pathlib import Path
-# --- Configuration ---
-# Set the base path to your knowledge base directory
-KB_BASE_PATH = Path("/home/zclap/research/wurth/Chatbot/kb")
-PDF_SOURCE_PATH = KB_BASE_PATH / "pdfs"
-IMAGE_OUTPUT_PATH = KB_BASE_PATH / "processed_images"
-def convert_pdfs_to_images():
-    """
-    Iterates through machine-specific folders in the PDF source directory,
-    converts each page of each PDF into a PNG image, and saves it in a
-    corresponding folder in the output directory.
-    """
-    print("Starting PDF to image conversion process...")
-    # Ensure the main output directory exists
-    IMAGE_OUTPUT_PATH.mkdir(exist_ok=True)
-    # Iterate through each item in the pdfs directory (e.g., 'machine1', 'machine2')
-    for machine_dir in PDF_SOURCE_PATH.iterdir():
-        if not machine_dir.is_dir():
-            continue
-        machine_name = machine_dir.name
-        print(f"\nProcessing machine: {machine_name}")
-        # Create a corresponding output directory for the machine's images
-        machine_output_dir = IMAGE_OUTPUT_PATH / machine_name
-        machine_output_dir.mkdir(exist_ok=True)
-        # Find all PDF files in the machine's source directory
-        for pdf_path in machine_dir.glob("*.pdf"):
-            pdf_name = pdf_path.stem  # Get PDF filename without extension
-            print(f"  - Converting PDF: {pdf_path.name}")
-            try:
-                doc = fitz.open(pdf_path)
-                # Iterate through each page of the PDF
-                for page_num in range(len(doc)):
-                    page = doc.load_page(page_num)
-                    pix = page.get_pixmap(dpi=200)  # Higher DPI for better quality
-                    # Define the output image file path
-                    output_image_path = machine_output_dir / f"{pdf_name}_page_{page_num + 1}.png"
-                    pix.save(output_image_path)
-                print(f"    > Converted {len(doc)} pages successfully.")
-            except Exception as e:
-                print(f"    ! Error processing {pdf_path.name}: {e}")
-    print("\nConversion process finished.")
-if __name__ == "__main__":
-    convert_pdfs_to_images()