Spaces:
Sleeping
Sleeping
Delete tools/preprocess_pdfs.py
Browse files - tools/preprocess_pdfs.py +0 -57
tools/preprocess_pdfs.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import fitz # PyMuPDF
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
# --- Configuration ---
# Base path of the knowledge base directory. Set the KB_BASE_PATH environment
# variable to point the script at another location without editing this file;
# an unset or empty variable falls back to the original default below.
KB_BASE_PATH = Path(
    os.environ.get("KB_BASE_PATH") or "/home/zclap/research/wurth/Chatbot/kb"
)
# Source PDFs are read from per-machine sub-folders of this directory.
PDF_SOURCE_PATH = KB_BASE_PATH / "pdfs"
# Rendered page images are written here, one sub-folder per machine.
IMAGE_OUTPUT_PATH = KB_BASE_PATH / "processed_images"
|
| 10 |
-
|
| 11 |
-
def _render_pdf_pages(pdf_path: Path, output_dir: Path, dpi: int = 200) -> int:
    """Render every page of one PDF to a PNG file in *output_dir*.

    Output files are named ``<pdf stem>_page_<n>.png`` with ``n`` starting at 1.

    Args:
        pdf_path: PDF file to rasterize.
        output_dir: Existing directory that receives the PNG files.
        dpi: Rendering resolution passed to ``page.get_pixmap``.

    Returns:
        The number of pages in the document (all of them were written).
    """
    pdf_name = pdf_path.stem  # PDF filename without extension
    # The context manager closes the document even when a page fails to
    # render, so a bad PDF no longer leaks an open file handle.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        for page_num in range(page_count):
            pix = doc.load_page(page_num).get_pixmap(dpi=dpi)
            pix.save(output_dir / f"{pdf_name}_page_{page_num + 1}.png")
    return page_count


def convert_pdfs_to_images(dpi: int = 200) -> None:
    """Convert every PDF under ``PDF_SOURCE_PATH`` into per-page PNG images.

    Iterates through the machine-specific folders in the PDF source directory,
    converts each page of each PDF into a PNG image, and saves it in a
    folder of the same name under ``IMAGE_OUTPUT_PATH``. Progress and
    per-file errors are reported with ``print``; a PDF that fails to convert
    is skipped and the run continues with the next one.

    Args:
        dpi: Rendering resolution for the page images. The default of 200
            matches the previously hard-coded value.

    Raises:
        OSError: If ``PDF_SOURCE_PATH`` cannot be listed (for example, it does
            not exist) or an output directory cannot be created.
    """
    print("Starting PDF to image conversion process...")

    # Ensure the main output directory exists (including missing parents)
    IMAGE_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

    # Each sub-directory of the pdfs directory is one machine (e.g. 'machine1').
    # Sorting makes the processing order, and therefore the log, deterministic.
    for machine_dir in sorted(PDF_SOURCE_PATH.iterdir()):
        if not machine_dir.is_dir():
            continue

        machine_name = machine_dir.name
        print(f"\nProcessing machine: {machine_name}")

        # Create a corresponding output directory for the machine's images
        machine_output_dir = IMAGE_OUTPUT_PATH / machine_name
        machine_output_dir.mkdir(parents=True, exist_ok=True)

        # Match the extension case-insensitively so files such as 'MANUAL.PDF'
        # are not silently skipped, and ignore directories named like PDFs.
        pdf_paths = sorted(
            entry
            for entry in machine_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )
        for pdf_path in pdf_paths:
            print(f" - Converting PDF: {pdf_path.name}")
            try:
                page_count = _render_pdf_pages(pdf_path, machine_output_dir, dpi)
            except Exception as e:
                # Deliberate best-effort boundary: one unreadable PDF must not
                # abort the whole batch, so report it and move on.
                print(f" ! Error processing {pdf_path.name}: {e}")
            else:
                print(f" > Converted {page_count} pages successfully.")

    print("\nConversion process finished.")
|
| 55 |
-
|
| 56 |
-
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    convert_pdfs_to_images()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|