chatbot_gradio / step1_get_images.py
datasciencesage's picture
Update step1_get_images.py
533248a verified
import os
import subprocess
from pathlib import Path
from pdf2image import convert_from_path
from tqdm import tqdm
from PIL import Image
def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir):
    """Convert a DOCX file to PDF using headless LibreOffice.

    Runs ``soffice --headless --convert-to pdf --outdir <temp_pdf_dir>
    <docx_path>`` with a 60-second timeout and prints the outcome.

    Args:
        docx_path: Path of the source ``.docx`` file.
        output_pdf_path: Path where the converted PDF is expected to appear.
            Only ``temp_pdf_dir`` is passed to LibreOffice, so this should
            point inside that directory; it is used to verify the result.
        temp_pdf_dir: Directory handed to LibreOffice via ``--outdir``.

    Returns:
        True if the command exited with status 0 and ``output_pdf_path``
        exists afterwards, otherwise False. Failures (missing ``soffice``
        binary, timeout, any other error) are printed, never raised.
    """
    try:
        expected_pdf = Path(output_pdf_path)
        # A PDF left over from an earlier run would make the existence check
        # below succeed even if this conversion produced nothing, so remove
        # it before converting.
        if expected_pdf.exists():
            expected_pdf.unlink()

        command = [
            "soffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(temp_pdf_dir),
            str(docx_path),
        ]
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=60,
        )
        if result.returncode == 0 and expected_pdf.exists():
            print(f"✅ Converted to PDF: {output_pdf_path}")
            return True

        # stderr can be empty on failure; fall back to stdout or the exit
        # code so the message always carries some diagnostic.
        details = (
            result.stderr.strip()
            or result.stdout.strip()
            or f"exit code {result.returncode}"
        )
        print(f"❌ Error converting {docx_path}: {details}")
        return False
    except FileNotFoundError:
        print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
        return False
    except Exception as e:
        print(f"❌ Error converting {docx_path}: {str(e)}")
        return False
def pdf_to_images(pdf_path, output_base_path):
    """Render every page of a PDF to a PNG file at 300 DPI.

    Pages are saved next to ``output_base_path`` as
    ``<base name>_page<N>.png``, with N starting at 1. Pages narrower or
    shorter than 50 pixels are skipped with a warning.

    Args:
        pdf_path: Path of the PDF to render.
        output_base_path: Directory plus base file name *without* extension
            (for example ``images/report``). The whole final path component
            is used as the base name, so dots inside it are preserved.

    Returns:
        The number of PNG files saved. Returns 0 when the PDF has no pages
        or when any error occurs; errors are printed, never raised.
    """
    try:
        images = convert_from_path(
            pdf_path,
            dpi=300,  # High quality for math equations
            fmt='png',
            thread_count=4,
        )
        if not images:
            print(f"⚠️ No pages found in {pdf_path}")
            return 0

        base = Path(output_base_path)
        base.parent.mkdir(parents=True, exist_ok=True)

        saved_count = 0
        for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
            width, height = image.size
            if width < 50 or height < 50:
                print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
                continue
            # Use .name, not .stem: the base carries no extension, so .stem
            # would cut a dotted name such as "report.v2" down to "report"
            # and let different documents overwrite each other's pages.
            output_image_path = base.parent / f"{base.name}_page{page_num}.png"
            image.save(output_image_path, "PNG", optimize=True)
            saved_count += 1

        print(f"✅ Saved {saved_count}/{len(images)} pages")
        return saved_count
    except Exception as e:
        print(f"❌ Error processing {pdf_path}: {str(e)}")
        return 0
def get_images(INPUT_PATH_OF_DOCS="all_documents/",
               TEMP_PDF_PATH="temp_pdfs/",
               OUTPUT_PATH_OF_SCREENSHOTS="images/"):
    """Convert every .docx and .pdf file in a directory to page images.

    ``.docx`` files are first converted to a temporary PDF with
    ``docx_to_pdf`` and then rendered with ``pdf_to_images``; ``.pdf`` files
    are rendered directly. Only regular files directly inside
    ``INPUT_PATH_OF_DOCS`` are considered, in sorted name order. A summary
    is printed, and the temporary PDFs created during this call are deleted.

    Args:
        INPUT_PATH_OF_DOCS: Directory containing the source documents.
        TEMP_PDF_PATH: Directory for intermediate PDFs; created if missing.
        OUTPUT_PATH_OF_SCREENSHOTS: Directory for the PNG output; created if
            missing.

    Returns:
        None. Progress and totals are reported on stdout.

    Raises:
        OSError: If ``INPUT_PATH_OF_DOCS`` cannot be listed (for example
            because it does not exist).
    """
    total_processed = 0
    total_images = 0

    temp_pdf_dir = Path(TEMP_PDF_PATH)
    temp_pdf_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Track the intermediate PDFs of this run so cleanup never touches
    # unrelated PDFs that already lived in the temp directory.
    created_temp_pdfs = []
    doc_number = 0

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(INPUT_PATH_OF_DOCS)):
        whole_path = os.path.join(INPUT_PATH_OF_DOCS, file_name)
        if not os.path.isfile(whole_path):
            continue

        lowered = file_name.lower()
        if not lowered.endswith(('.docx', '.pdf')):
            continue

        # Number only the documents that are actually handled.
        doc_number += 1
        output_base_path = output_dir / Path(file_name).stem

        if lowered.endswith('.docx'):
            print(f"\n📄 Processing .docx: {file_name} (Document #{doc_number})")
            temp_pdf_path = temp_pdf_dir / f"{Path(file_name).stem}.pdf"
            created_temp_pdfs.append(temp_pdf_path)
            if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
                print("📸 Converting to images...")
                total_images += pdf_to_images(temp_pdf_path, output_base_path)
                total_processed += 1
        else:
            print(f"\n📄 Processing .pdf: {file_name} (Document #{doc_number})")
            total_images += pdf_to_images(whole_path, output_base_path)
            total_processed += 1

    separator = '=' * 50
    print(f"\n{separator}")
    print("📊 CONVERSION SUMMARY")
    print(separator)
    print(f"Documents processed: {total_processed}")
    print(f"Total images saved: {total_images}")
    print(separator)

    # Cleanup temp PDFs created by this run
    print("\n🧹 Cleaning up temporary files...")
    for temp_pdf in created_temp_pdfs:
        if not temp_pdf.exists():
            # Conversion failed before producing a file; nothing to remove.
            continue
        try:
            temp_pdf.unlink()
            print(f"✅ Deleted: {temp_pdf.name}")
        except OSError as e:
            print(f"❌ Error deleting {temp_pdf}: {str(e)}")