Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| from pathlib import Path | |
| from pdf2image import convert_from_path | |
| from tqdm import tqdm | |
| from PIL import Image | |
def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir):
    """Convert a DOCX file to PDF with a headless LibreOffice run.

    LibreOffice is only told the output directory (``--outdir``); it picks
    the file name itself. This function then checks that ``output_pdf_path``
    exists, so that path is expected to be ``temp_pdf_dir/<docx stem>.pdf``.

    Args:
        docx_path: DOCX file to convert (str or Path).
        output_pdf_path: Where the converted PDF is expected to appear.
        temp_pdf_dir: Directory passed to ``soffice`` as ``--outdir``.

    Returns:
        True when ``soffice`` exits with status 0 and ``output_pdf_path``
        exists afterwards, False otherwise. Ordinary failures (missing
        executable, timeout, conversion error) are printed and reported as
        False instead of being raised.
    """
    try:
        expected_pdf = Path(output_pdf_path)

        # A PDF left over from an earlier run would make a failed conversion
        # look successful, because success is judged by file existence.
        try:
            expected_pdf.unlink()
        except FileNotFoundError:
            pass  # nothing stale to remove

        command = [
            "soffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(temp_pdf_dir),
            str(docx_path),
        ]
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=60,
        )

        if result.returncode == 0 and expected_pdf.exists():
            print(f"β Converted to PDF: {output_pdf_path}")
            return True

        # Fall back to stdout when stderr is blank, so the message is not
        # empty when the exit status is 0 but no PDF was produced.
        details = result.stderr if result.stderr.strip() else result.stdout
        print(f"β Error converting {docx_path}: {details}")
        return False
    except FileNotFoundError:
        # Raised by subprocess.run when the soffice executable is missing.
        print("β Error: 'soffice' not found. Ensure LibreOffice is installed.")
        return False
    except Exception as e:
        # Includes subprocess.TimeoutExpired after the 60 second limit.
        print(f"β Error converting {docx_path}: {str(e)}")
        return False
def pdf_to_images(pdf_path, output_base_path, dpi=300, thread_count=4):
    """Render every page of a PDF to a PNG file.

    Pages are written next to ``output_base_path`` as
    ``<base name>_page<N>.png`` with N starting at 1.

    Args:
        pdf_path: PDF file to render.
        output_base_path: Extension-less base path (str or Path). Its final
            component is used whole, so a base such as ``report.v2`` yields
            ``report.v2_page1.png`` rather than being cut at the dot.
        dpi: Rendering resolution passed to ``convert_from_path``. The
            default of 300 keeps math equations legible.
        thread_count: Worker count passed to ``convert_from_path``.

    Returns:
        Number of page images saved. 0 when the PDF has no pages or when
        any error occurs; errors are printed, not raised.
    """
    try:
        base_path = Path(output_base_path)
        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            fmt='png',
            thread_count=thread_count,
        )
        if not images:
            print(f"β οΈ No pages found in {pdf_path}")
            return 0

        saved_count = 0
        for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
            width, height = image.size
            # Anything under 50 px on a side is treated as a broken render.
            if width < 50 or height < 50:
                print(f"β οΈ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
                continue
            # Use .name, not .stem: the base has no extension, so .stem would
            # drop everything after the last dot of the document name.
            output_image_path = base_path.with_name(
                f"{base_path.name}_page{page_num}.png"
            )
            image.save(output_image_path, "PNG", optimize=True)
            saved_count += 1

        print(f"β Saved {saved_count}/{len(images)} pages")
        return saved_count
    except Exception as e:
        print(f"β Error processing {pdf_path}: {str(e)}")
        return 0
def get_images(INPUT_PATH_OF_DOCS="all_documents/",
               TEMP_PDF_PATH="temp_pdfs/",
               OUTPUT_PATH_OF_SCREENSHOTS="images/"):
    """Convert every DOCX and PDF in a directory to per-page PNG images.

    Only regular files directly inside ``INPUT_PATH_OF_DOCS`` are considered
    (no recursion). ``.docx`` files are first converted to a temporary PDF;
    ``.pdf`` files are rendered directly. A summary is printed at the end and
    the temporary PDFs of this run are removed.

    Args:
        INPUT_PATH_OF_DOCS: Directory holding the source documents.
        TEMP_PDF_PATH: Directory for intermediate PDFs; created if missing.
        OUTPUT_PATH_OF_SCREENSHOTS: Directory for the PNG output; created if
            missing.

    Returns:
        None. Progress and totals are reported with ``print``.
    """
    total_processed = 0
    total_images = 0

    temp_pdf_dir = Path(TEMP_PDF_PATH)
    temp_pdf_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Temp PDFs targeted by this run; cleanup is limited to these so that
    # unrelated PDFs already present in the temp directory are left alone.
    temp_pdfs_of_this_run = []

    # os.listdir order is arbitrary; sorting keeps the numbering stable.
    for idx, paths in enumerate(sorted(os.listdir(INPUT_PATH_OF_DOCS)), start=1):
        whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
        if not os.path.isfile(whole_path):
            continue

        output_base_path = output_dir / Path(paths).stem
        lowered_name = paths.lower()

        if lowered_name.endswith('.docx'):
            print(f"\nπ Processing .docx: {paths} (Document #{idx})")
            temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
            temp_pdfs_of_this_run.append(temp_pdf_path)
            if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
                print("πΈ Converting to images...")
                count = pdf_to_images(temp_pdf_path, output_base_path)
                total_images += count
                total_processed += 1
        elif lowered_name.endswith('.pdf'):
            print(f"\nπ Processing .pdf: {paths} (Document #{idx})")
            count = pdf_to_images(whole_path, output_base_path)
            total_images += count
            total_processed += 1

    print(f"\n{'='*50}")
    print("π CONVERSION SUMMARY")
    print(f"{'='*50}")
    print(f"Documents processed: {total_processed}")
    print(f"Total images saved: {total_images}")
    print(f"{'='*50}")

    # Cleanup temp PDFs
    print("\nπ§Ή Cleaning up temporary files...")
    for temp_pdf in temp_pdfs_of_this_run:
        if not temp_pdf.exists():
            continue  # conversion failed, nothing was written
        try:
            temp_pdf.unlink()
            print(f"β Deleted: {temp_pdf.name}")
        except Exception as e:
            print(f"β Error deleting {temp_pdf}: {str(e)}")