Spaces:

datasciencesage
/

chatbot_gradio

Sleeping

App Files Files Community

chatbot_gradio / step1_get_images.py

datasciencesage

Update step1_get_images.py

533248a verified 4 months ago

raw

history blame contribute delete

4.28 kB

	import os
	import subprocess
	from pathlib import Path
	from pdf2image import convert_from_path
	from tqdm import tqdm
	from PIL import Image

	def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir, timeout=60):
	    """Convert a DOCX file to PDF using headless LibreOffice.

	    Runs ``soffice --headless --convert-to pdf --outdir <temp_pdf_dir>
	    <docx_path>`` and reports success only when the command exits with
	    status 0 AND ``output_pdf_path`` was created or rewritten by this call.

	    Args:
	        docx_path: Path of the source ``.docx`` file.
	        output_pdf_path: Path where the converted PDF is expected to appear.
	            The caller must make this agree with what LibreOffice writes
	            into ``temp_pdf_dir``.
	        temp_pdf_dir: Directory handed to ``--outdir``.
	        timeout: Seconds to wait for LibreOffice before giving up.

	    Returns:
	        True when a fresh PDF was produced, False otherwise. Every failure
	        is printed; nothing is raised to the caller.
	    """

	    def _mtime_ns(path):
	        # Modification stamp of *path*, or None when it cannot be stat'ed.
	        try:
	            return os.stat(path).st_mtime_ns
	        except OSError:
	            return None

	    try:
	        # Remember the state of any PDF already sitting at the output path
	        # so a leftover from an earlier run is not mistaken for new output.
	        stamp_before = _mtime_ns(output_pdf_path)

	        command = [
	            "soffice",
	            "--headless",
	            "--convert-to",
	            "pdf",
	            "--outdir",
	            str(temp_pdf_dir),
	            str(docx_path),
	        ]

	        result = subprocess.run(
	            command,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            text=True,
	            timeout=timeout,
	        )

	        stamp_after = _mtime_ns(output_pdf_path)
	        freshly_written = stamp_after is not None and stamp_after != stamp_before

	        if result.returncode == 0 and freshly_written:
	            print(f"✅ Converted to PDF: {output_pdf_path}")
	            return True

	        # stderr may be empty even though no PDF appeared, so fall back to
	        # stdout and finally to a fixed explanation.
	        detail = (
	            result.stderr.strip()
	            or result.stdout.strip()
	            or "no output PDF was produced"
	        )
	        print(f"❌ Error converting {docx_path}: {detail}")
	        return False

	    except FileNotFoundError:
	        # Raised by subprocess.run when the 'soffice' executable is missing.
	        print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
	        return False
	    except Exception as e:
	        # Includes subprocess.TimeoutExpired; the contract is print-and-False.
	        print(f"❌ Error converting {docx_path}: {str(e)}")
	        return False

	def pdf_to_images(pdf_path, output_base_path, dpi=300):
	    """Render every page of a PDF to a PNG file.

	    Pages are written next to ``output_base_path`` as
	    ``<base name>_page<N>.png`` with N starting at 1.

	    Args:
	        pdf_path: Path of the PDF to render.
	        output_base_path: Extension-less base path (``str`` or ``Path``);
	            its final component becomes the prefix of each image file name.
	        dpi: Rendering resolution passed to ``convert_from_path``.

	    Returns:
	        Number of page images saved. Returns 0 when the PDF has no pages or
	        when any error occurs (the error is printed, not raised).
	    """
	    min_side = 50  # pages narrower or shorter than this many pixels are skipped

	    try:
	        base = Path(output_base_path)

	        images = convert_from_path(
	            pdf_path,
	            dpi=dpi,  # High quality for math equations
	            fmt='png',
	            thread_count=4,
	        )

	        if not images:
	            print(f"⚠️ No pages found in {pdf_path}")
	            return 0

	        # Make sure the destination directory exists before saving.
	        base.parent.mkdir(parents=True, exist_ok=True)

	        saved_count = 0
	        for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
	            width, height = image.size
	            if width < min_side or height < min_side:
	                print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
	                continue

	            # Use the full final component, not .stem: the base path carries
	            # no extension, so .stem would cut a dotted document name
	            # ("report.v2" -> "report") and make different documents collide.
	            output_image_path = base.with_name(f"{base.name}_page{page_num}.png")

	            image.save(output_image_path, "PNG", optimize=True)
	            saved_count += 1

	        print(f"✅ Saved {saved_count}/{len(images)} pages")
	        return saved_count

	    except Exception as e:
	        print(f"❌ Error processing {pdf_path}: {str(e)}")
	        return 0

	def get_images(INPUT_PATH_OF_DOCS="all_documents/",
	               TEMP_PDF_PATH="temp_pdfs/",
	               OUTPUT_PATH_OF_SCREENSHOTS="images/"):
	    """Convert every .docx and .pdf file in a directory to page images.

	    ``.docx`` files are first converted to PDF in ``TEMP_PDF_PATH`` via
	    ``docx_to_pdf``; each PDF is then rendered with ``pdf_to_images`` into
	    ``OUTPUT_PATH_OF_SCREENSHOTS``. Other files and sub-directories are
	    ignored. A summary is printed, and the temporary PDFs requested by this
	    run are deleted afterwards.

	    Args:
	        INPUT_PATH_OF_DOCS: Directory holding the source documents.
	        TEMP_PDF_PATH: Directory for intermediate PDFs (created if missing).
	        OUTPUT_PATH_OF_SCREENSHOTS: Directory for the PNG output (created if
	            missing).

	    Returns:
	        None. Progress and errors are reported with ``print``.
	    """
	    if not os.path.isdir(INPUT_PATH_OF_DOCS):
	        # Report and stop instead of failing inside os.listdir after the
	        # output directories have already been created.
	        print(f"❌ Error: input directory not found: {INPUT_PATH_OF_DOCS}")
	        return

	    total_processed = 0
	    total_images = 0

	    temp_pdf_dir = Path(TEMP_PDF_PATH)
	    temp_pdf_dir.mkdir(parents=True, exist_ok=True)

	    output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Only these paths are removed during cleanup, so unrelated PDFs that
	    # happen to live in the temp directory are left alone.
	    requested_temp_pdfs = []

	    # Sorted so the "Document #" numbering is stable between runs.
	    for idx, file_name in enumerate(sorted(os.listdir(INPUT_PATH_OF_DOCS)), start=1):
	        whole_path = os.path.join(INPUT_PATH_OF_DOCS, file_name)
	        if not os.path.isfile(whole_path):
	            continue

	        doc_stem = Path(file_name).stem
	        output_base_path = output_dir / doc_stem
	        lowered = file_name.lower()

	        if lowered.endswith('.docx'):
	            print(f"\n📄 Processing .docx: {file_name} (Document #{idx})")
	            temp_pdf_path = temp_pdf_dir / f"{doc_stem}.pdf"
	            requested_temp_pdfs.append(temp_pdf_path)

	            if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
	                print("📸 Converting to images...")
	                total_images += pdf_to_images(temp_pdf_path, output_base_path)
	                total_processed += 1

	        elif lowered.endswith('.pdf'):
	            print(f"\n📄 Processing .pdf: {file_name} (Document #{idx})")
	            total_images += pdf_to_images(whole_path, output_base_path)
	            total_processed += 1

	    separator = '=' * 50
	    print(f"\n{separator}")
	    print("📊 CONVERSION SUMMARY")
	    print(separator)
	    print(f"Documents processed: {total_processed}")
	    print(f"Total images saved: {total_images}")
	    print(separator)

	    # Cleanup temp PDFs
	    print("\n🧹 Cleaning up temporary files...")
	    for temp_pdf in requested_temp_pdfs:
	        if not temp_pdf.exists():
	            # The conversion failed before writing anything.
	            continue
	        try:
	            temp_pdf.unlink()
	            print(f"✅ Deleted: {temp_pdf.name}")
	        except OSError as e:
	            print(f"❌ Error deleting {temp_pdf}: {str(e)}")