Spaces:

datasciencesage
/

chatbot_gradio

Sleeping

App Files Files Community

datasciencesage committed on Nov 20, 2025

Commit

5b2476c

verified ·

1 Parent(s): 10acd92

Create step1_get_images.py

Browse files

Files changed (1) hide show

step1_get_images.py +139 -0

step1_get_images.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os
+import subprocess
+from pathlib import Path
+from pdf2image import convert_from_path
+from tqdm import tqdm
+from PIL import Image
+# Create directories
def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir, timeout=60):
    """Convert a .docx file to PDF by running headless LibreOffice.

    ``soffice`` is only told the output directory (``--outdir``); it picks
    the output file name itself.  ``output_pdf_path`` is therefore used
    solely to verify that the PDF appeared, and the caller is expected to
    pass the location inside ``temp_pdf_dir`` where LibreOffice writes it.

    Args:
        docx_path: Path of the source document.
        output_pdf_path: Path where the converted PDF is expected to appear.
        temp_pdf_dir: Directory handed to ``soffice --outdir``.
        timeout: Seconds to wait for LibreOffice before giving up.

    Returns:
        True if ``soffice`` exited with status 0 and ``output_pdf_path``
        exists afterwards, otherwise False.  Failures are reported on
        stdout rather than raised.
    """
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(temp_pdf_dir),
        str(docx_path),
    ]
    try:
        # A PDF left over from an earlier run would satisfy the existence
        # check below even if this conversion produced nothing, so remove
        # it before starting.
        try:
            os.remove(output_pdf_path)
        except FileNotFoundError:
            pass
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )
    except FileNotFoundError:
        print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
        return False
    except subprocess.TimeoutExpired:
        print(f"❌ Error converting {docx_path}: timed out after {timeout} seconds")
        return False
    except Exception as e:
        print(f"❌ Error converting {docx_path}: {str(e)}")
        return False

    # The exit status alone is not trusted; the output file must exist too.
    if result.returncode == 0 and os.path.exists(output_pdf_path):
        print(f"✅ Converted to PDF: {output_pdf_path}")
        return True

    details = result.stderr.strip() or "no PDF was produced"
    print(f"❌ Error converting {docx_path}: {details}")
    return False
def pdf_to_images(pdf_path, output_base_path):
    """Render each page of a PDF to ``<output_base_path>_page<N>.png``.

    Args:
        pdf_path: Path of the PDF to render.
        output_base_path: Output path without extension (str or Path); the
            page suffix and ``.png`` are appended to its final component.

    Returns:
        The number of page images written.  Returns 0 when the PDF yields
        no pages or when any error occurs (the error is printed, not
        raised).
    """
    try:
        # Accept plain strings as well as Path objects.
        output_base_path = Path(output_base_path)

        # Render all pages at 300 DPI; pdf2image uses up to 4 threads.
        images = convert_from_path(
            pdf_path,
            dpi=300,
            fmt='png',
            thread_count=4,
        )
        if not images:
            print(f"⚠️  No pages found in {pdf_path}")
            return 0

        saved_count = 0
        for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
            width, height = image.size
            if width <= 0 or height <= 0:
                print(f"⚠️  Skipping page {page_num}: Invalid dimensions ({width}x{height})")
                continue
            # Size sanity check only; page content is not inspected.
            if width < 50 or height < 50:
                print(f"⚠️  Skipping page {page_num}: Too small ({width}x{height})")
                continue

            # Use .name rather than .stem: the base path carries no file
            # extension, so .stem would truncate a dotted document name
            # ("report.v2" -> "report") and let different documents
            # overwrite each other's page images.
            output_image_path = output_base_path.with_name(
                f"{output_base_path.name}_page{page_num}.png"
            )
            image.save(output_image_path, "PNG", optimize=True)
            saved_count += 1

        print(f"✅ Saved {saved_count}/{len(images)} pages")
        return saved_count
    except Exception as e:
        print(f"❌ Error processing {pdf_path}: {str(e)}")
        return 0
+# Process all .docx and .pdf files
def get_images(
    INPUT_PATH_OF_DOCS="all_documents/",
    TEMP_PDF_PATH="temp_pdfs/",
    OUTPUT_PATH_OF_SCREENSHOTS="images/",
):
    """Convert every .docx and .pdf file in a directory to page images.

    .docx files are first converted to PDF inside ``TEMP_PDF_PATH`` via
    ``docx_to_pdf``; each PDF is then rendered with ``pdf_to_images`` into
    ``OUTPUT_PATH_OF_SCREENSHOTS``.  A summary is printed and the
    intermediate PDFs are deleted afterwards.

    Args:
        INPUT_PATH_OF_DOCS: Directory scanned (non-recursively) for documents.
        TEMP_PDF_PATH: Directory for intermediate PDFs; created if missing.
        OUTPUT_PATH_OF_SCREENSHOTS: Directory for the PNG files; created if
            missing.

    Returns:
        None.  Progress and errors are reported on stdout.
    """
    if not os.path.isdir(INPUT_PATH_OF_DOCS):
        print(f"❌ Input directory not found: {INPUT_PATH_OF_DOCS}")
        return

    temp_pdf_dir = Path(TEMP_PDF_PATH)
    temp_pdf_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
    output_dir.mkdir(parents=True, exist_ok=True)

    total_processed = 0
    total_images = 0
    # Only PDFs requested by this run are cleaned up.  Deleting every *.pdf
    # in the temp directory would destroy unrelated files, including the
    # source PDFs if the temp and input directories coincide.
    created_pdfs = []

    # Sorted so that document numbering is the same on every run.
    for idx, paths in enumerate(sorted(os.listdir(INPUT_PATH_OF_DOCS)), start=1):
        whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
        if not os.path.isfile(whole_path):
            continue

        lowered = paths.lower()
        output_base_path = output_dir / Path(paths).stem

        if lowered.endswith('.docx'):
            print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
            temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
            created_pdfs.append(temp_pdf_path)
            if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
                print("📸 Converting to images...")
                total_images += pdf_to_images(temp_pdf_path, output_base_path)
                total_processed += 1
        elif lowered.endswith('.pdf'):
            print(f"\n📄 Processing .pdf: {paths} (Document #{idx})")
            total_images += pdf_to_images(whole_path, output_base_path)
            total_processed += 1

    separator = '=' * 50
    print(f"\n{separator}")
    print("📊 CONVERSION SUMMARY")
    print(separator)
    print(f"Documents processed: {total_processed}")
    print(f"Total images saved: {total_images}")
    print(separator)

    print("\n🧹 Cleaning up temporary files...")
    for temp_pdf in created_pdfs:
        try:
            temp_pdf.unlink()
            print(f"✅ Deleted: {temp_pdf.name}")
        except FileNotFoundError:
            # The conversion failed and left nothing behind.
            continue
        except OSError as e:
            print(f"❌ Error deleting {temp_pdf}: {str(e)}")
# Script entry point: run the conversion with the default directories.
if __name__ == "__main__":
    get_images()