Spaces:

datasciencesage
/

chatbot_gradio

Sleeping

App Files Files Community

datasciencesage committed on Jan 17

Commit

533248a

verified ·

1 Parent(s): 3375af0

Update step1_get_images.py

Browse files

Files changed (1) hide show

step1_get_images.py +27 -38

step1_get_images.py CHANGED Viewed

@@ -5,12 +5,8 @@ from pdf2image import convert_from_path
 from tqdm import tqdm
 from PIL import Image
-# Create directories
-def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
     try:
         command = [
             "soffice",
@@ -21,19 +17,22 @@ def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
             str(temp_pdf_dir),
             str(docx_path)
         ]
         result = subprocess.run(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
-            timeout=60  # Add timeout
         )
         if result.returncode == 0 and os.path.exists(output_pdf_path):
             print(f"✅ Converted to PDF: {output_pdf_path}")
             return True
         else:
             print(f"❌ Error converting {docx_path}: {result.stderr}")
             return False
     except FileNotFoundError:
         print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
         return False
@@ -42,61 +41,52 @@ def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
         return False
 def pdf_to_images(pdf_path, output_base_path):
-    """Convert PDF to images with validation"""
     try:
-        # Convert all pages with higher DPI for better quality
         images = convert_from_path(
             pdf_path,
-            dpi=300,  # High DPI for math clarity
             fmt='png',
-            thread_count=4  # Parallel processing
         )
         if not images:
-            print(f"⚠️  No pages found in {pdf_path}")
             return 0
         saved_count = 0
         for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
             output_image_path = output_base_path.with_name(
                 f"{output_base_path.stem}_page{page_num}.png"
             )
-            # Validate image dimensions
             width, height = image.size
-            if width <= 0 or height <= 0:
-                print(f"⚠️  Skipping page {page_num}: Invalid dimensions ({width}x{height})")
-                continue
-            # Additional validation: check if image is blank
-            if width < 50 or height < 50:
-                print(f"⚠️  Skipping page {page_num}: Too small ({width}x{height})")
                 continue
-            # Save with optimization
             image.save(output_image_path, "PNG", optimize=True)
             saved_count += 1
         print(f"✅ Saved {saved_count}/{len(images)} pages")
         return saved_count
     except Exception as e:
         print(f"❌ Error processing {pdf_path}: {str(e)}")
         return 0
-# Process all .docx and .pdf files
-def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/",OUTPUT_PATH_OF_SCREENSHOTS = "images/"):
     total_processed = 0
     total_images = 0
-    INPUT_PATH_OF_DOCS = INPUT_PATH_OF_DOCS
-    TEMP_PDF_PATH = TEMP_PDF_PATH
-    OUTPUT_PATH_OF_SCREENSHOTS = OUTPUT_PATH_OF_SCREENSHOTS
     temp_pdf_dir = Path(TEMP_PDF_PATH)
     temp_pdf_dir.mkdir(parents=True, exist_ok=True)
     output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
     output_dir.mkdir(parents=True, exist_ok=True)
     for idx, paths in enumerate(os.listdir(INPUT_PATH_OF_DOCS), start=1):
         whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
@@ -108,7 +98,7 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
                 print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
                 temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
-                if docx_to_pdf(whole_path, temp_pdf_path,temp_pdf_dir):
                     print("📸 Converting to images...")
                     count = pdf_to_images(temp_pdf_path, output_base_path)
                     total_images += count
@@ -127,6 +117,7 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
     print(f"Total images saved: {total_images}")
     print(f"{'='*50}")
     print("\n🧹 Cleaning up temporary files...")
     for temp_pdf in temp_pdf_dir.glob("*.pdf"):
         try:
@@ -134,6 +125,4 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
             print(f"✅ Deleted: {temp_pdf.name}")
         except Exception as e:
             print(f"❌ Error deleting {temp_pdf}: {str(e)}")
-if __name__=="__main__":
-    get_images()

 from tqdm import tqdm
 from PIL import Image
+def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir):
+    """Convert DOCX to PDF using LibreOffice"""
     try:
         command = [
             "soffice",
             str(temp_pdf_dir),
             str(docx_path)
         ]
         result = subprocess.run(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
+            timeout=60
         )
         if result.returncode == 0 and os.path.exists(output_pdf_path):
             print(f"✅ Converted to PDF: {output_pdf_path}")
             return True
         else:
             print(f"❌ Error converting {docx_path}: {result.stderr}")
             return False
     except FileNotFoundError:
         print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
         return False
         return False
 def pdf_to_images(pdf_path, output_base_path):
+    """Convert PDF pages to high-quality PNG images"""
     try:
         images = convert_from_path(
             pdf_path,
+            dpi=300,  # High quality for math equations
             fmt='png',
+            thread_count=4
         )
         if not images:
+            print(f"⚠️ No pages found in {pdf_path}")
             return 0
         saved_count = 0
         for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
             output_image_path = output_base_path.with_name(
                 f"{output_base_path.stem}_page{page_num}.png"
             )
             width, height = image.size
+            if width <= 0 or height <= 0 or width < 50 or height < 50:
+                print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
                 continue
             image.save(output_image_path, "PNG", optimize=True)
             saved_count += 1
         print(f"✅ Saved {saved_count}/{len(images)} pages")
         return saved_count
     except Exception as e:
         print(f"❌ Error processing {pdf_path}: {str(e)}")
         return 0
+def get_images(INPUT_PATH_OF_DOCS="all_documents/",
+               TEMP_PDF_PATH="temp_pdfs/",
+               OUTPUT_PATH_OF_SCREENSHOTS="images/"):
+    """Main function to convert all documents to images"""
     total_processed = 0
     total_images = 0
     temp_pdf_dir = Path(TEMP_PDF_PATH)
     temp_pdf_dir.mkdir(parents=True, exist_ok=True)
     output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
     output_dir.mkdir(parents=True, exist_ok=True)
     for idx, paths in enumerate(os.listdir(INPUT_PATH_OF_DOCS), start=1):
         whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
                 print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
                 temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
+                if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
                     print("📸 Converting to images...")
                     count = pdf_to_images(temp_pdf_path, output_base_path)
                     total_images += count
     print(f"Total images saved: {total_images}")
     print(f"{'='*50}")
+    # Cleanup temp PDFs
     print("\n🧹 Cleaning up temporary files...")
     for temp_pdf in temp_pdf_dir.glob("*.pdf"):
         try:
             print(f"✅ Deleted: {temp_pdf.name}")
         except Exception as e:
             print(f"❌ Error deleting {temp_pdf}: {str(e)}")