# Hugging Face Spaces page chrome ("Spaces: Sleeping") — scrape residue, not app code.
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import zipfile | |
| import os | |
| import json | |
| import io | |
| from pathlib import Path | |
| from datetime import datetime | |
| import tempfile | |
| import shutil | |
| from PIL import Image | |
class PDFAssetExtractor:
    """Extracts text, images, fonts, vector graphics, and metadata from PDFs."""

    def __init__(self):
        # Accumulated log lines for the current extraction run.
        self.logs = []

    def log(self, message, level="INFO"):
        """Append a timestamped log entry and return the full log text."""
        stamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"[{stamp}] [{level}] {message}")
        return "\n".join(self.logs)
| def extract_assets(self, pdf_file, progress=gr.Progress()): | |
| """Main extraction function""" | |
| self.logs = [] | |
| if pdf_file is None: | |
| return None, self.log("ERROR: No PDF file provided", "ERROR"), "" | |
| try: | |
| # Create temporary directory for extraction | |
| temp_dir = tempfile.mkdtemp() | |
| assets_dir = os.path.join(temp_dir, "pdf_assets") | |
| os.makedirs(assets_dir, exist_ok=True) | |
| self.log(f"Starting extraction from: {os.path.basename(pdf_file.name)}") | |
| self.log(f"Temporary directory: {assets_dir}") | |
| # Open PDF | |
| doc = fitz.open(pdf_file.name) | |
| self.log(f"PDF opened successfully - {len(doc)} pages") | |
| # Extract metadata | |
| progress(0.1, desc="Extracting metadata...") | |
| metadata = self.extract_metadata(doc, assets_dir) | |
| # Extract text | |
| progress(0.2, desc="Extracting text...") | |
| text_stats = self.extract_text(doc, assets_dir) | |
| # Extract images | |
| progress(0.4, desc="Extracting images...") | |
| image_stats = self.extract_images(doc, assets_dir) | |
| # Extract fonts | |
| progress(0.6, desc="Extracting fonts...") | |
| font_stats = self.extract_fonts(doc, assets_dir) | |
| # Extract vector graphics | |
| progress(0.8, desc="Extracting vector graphics...") | |
| vector_stats = self.extract_vectors(doc, assets_dir) | |
| # Create summary | |
| summary = self.create_summary(metadata, text_stats, image_stats, font_stats, vector_stats) | |
| # Create zip file | |
| progress(0.9, desc="Creating zip archive...") | |
| zip_path = self.create_zip(assets_dir, temp_dir) | |
| doc.close() | |
| progress(1.0, desc="Complete!") | |
| self.log("β Extraction completed successfully!") | |
| return zip_path, "\n".join(self.logs), summary | |
| except Exception as e: | |
| self.log(f"ERROR: {str(e)}", "ERROR") | |
| return None, "\n".join(self.logs), f"Error: {str(e)}" | |
| def extract_metadata(self, doc, assets_dir): | |
| """Extract PDF metadata""" | |
| try: | |
| metadata = doc.metadata | |
| self.log(f"Extracting metadata...") | |
| # Save metadata as JSON | |
| metadata_path = os.path.join(assets_dir, "metadata.json") | |
| with open(metadata_path, 'w', encoding='utf-8') as f: | |
| json.dump(metadata, f, indent=2, default=str) | |
| self.log(f"β Metadata saved to metadata.json") | |
| return metadata | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract metadata - {str(e)}", "WARNING") | |
| return {} | |
| def extract_text(self, doc, assets_dir): | |
| """Extract text from PDF""" | |
| try: | |
| text_dir = os.path.join(assets_dir, "text") | |
| os.makedirs(text_dir, exist_ok=True) | |
| total_chars = 0 | |
| # Extract text from each page | |
| full_text = [] | |
| for page_num, page in enumerate(doc, 1): | |
| text = page.get_text() | |
| full_text.append(f"--- Page {page_num} ---\n{text}\n") | |
| total_chars += len(text) | |
| # Save full text | |
| text_path = os.path.join(text_dir, "full_text.txt") | |
| with open(text_path, 'w', encoding='utf-8') as f: | |
| f.write("\n".join(full_text)) | |
| self.log(f"β Extracted {total_chars:,} characters of text across {len(doc)} pages") | |
| return {"pages": len(doc), "characters": total_chars} | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract text - {str(e)}", "WARNING") | |
| return {"pages": 0, "characters": 0} | |
| def extract_images(self, doc, assets_dir): | |
| """Extract images from PDF""" | |
| try: | |
| images_dir = os.path.join(assets_dir, "images") | |
| os.makedirs(images_dir, exist_ok=True) | |
| image_count = 0 | |
| for page_num, page in enumerate(doc, 1): | |
| image_list = page.get_images(full=True) | |
| for img_index, img in enumerate(image_list): | |
| xref = img[0] | |
| try: | |
| base_image = doc.extract_image(xref) | |
| image_bytes = base_image["image"] | |
| image_ext = base_image["ext"] | |
| # Save image | |
| image_filename = f"page{page_num}_img{img_index + 1}.{image_ext}" | |
| image_path = os.path.join(images_dir, image_filename) | |
| with open(image_path, "wb") as img_file: | |
| img_file.write(image_bytes) | |
| image_count += 1 | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract image {img_index + 1} from page {page_num}", "WARNING") | |
| self.log(f"β Extracted {image_count} images") | |
| return {"count": image_count} | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract images - {str(e)}", "WARNING") | |
| return {"count": 0} | |
| def extract_fonts(self, doc, assets_dir): | |
| """Extract font information from PDF""" | |
| try: | |
| fonts_dir = os.path.join(assets_dir, "fonts") | |
| os.makedirs(fonts_dir, exist_ok=True) | |
| fonts_info = [] | |
| for page_num, page in enumerate(doc, 1): | |
| fonts = page.get_fonts(full=True) | |
| for font in fonts: | |
| font_info = { | |
| "page": page_num, | |
| "xref": font[0], | |
| "name": font[3], | |
| "type": font[1], | |
| "encoding": font[2] | |
| } | |
| fonts_info.append(font_info) | |
| # Save fonts information | |
| if fonts_info: | |
| fonts_path = os.path.join(fonts_dir, "fonts_info.json") | |
| with open(fonts_path, 'w', encoding='utf-8') as f: | |
| json.dump(fonts_info, f, indent=2) | |
| unique_fonts = len(set(f["name"] for f in fonts_info)) | |
| self.log(f"β Found {len(fonts_info)} font references ({unique_fonts} unique fonts)") | |
| else: | |
| self.log(f"βΉ No fonts found") | |
| return {"total": len(fonts_info), "unique": len(set(f["name"] for f in fonts_info)) if fonts_info else 0} | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract fonts - {str(e)}", "WARNING") | |
| return {"total": 0, "unique": 0} | |
| def extract_vectors(self, doc, assets_dir): | |
| """Extract vector graphics information""" | |
| try: | |
| vectors_dir = os.path.join(assets_dir, "vectors") | |
| os.makedirs(vectors_dir, exist_ok=True) | |
| vector_count = 0 | |
| # Export each page as SVG (contains vector information) | |
| for page_num, page in enumerate(doc, 1): | |
| try: | |
| svg_content = page.get_svg_image() | |
| svg_path = os.path.join(vectors_dir, f"page{page_num}.svg") | |
| with open(svg_path, 'w', encoding='utf-8') as f: | |
| f.write(svg_content) | |
| vector_count += 1 | |
| except Exception as e: | |
| self.log(f"β Could not extract vectors from page {page_num}", "WARNING") | |
| if vector_count > 0: | |
| self.log(f"β Exported {vector_count} pages as SVG (vector format)") | |
| else: | |
| self.log(f"βΉ No vector graphics extracted") | |
| return {"count": vector_count} | |
| except Exception as e: | |
| self.log(f"β Warning: Could not extract vectors - {str(e)}", "WARNING") | |
| return {"count": 0} | |
| def create_summary(self, metadata, text_stats, image_stats, font_stats, vector_stats): | |
| """Create extraction summary""" | |
| summary = f""" | |
| # PDF Asset Extraction Summary | |
| ## Document Information | |
| - **Title**: {metadata.get('title', 'N/A')} | |
| - **Author**: {metadata.get('author', 'N/A')} | |
| - **Subject**: {metadata.get('subject', 'N/A')} | |
| - **Creator**: {metadata.get('creator', 'N/A')} | |
| - **Producer**: {metadata.get('producer', 'N/A')} | |
| - **Creation Date**: {metadata.get('creationDate', 'N/A')} | |
| ## Extracted Assets | |
| ### Text | |
| - **Pages**: {text_stats.get('pages', 0)} | |
| - **Total Characters**: {text_stats.get('characters', 0):,} | |
| ### Images | |
| - **Total Images**: {image_stats.get('count', 0)} | |
| ### Fonts | |
| - **Total Font References**: {font_stats.get('total', 0)} | |
| - **Unique Fonts**: {font_stats.get('unique', 0)} | |
| ### Vector Graphics | |
| - **SVG Pages Exported**: {vector_stats.get('count', 0)} | |
| ## Output Structure | |
| ``` | |
| pdf_assets/ | |
| βββ metadata.json # PDF metadata | |
| βββ text/ | |
| β βββ full_text.txt # Extracted text | |
| βββ images/ # Extracted images | |
| βββ fonts/ | |
| β βββ fonts_info.json # Font information | |
| βββ vectors/ # SVG exports | |
| ``` | |
| """ | |
| return summary | |
| def create_zip(self, assets_dir, temp_dir): | |
| """Create zip file of all extracted assets""" | |
| try: | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| zip_filename = f"pdf_assets_{timestamp}.zip" | |
| zip_path = os.path.join(temp_dir, zip_filename) | |
| with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf: | |
| for root, dirs, files in os.walk(assets_dir): | |
| for file in files: | |
| file_path = os.path.join(root, file) | |
| arcname = os.path.relpath(file_path, temp_dir) | |
| zipf.write(file_path, arcname) | |
| zip_size = os.path.getsize(zip_path) / (1024 * 1024) # MB | |
| self.log(f"β Created zip archive: {zip_filename} ({zip_size:.2f} MB)") | |
| return zip_path | |
| except Exception as e: | |
| self.log(f"ERROR: Could not create zip file - {str(e)}", "ERROR") | |
| return None | |
# Initialize extractor — a single shared instance; its log buffer is reset
# at the start of each extract_assets() call.
extractor = PDFAssetExtractor()

# Create Gradio interface.
# NOTE(review): many emoji in the UI strings below ("π", "β", ...) appear
# mojibake-garbled (broken UTF-8 round-trip) — confirm the intended
# characters against the original source before changing them.
with gr.Blocks(title="PDF Asset Extractor") as demo:
    gr.Markdown("""
# π PDF Asset Extractor
Extract fonts, icons, vectors, images, text, and metadata from PDF files.
All assets are packaged into a downloadable zip file.
""")
    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): type="filepath" delivers a plain str path to the
            # callback, but extract_assets reads pdf_file.name — verify this
            # works on the deployed Gradio version.
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )
            extract_btn = gr.Button("π Extract Assets", variant="primary", size="lg")
            gr.Markdown("""
### What gets extracted:
- π **Text**: Full text content from all pages
- πΌοΈ **Images**: All embedded images
- π€ **Fonts**: Font information and references
- π¨ **Vectors**: SVG exports of each page
- π **Metadata**: Document properties and info
""")
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("π Summary"):
                    summary_output = gr.Markdown()
                with gr.Tab("π Extraction Logs"):
                    logs_output = gr.Textbox(
                        label="Live Logs",
                        lines=20,
                        max_lines=30,
                        interactive=False
                    )
            download_output = gr.File(label="π¦ Download Assets (ZIP)")

    # Connect the extraction function: one file input, three outputs
    # (zip path, log text, summary markdown) matching extract_assets' return.
    extract_btn.click(
        fn=extractor.extract_assets,
        inputs=[pdf_input],
        outputs=[download_output, logs_output, summary_output]
    )

    gr.Markdown("""
---
### π‘ Tips:
- Larger PDFs may take longer to process
- All extracted assets are organized in folders
- The zip file includes a complete directory structure
- Check the logs tab for detailed extraction information
""")

# Launch the app only when executed directly (not when imported).
if __name__ == "__main__":
    demo.launch()