# PDF-Parse / app.py
# Author: namelessai
# Update for Gradio v6 compatibility (commit b85d2c9, verified)
import gradio as gr
import fitz # PyMuPDF
import zipfile
import os
import json
import io
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from PIL import Image
class PDFAssetExtractor:
    """Extract text, images, font info, vector graphics and metadata from a
    PDF and bundle everything into a downloadable zip archive.

    Each ``extract_*`` step is best-effort: on failure it logs a warning and
    returns empty stats so the remaining steps still run.
    """

    def __init__(self):
        # Log lines accumulated for the current extraction run.
        self.logs = []

    def log(self, message, level="INFO"):
        """Append a timestamped entry to the log and return the full log text."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.logs.append(log_entry)
        return "\n".join(self.logs)

    def extract_assets(self, pdf_file, progress=gr.Progress()):
        """Run the full extraction pipeline.

        Args:
            pdf_file: Path to the PDF. With ``gr.File(type="filepath")``
                Gradio passes a plain string; older Gradio versions pass a
                tempfile wrapper exposing ``.name`` -- both are accepted.
            progress: Gradio progress tracker (injected by Gradio).

        Returns:
            Tuple of ``(zip_path_or_None, log_text, summary_markdown)``.
        """
        self.logs = []
        if pdf_file is None:
            return None, self.log("ERROR: No PDF file provided", "ERROR"), ""
        try:
            # BUGFIX: with type="filepath" Gradio hands the callback a str,
            # which has no ``.name`` attribute -- accept str and file-like.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            # Working directory for this run. Deliberately NOT removed here:
            # Gradio still has to serve the zip file from it after we return.
            temp_dir = tempfile.mkdtemp()
            assets_dir = os.path.join(temp_dir, "pdf_assets")
            os.makedirs(assets_dir, exist_ok=True)
            self.log(f"Starting extraction from: {os.path.basename(pdf_path)}")
            self.log(f"Temporary directory: {assets_dir}")
            doc = fitz.open(pdf_path)
            try:
                self.log(f"PDF opened successfully - {len(doc)} pages")
                progress(0.1, desc="Extracting metadata...")
                metadata = self.extract_metadata(doc, assets_dir)
                progress(0.2, desc="Extracting text...")
                text_stats = self.extract_text(doc, assets_dir)
                progress(0.4, desc="Extracting images...")
                image_stats = self.extract_images(doc, assets_dir)
                progress(0.6, desc="Extracting fonts...")
                font_stats = self.extract_fonts(doc, assets_dir)
                progress(0.8, desc="Extracting vector graphics...")
                vector_stats = self.extract_vectors(doc, assets_dir)
                summary = self.create_summary(
                    metadata, text_stats, image_stats, font_stats, vector_stats
                )
                progress(0.9, desc="Creating zip archive...")
                zip_path = self.create_zip(assets_dir, temp_dir)
            finally:
                # BUGFIX: close the document even when a step raises.
                doc.close()
            progress(1.0, desc="Complete!")
            self.log("✓ Extraction completed successfully!")
            return zip_path, "\n".join(self.logs), summary
        except Exception as e:
            self.log(f"ERROR: {str(e)}", "ERROR")
            return None, "\n".join(self.logs), f"Error: {str(e)}"

    def extract_metadata(self, doc, assets_dir):
        """Write the PDF's metadata dict to ``metadata.json`` and return it."""
        try:
            metadata = doc.metadata
            self.log("Extracting metadata...")
            metadata_path = os.path.join(assets_dir, "metadata.json")
            with open(metadata_path, 'w', encoding='utf-8') as f:
                # default=str guards against non-serializable metadata values
                json.dump(metadata, f, indent=2, default=str)
            self.log("✓ Metadata saved to metadata.json")
            return metadata
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract metadata - {str(e)}", "WARNING")
            return {}

    def extract_text(self, doc, assets_dir):
        """Extract the text of every page into ``text/full_text.txt``.

        Returns:
            Dict with ``pages`` and ``characters`` counts.
        """
        try:
            text_dir = os.path.join(assets_dir, "text")
            os.makedirs(text_dir, exist_ok=True)
            total_chars = 0
            full_text = []
            for page_num, page in enumerate(doc, 1):
                text = page.get_text()
                full_text.append(f"--- Page {page_num} ---\n{text}\n")
                total_chars += len(text)
            text_path = os.path.join(text_dir, "full_text.txt")
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write("\n".join(full_text))
            self.log(f"✓ Extracted {total_chars:,} characters of text across {len(doc)} pages")
            return {"pages": len(doc), "characters": total_chars}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract text - {str(e)}", "WARNING")
            return {"pages": 0, "characters": 0}

    def extract_images(self, doc, assets_dir):
        """Save every embedded image to ``images/`` as ``pageN_imgM.<ext>``.

        Returns:
            Dict with the number of images written under ``count``.
        """
        try:
            images_dir = os.path.join(assets_dir, "images")
            os.makedirs(images_dir, exist_ok=True)
            image_count = 0
            for page_num, page in enumerate(doc, 1):
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    xref = img[0]  # first tuple entry is the image xref
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        image_filename = f"page{page_num}_img{img_index + 1}.{image_ext}"
                        image_path = os.path.join(images_dir, image_filename)
                        with open(image_path, "wb") as img_file:
                            img_file.write(image_bytes)
                        image_count += 1
                    except Exception as e:
                        # BUGFIX: the original dropped the caught error;
                        # include it so failures are diagnosable.
                        self.log(
                            f"⚠ Warning: Could not extract image {img_index + 1} "
                            f"from page {page_num} - {str(e)}",
                            "WARNING",
                        )
            self.log(f"✓ Extracted {image_count} images")
            return {"count": image_count}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract images - {str(e)}", "WARNING")
            return {"count": 0}

    def extract_fonts(self, doc, assets_dir):
        """Collect per-page font references into ``fonts/fonts_info.json``.

        Returns:
            Dict with ``total`` references and ``unique`` font names.
        """
        try:
            fonts_dir = os.path.join(assets_dir, "fonts")
            os.makedirs(fonts_dir, exist_ok=True)
            fonts_info = []
            for page_num, page in enumerate(doc, 1):
                # get_fonts(full=True) yields tuples of
                # (xref, ext, type, basefont, name, encoding, referencer).
                for font in page.get_fonts(full=True):
                    fonts_info.append({
                        "page": page_num,
                        "xref": font[0],
                        "name": font[3],  # basefont name
                        # BUGFIX: the original stored the file *ext* under
                        # "type" and the font *type* under "encoding"; use
                        # the correct tuple positions instead.
                        "type": font[2],
                        "encoding": font[5],
                    })
            unique_fonts = len({info["name"] for info in fonts_info})
            if fonts_info:
                fonts_path = os.path.join(fonts_dir, "fonts_info.json")
                with open(fonts_path, 'w', encoding='utf-8') as f:
                    json.dump(fonts_info, f, indent=2)
                self.log(f"✓ Found {len(fonts_info)} font references ({unique_fonts} unique fonts)")
            else:
                self.log("ℹ No fonts found")
            return {"total": len(fonts_info), "unique": unique_fonts}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract fonts - {str(e)}", "WARNING")
            return {"total": 0, "unique": 0}

    def extract_vectors(self, doc, assets_dir):
        """Export each page as SVG into ``vectors/`` (captures vector art).

        Returns:
            Dict with the number of SVG pages written under ``count``.
        """
        try:
            vectors_dir = os.path.join(assets_dir, "vectors")
            os.makedirs(vectors_dir, exist_ok=True)
            vector_count = 0
            for page_num, page in enumerate(doc, 1):
                try:
                    svg_content = page.get_svg_image()
                    svg_path = os.path.join(vectors_dir, f"page{page_num}.svg")
                    with open(svg_path, 'w', encoding='utf-8') as f:
                        f.write(svg_content)
                    vector_count += 1
                except Exception as e:
                    # BUGFIX: include the actual error in the warning.
                    self.log(
                        f"⚠ Could not extract vectors from page {page_num} - {str(e)}",
                        "WARNING",
                    )
            if vector_count > 0:
                self.log(f"✓ Exported {vector_count} pages as SVG (vector format)")
            else:
                self.log("ℹ No vector graphics extracted")
            return {"count": vector_count}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract vectors - {str(e)}", "WARNING")
            return {"count": 0}

    def create_summary(self, metadata, text_stats, image_stats, font_stats, vector_stats):
        """Render a Markdown summary of everything that was extracted."""
        summary = f"""
# PDF Asset Extraction Summary
## Document Information
- **Title**: {metadata.get('title', 'N/A')}
- **Author**: {metadata.get('author', 'N/A')}
- **Subject**: {metadata.get('subject', 'N/A')}
- **Creator**: {metadata.get('creator', 'N/A')}
- **Producer**: {metadata.get('producer', 'N/A')}
- **Creation Date**: {metadata.get('creationDate', 'N/A')}
## Extracted Assets
### Text
- **Pages**: {text_stats.get('pages', 0)}
- **Total Characters**: {text_stats.get('characters', 0):,}
### Images
- **Total Images**: {image_stats.get('count', 0)}
### Fonts
- **Total Font References**: {font_stats.get('total', 0)}
- **Unique Fonts**: {font_stats.get('unique', 0)}
### Vector Graphics
- **SVG Pages Exported**: {vector_stats.get('count', 0)}
## Output Structure
```
pdf_assets/
├── metadata.json # PDF metadata
├── text/
│ └── full_text.txt # Extracted text
├── images/ # Extracted images
├── fonts/
│ └── fonts_info.json # Font information
└── vectors/ # SVG exports
```
"""
        return summary

    def create_zip(self, assets_dir, temp_dir):
        """Zip the whole ``assets_dir`` tree; return the zip path or None.

        Archive entries are stored relative to *temp_dir* so the zip
        unpacks into a single ``pdf_assets/`` folder.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"pdf_assets_{timestamp}.zip"
            zip_path = os.path.join(temp_dir, zip_filename)
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, files in os.walk(assets_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, temp_dir)
                        zipf.write(file_path, arcname)
            zip_size = os.path.getsize(zip_path) / (1024 * 1024)  # size in MB
            self.log(f"✓ Created zip archive: {zip_filename} ({zip_size:.2f} MB)")
            return zip_path
        except Exception as e:
            self.log(f"ERROR: Could not create zip file - {str(e)}", "ERROR")
            return None
# Module-level extractor shared by every request handled by the UI.
extractor = PDFAssetExtractor()

# Assemble the Gradio interface: uploader + button on the left,
# tabbed results and the zip download on the right.
with gr.Blocks(title="PDF Asset Extractor") as demo:
    gr.Markdown("""
# πŸ“„ PDF Asset Extractor
Extract fonts, icons, vectors, images, text, and metadata from PDF files.
All assets are packaged into a downloadable zip file.
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Input side: PDF upload, trigger button, short feature list.
            pdf_file_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath",
            )
            run_btn = gr.Button("πŸš€ Extract Assets", variant="primary", size="lg")
            gr.Markdown("""
### What gets extracted:
- πŸ“ **Text**: Full text content from all pages
- πŸ–ΌοΈ **Images**: All embedded images
- πŸ”€ **Fonts**: Font information and references
- 🎨 **Vectors**: SVG exports of each page
- πŸ“‹ **Metadata**: Document properties and info
""")
        with gr.Column(scale=2):
            # Output side: summary/log tabs plus the archive download.
            with gr.Tabs():
                with gr.Tab("πŸ“Š Summary"):
                    summary_view = gr.Markdown()
                with gr.Tab("πŸ“œ Extraction Logs"):
                    log_view = gr.Textbox(
                        label="Live Logs",
                        lines=20,
                        max_lines=30,
                        interactive=False,
                    )
            download_view = gr.File(label="πŸ“¦ Download Assets (ZIP)")

    # Wire the button to the extraction pipeline.
    run_btn.click(
        fn=extractor.extract_assets,
        inputs=[pdf_file_input],
        outputs=[download_view, log_view, summary_view],
    )

    gr.Markdown("""
---
### πŸ’‘ Tips:
- Larger PDFs may take longer to process
- All extracted assets are organized in folders
- The zip file includes a complete directory structure
- Check the logs tab for detailed extraction information
""")

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()