# PDF-Parse / app.py
# Author: namelessai
# Update for Gradio v6 compatibility (commit b85d2c9, verified)
import gradio as gr
import fitz # PyMuPDF
import zipfile
import os
import json
import io
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from PIL import Image
class PDFAssetExtractor:
    """Extract text, images, font info, vector graphics and metadata from a
    PDF and bundle everything into a downloadable zip archive.

    Each ``extract_*`` step is best-effort: on failure it logs a warning and
    returns empty stats so the remaining steps still run.
    """

    def __init__(self):
        # Log lines accumulated for the current extraction run.
        self.logs = []

    def log(self, message, level="INFO"):
        """Append a timestamped entry to the log and return the full log text."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.logs.append(log_entry)
        return "\n".join(self.logs)

    def extract_assets(self, pdf_file, progress=gr.Progress()):
        """Run the full extraction pipeline.

        Args:
            pdf_file: Path to the PDF. With ``gr.File(type="filepath")``
                Gradio passes a plain string; older Gradio versions pass a
                tempfile wrapper exposing ``.name`` -- both are accepted.
            progress: Gradio progress tracker (injected by Gradio).

        Returns:
            Tuple of ``(zip_path_or_None, log_text, summary_markdown)``.
        """
        self.logs = []
        if pdf_file is None:
            return None, self.log("ERROR: No PDF file provided", "ERROR"), ""
        try:
            # BUGFIX: with type="filepath" Gradio hands the callback a str,
            # which has no ``.name`` attribute -- accept str and file-like.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            # Working directory for this run. Deliberately NOT removed here:
            # Gradio still has to serve the zip file from it after we return.
            temp_dir = tempfile.mkdtemp()
            assets_dir = os.path.join(temp_dir, "pdf_assets")
            os.makedirs(assets_dir, exist_ok=True)
            self.log(f"Starting extraction from: {os.path.basename(pdf_path)}")
            self.log(f"Temporary directory: {assets_dir}")
            doc = fitz.open(pdf_path)
            try:
                self.log(f"PDF opened successfully - {len(doc)} pages")
                progress(0.1, desc="Extracting metadata...")
                metadata = self.extract_metadata(doc, assets_dir)
                progress(0.2, desc="Extracting text...")
                text_stats = self.extract_text(doc, assets_dir)
                progress(0.4, desc="Extracting images...")
                image_stats = self.extract_images(doc, assets_dir)
                progress(0.6, desc="Extracting fonts...")
                font_stats = self.extract_fonts(doc, assets_dir)
                progress(0.8, desc="Extracting vector graphics...")
                vector_stats = self.extract_vectors(doc, assets_dir)
                summary = self.create_summary(
                    metadata, text_stats, image_stats, font_stats, vector_stats
                )
                progress(0.9, desc="Creating zip archive...")
                zip_path = self.create_zip(assets_dir, temp_dir)
            finally:
                # BUGFIX: close the document even when a step raises.
                doc.close()
            progress(1.0, desc="Complete!")
            self.log("✓ Extraction completed successfully!")
            return zip_path, "\n".join(self.logs), summary
        except Exception as e:
            self.log(f"ERROR: {str(e)}", "ERROR")
            return None, "\n".join(self.logs), f"Error: {str(e)}"

    def extract_metadata(self, doc, assets_dir):
        """Write the PDF's metadata dict to ``metadata.json`` and return it."""
        try:
            metadata = doc.metadata
            self.log("Extracting metadata...")
            metadata_path = os.path.join(assets_dir, "metadata.json")
            with open(metadata_path, 'w', encoding='utf-8') as f:
                # default=str guards against non-serializable metadata values
                json.dump(metadata, f, indent=2, default=str)
            self.log("✓ Metadata saved to metadata.json")
            return metadata
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract metadata - {str(e)}", "WARNING")
            return {}

    def extract_text(self, doc, assets_dir):
        """Extract the text of every page into ``text/full_text.txt``.

        Returns:
            Dict with ``pages`` and ``characters`` counts.
        """
        try:
            text_dir = os.path.join(assets_dir, "text")
            os.makedirs(text_dir, exist_ok=True)
            total_chars = 0
            full_text = []
            for page_num, page in enumerate(doc, 1):
                text = page.get_text()
                full_text.append(f"--- Page {page_num} ---\n{text}\n")
                total_chars += len(text)
            text_path = os.path.join(text_dir, "full_text.txt")
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write("\n".join(full_text))
            self.log(f"✓ Extracted {total_chars:,} characters of text across {len(doc)} pages")
            return {"pages": len(doc), "characters": total_chars}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract text - {str(e)}", "WARNING")
            return {"pages": 0, "characters": 0}

    def extract_images(self, doc, assets_dir):
        """Save every embedded image to ``images/`` as ``pageN_imgM.<ext>``.

        Returns:
            Dict with the number of images written under ``count``.
        """
        try:
            images_dir = os.path.join(assets_dir, "images")
            os.makedirs(images_dir, exist_ok=True)
            image_count = 0
            for page_num, page in enumerate(doc, 1):
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    xref = img[0]  # first tuple entry is the image xref
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        image_filename = f"page{page_num}_img{img_index + 1}.{image_ext}"
                        image_path = os.path.join(images_dir, image_filename)
                        with open(image_path, "wb") as img_file:
                            img_file.write(image_bytes)
                        image_count += 1
                    except Exception as e:
                        # BUGFIX: the original dropped the caught error;
                        # include it so failures are diagnosable.
                        self.log(
                            f"⚠ Warning: Could not extract image {img_index + 1} "
                            f"from page {page_num} - {str(e)}",
                            "WARNING",
                        )
            self.log(f"✓ Extracted {image_count} images")
            return {"count": image_count}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract images - {str(e)}", "WARNING")
            return {"count": 0}

    def extract_fonts(self, doc, assets_dir):
        """Collect per-page font references into ``fonts/fonts_info.json``.

        Returns:
            Dict with ``total`` references and ``unique`` font names.
        """
        try:
            fonts_dir = os.path.join(assets_dir, "fonts")
            os.makedirs(fonts_dir, exist_ok=True)
            fonts_info = []
            for page_num, page in enumerate(doc, 1):
                # get_fonts(full=True) yields tuples of
                # (xref, ext, type, basefont, name, encoding, referencer).
                for font in page.get_fonts(full=True):
                    fonts_info.append({
                        "page": page_num,
                        "xref": font[0],
                        "name": font[3],  # basefont name
                        # BUGFIX: the original stored the file *ext* under
                        # "type" and the font *type* under "encoding"; use
                        # the correct tuple positions instead.
                        "type": font[2],
                        "encoding": font[5],
                    })
            unique_fonts = len({info["name"] for info in fonts_info})
            if fonts_info:
                fonts_path = os.path.join(fonts_dir, "fonts_info.json")
                with open(fonts_path, 'w', encoding='utf-8') as f:
                    json.dump(fonts_info, f, indent=2)
                self.log(f"✓ Found {len(fonts_info)} font references ({unique_fonts} unique fonts)")
            else:
                self.log("ℹ No fonts found")
            return {"total": len(fonts_info), "unique": unique_fonts}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract fonts - {str(e)}", "WARNING")
            return {"total": 0, "unique": 0}

    def extract_vectors(self, doc, assets_dir):
        """Export each page as SVG into ``vectors/`` (captures vector art).

        Returns:
            Dict with the number of SVG pages written under ``count``.
        """
        try:
            vectors_dir = os.path.join(assets_dir, "vectors")
            os.makedirs(vectors_dir, exist_ok=True)
            vector_count = 0
            for page_num, page in enumerate(doc, 1):
                try:
                    svg_content = page.get_svg_image()
                    svg_path = os.path.join(vectors_dir, f"page{page_num}.svg")
                    with open(svg_path, 'w', encoding='utf-8') as f:
                        f.write(svg_content)
                    vector_count += 1
                except Exception as e:
                    # BUGFIX: include the actual error in the warning.
                    self.log(
                        f"⚠ Could not extract vectors from page {page_num} - {str(e)}",
                        "WARNING",
                    )
            if vector_count > 0:
                self.log(f"✓ Exported {vector_count} pages as SVG (vector format)")
            else:
                self.log("ℹ No vector graphics extracted")
            return {"count": vector_count}
        except Exception as e:
            self.log(f"⚠ Warning: Could not extract vectors - {str(e)}", "WARNING")
            return {"count": 0}

    def create_summary(self, metadata, text_stats, image_stats, font_stats, vector_stats):
        """Render a Markdown summary of everything that was extracted."""
        summary = f"""
# PDF Asset Extraction Summary
## Document Information
- **Title**: {metadata.get('title', 'N/A')}
- **Author**: {metadata.get('author', 'N/A')}
- **Subject**: {metadata.get('subject', 'N/A')}
- **Creator**: {metadata.get('creator', 'N/A')}
- **Producer**: {metadata.get('producer', 'N/A')}
- **Creation Date**: {metadata.get('creationDate', 'N/A')}
## Extracted Assets
### Text
- **Pages**: {text_stats.get('pages', 0)}
- **Total Characters**: {text_stats.get('characters', 0):,}
### Images
- **Total Images**: {image_stats.get('count', 0)}
### Fonts
- **Total Font References**: {font_stats.get('total', 0)}
- **Unique Fonts**: {font_stats.get('unique', 0)}
### Vector Graphics
- **SVG Pages Exported**: {vector_stats.get('count', 0)}
## Output Structure
```
pdf_assets/
├── metadata.json # PDF metadata
├── text/
│ └── full_text.txt # Extracted text
├── images/ # Extracted images
├── fonts/
│ └── fonts_info.json # Font information
└── vectors/ # SVG exports
```
"""
        return summary

    def create_zip(self, assets_dir, temp_dir):
        """Zip the whole ``assets_dir`` tree; return the zip path or None.

        Archive entries are stored relative to *temp_dir* so the zip
        unpacks into a single ``pdf_assets/`` folder.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"pdf_assets_{timestamp}.zip"
            zip_path = os.path.join(temp_dir, zip_filename)
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, files in os.walk(assets_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, temp_dir)
                        zipf.write(file_path, arcname)
            zip_size = os.path.getsize(zip_path) / (1024 * 1024)  # size in MB
            self.log(f"✓ Created zip archive: {zip_filename} ({zip_size:.2f} MB)")
            return zip_path
        except Exception as e:
            self.log(f"ERROR: Could not create zip file - {str(e)}", "ERROR")
            return None
# Module-level extractor shared by every request handled by the UI.
extractor = PDFAssetExtractor()

# Assemble the Gradio interface: uploader + button on the left,
# tabbed results and the zip download on the right.
with gr.Blocks(title="PDF Asset Extractor") as demo:
    gr.Markdown("""
# πŸ“„ PDF Asset Extractor
Extract fonts, icons, vectors, images, text, and metadata from PDF files.
All assets are packaged into a downloadable zip file.
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Input side: PDF upload, trigger button, short feature list.
            pdf_file_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath",
            )
            run_btn = gr.Button("πŸš€ Extract Assets", variant="primary", size="lg")
            gr.Markdown("""
### What gets extracted:
- πŸ“ **Text**: Full text content from all pages
- πŸ–ΌοΈ **Images**: All embedded images
- πŸ”€ **Fonts**: Font information and references
- 🎨 **Vectors**: SVG exports of each page
- πŸ“‹ **Metadata**: Document properties and info
""")
        with gr.Column(scale=2):
            # Output side: summary/log tabs plus the archive download.
            with gr.Tabs():
                with gr.Tab("πŸ“Š Summary"):
                    summary_view = gr.Markdown()
                with gr.Tab("πŸ“œ Extraction Logs"):
                    log_view = gr.Textbox(
                        label="Live Logs",
                        lines=20,
                        max_lines=30,
                        interactive=False,
                    )
            download_view = gr.File(label="πŸ“¦ Download Assets (ZIP)")

    # Wire the button to the extraction pipeline.
    run_btn.click(
        fn=extractor.extract_assets,
        inputs=[pdf_file_input],
        outputs=[download_view, log_view, summary_view],
    )

    gr.Markdown("""
---
### πŸ’‘ Tips:
- Larger PDFs may take longer to process
- All extracted assets are organized in folders
- The zip file includes a complete directory structure
- Check the logs tab for detailed extraction information
""")

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()