Spaces:
Running
Running
File size: 3,313 Bytes
5e95e09 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | import fitz # PyMuPDF
from typing import List, Dict, Any
import logging
# Module-level logger named after this module; the PDF helpers in this file
# report their failures through it.
logger = logging.getLogger(__name__)
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the file cannot be opened or read (the error is logged).
    """
    try:
        # The context manager closes the document even when extraction
        # fails part-way through the pages.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}")
        return ""
def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Return the metadata dictionary stored in a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The document's ``metadata`` mapping, or an empty dict if the file
        cannot be opened or read (the error is logged).
    """
    try:
        pdf = fitz.open(pdf_path)
        info = pdf.metadata
        pdf.close()
    except Exception as e:
        logger.error(f"Error getting PDF metadata: {e}")
        return {}
    return info
def count_pdf_pages(pdf_path: str) -> int:
    """Return how many pages a PDF contains.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The page count, or 0 if the file cannot be opened or read
        (the error is logged).
    """
    try:
        pdf = fitz.open(pdf_path)
        total = len(pdf)
        pdf.close()
    except Exception as e:
        logger.error(f"Error counting PDF pages: {e}")
        return 0
    return total
def split_pdf(pdf_path: str, output_dir: str, pages_per_file: int = 1) -> List[str]:
    """Split a PDF into consecutive chunks, one output file per chunk.

    Output files are named ``split_1.pdf``, ``split_2.pdf``, ... inside
    ``output_dir``; the last chunk may hold fewer pages than the others.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory the chunk files are written to.
        pages_per_file: Maximum number of pages per output file; must be
            at least 1.

    Returns:
        The paths of the files written, in page order, or an empty list
        if anything fails (the error is logged).
    """
    # Imported here because the module's import block does not provide
    # `os`; without it os.path.join raised NameError on every call.
    import os

    try:
        if pages_per_file < 1:
            raise ValueError("pages_per_file must be at least 1")

        output_files = []
        # Context managers close both the source and each chunk document
        # even when inserting or saving fails.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for start in range(0, total_pages, pages_per_file):
                # to_page is inclusive, so clamp to the last page index.
                end = min(start + pages_per_file, total_pages) - 1
                output_path = os.path.join(
                    output_dir, f"split_{start // pages_per_file + 1}.pdf"
                )
                with fitz.open() as chunk:
                    chunk.insert_pdf(doc, from_page=start, to_page=end)
                    chunk.save(output_path)
                output_files.append(output_path)
        return output_files
    except Exception as e:
        logger.error(f"Error splitting PDF: {e}")
        return []
def merge_pdfs(pdf_paths: List[str], output_path: str) -> bool:
    """Merge several PDF files into one, in the order given.

    Args:
        pdf_paths: Paths of the PDF files to append, first to last.
        output_path: Path the merged PDF is written to.

    Returns:
        True if the merged file was written, False if anything fails
        (the error is logged).
    """
    try:
        with fitz.open() as merged:
            for pdf_path in pdf_paths:
                # Close each source as soon as its pages are copied so
                # file handles are not held for the whole merge.
                with fitz.open(pdf_path) as source:
                    merged.insert_pdf(source)
            merged.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error merging PDFs: {e}")
        return False
def rotate_pdf_pages(pdf_path: str, output_path: str, rotation: int = 90) -> bool:
    """Set the rotation of every page in a PDF and save the result.

    The value is passed to ``Page.set_rotation``, so it is applied as the
    page's rotation setting rather than added to any existing rotation.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path the rotated PDF is written to.
        rotation: Rotation in degrees handed to ``Page.set_rotation``
            (PyMuPDF expects a multiple of 90).

    Returns:
        True if the rotated file was written, False if anything fails
        (the error is logged).
    """
    try:
        # The context manager closes the document even when saving fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page.set_rotation(rotation)
            doc.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error rotating PDF: {e}")
        return False
def compress_pdf(pdf_path: str, output_path: str, quality: int = 80) -> bool:
    """Write a size-optimised copy of a PDF.

    Compression is done purely through the save options (unused-object
    garbage collection, stream deflation and content clean-up); page
    content is not re-rendered.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path the compressed PDF is written to.
        quality: Accepted for backward compatibility; currently not
            applied, because the save options used here are lossless.

    Returns:
        True if the compressed file was written, False if anything fails
        (the error is logged).
    """
    try:
        # No per-page work is needed: the earlier per-page pixmap round
        # trip relied on a Page method PyMuPDF does not provide, so it
        # failed before anything could be saved.
        with fitz.open(pdf_path) as doc:
            doc.save(output_path, garbage=4, deflate=True, clean=True)
        return True
    except Exception as e:
        logger.error(f"Error compressing PDF: {e}")
        return False