# anycoder-99f49d97 / utils.py
# Uploaded by hzaustingg with huggingface_hub (commit 5e95e09, verified).
import logging
import os
from typing import List, Dict, Any

import fitz # PyMuPDF

logger = logging.getLogger(__name__)
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the file cannot be opened or read (the error is logged).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            # join() avoids repeatedly re-copying the growing string.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}")
        return ""
def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Return the metadata dictionary of a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The document's metadata as reported by PyMuPDF, or an empty dict if
        the document exposes no metadata or cannot be opened (errors are
        logged).
    """
    try:
        # Context manager guarantees the document is closed on every path.
        with fitz.open(pdf_path) as doc:
            # PyMuPDF documents ``metadata`` as possibly None (e.g. an
            # encrypted file that has not been unlocked); normalise so the
            # annotated Dict return type always holds.
            return doc.metadata or {}
    except Exception as e:
        logger.error(f"Error getting PDF metadata: {e}")
        return {}
def count_pdf_pages(pdf_path: str) -> int:
    """Count the number of pages in a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The number of pages, or 0 if the file cannot be opened (the error
        is logged).
    """
    try:
        # Context manager closes the document even if counting fails.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception as e:
        logger.error(f"Error counting PDF pages: {e}")
        return 0
def split_pdf(pdf_path: str, output_dir: str, pages_per_file: int = 1) -> List[str]:
    """Split a PDF into consecutive chunks written as separate files.

    Chunks are saved in ``output_dir`` as ``split_1.pdf``, ``split_2.pdf``,
    ... in page order; the last chunk may contain fewer pages.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory that receives the chunk files. It is created
            if it does not exist yet.
        pages_per_file: Maximum number of pages per chunk; must be >= 1.

    Returns:
        Paths of the files written, in order. An empty list is returned if
        anything fails (the error is logged); chunk files written before
        the failure are left on disk.
    """
    try:
        if pages_per_file < 1:
            # A non-positive step would either raise inside range() or
            # silently produce no chunks; report it explicitly instead.
            raise ValueError(f"pages_per_file must be >= 1, got {pages_per_file}")

        os.makedirs(output_dir, exist_ok=True)
        output_files = []
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for start in range(0, total_pages, pages_per_file):
                # insert_pdf takes an inclusive page range, hence the -1.
                end = min(start + pages_per_file, total_pages) - 1
                output_path = os.path.join(
                    output_dir, f"split_{start // pages_per_file + 1}.pdf"
                )
                # fitz.open() with no arguments creates an empty PDF; the
                # context manager closes it even if saving fails.
                with fitz.open() as new_doc:
                    new_doc.insert_pdf(doc, from_page=start, to_page=end)
                    new_doc.save(output_path)
                output_files.append(output_path)
        return output_files
    except Exception as e:
        logger.error(f"Error splitting PDF: {e}")
        return []
def merge_pdfs(pdf_paths: List[str], output_path: str) -> bool:
    """Merge several PDF files into one, in the order given.

    Args:
        pdf_paths: Paths of the PDF files to concatenate.
        output_path: Path of the merged PDF to write.

    Returns:
        True if the merged file was saved, False if any step failed (the
        error is logged).
    """
    try:
        # fitz.open() with no arguments creates an empty PDF to append to.
        with fitz.open() as merger:
            for pdf_path in pdf_paths:
                # Open each source in its own context so its file handle is
                # released as soon as its pages have been copied.
                with fitz.open(pdf_path) as source:
                    merger.insert_pdf(source)
            merger.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error merging PDFs: {e}")
        return False
def rotate_pdf_pages(pdf_path: str, output_path: str, rotation: int = 90) -> bool:
    """Rotate all pages in a PDF by the specified number of degrees.

    The rotation is applied relative to each page's current rotation, so a
    page that is already rotated is turned further rather than reset.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the rotated PDF to write.
        rotation: Degrees to rotate each page by; PyMuPDF expects page
            rotations to be multiples of 90.

    Returns:
        True if the rotated file was saved, False if any step failed (the
        error is logged).
    """
    try:
        # Context manager closes the document even if rotating/saving fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # set_rotation() assigns an absolute value, so add the
                # page's existing rotation to honour "rotate by".
                page.set_rotation((page.rotation + rotation) % 360)
            doc.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error rotating PDF: {e}")
        return False
def compress_pdf(pdf_path: str, output_path: str, quality: int = 80) -> bool:
    """Write a losslessly compressed copy of a PDF.

    Compression is done purely through PyMuPDF's save options: unused and
    duplicate objects are garbage-collected, streams are deflated and
    content streams are cleaned. Page content is not re-rendered.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the compressed PDF to write.
        quality: Accepted for interface compatibility; currently unused
            because no lossy image recompression is performed.

    Returns:
        True if the compressed file was saved, False if any step failed
        (the error is logged).
    """
    try:
        # The previous per-page loop rendered each page twice and then called
        # a Page method that PyMuPDF does not provide, so every call failed.
        # The save options below are what actually shrink the file.
        with fitz.open(pdf_path) as doc:
            doc.save(output_path, garbage=4, deflate=True, clean=True)
        return True
    except Exception as e:
        logger.error(f"Error compressing PDF: {e}")
        return False