Spaces:
Running
Running
import logging
import os
from typing import List, Dict, Any

import fitz  # PyMuPDF
| logger = logging.getLogger(__name__) | |
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the document could not be opened or read. Errors are
        logged, not raised.
    """
    try:
        # The context manager closes the document even when a page fails
        # while extracting text; a trailing close() call would be skipped.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}")
        return ""
def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Return the metadata dictionary of a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The document's ``metadata`` mapping as reported by PyMuPDF, or an
        empty dict if the document could not be opened or exposes no
        metadata. Errors are logged, not raised.
    """
    try:
        # Context manager guarantees the handle is released on any error.
        with fitz.open(pdf_path) as doc:
            # NOTE(review): PyMuPDF may report None here (e.g. for documents
            # that still need a password); normalise so the declared return
            # type always holds.
            return doc.metadata or {}
    except Exception as e:
        logger.error(f"Error getting PDF metadata: {e}")
        return {}
def count_pdf_pages(pdf_path: str) -> int:
    """Count the pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The number of pages, or 0 if the document could not be opened.
        Errors are logged, not raised.
    """
    try:
        # Context manager guarantees the handle is released on any error.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception as e:
        logger.error(f"Error counting PDF pages: {e}")
        return 0
def split_pdf(pdf_path: str, output_dir: str, pages_per_file: int = 1) -> List[str]:
    """Split a PDF into consecutive chunks written as separate files.

    Chunks are named ``split_1.pdf``, ``split_2.pdf``, ... inside
    ``output_dir``; the last chunk may hold fewer pages than the others.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory that receives the chunk files. It is created
            if it does not exist yet.
        pages_per_file: Maximum number of pages per chunk; must be >= 1.

    Returns:
        The paths of the written chunk files in page order, or an empty
        list on any failure. Errors are logged, not raised. Chunks written
        before a mid-way failure are left on disk.
    """
    try:
        # A zero step makes range() raise and a negative one yields an empty
        # range, so reject both explicitly with a clear message.
        if pages_per_file < 1:
            raise ValueError(f"pages_per_file must be >= 1, got {pages_per_file}")

        # An empty output_dir means "current directory"; makedirs("") fails.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        output_files = []
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for start in range(0, total_pages, pages_per_file):
                # insert_pdf takes an inclusive, zero-based page range.
                last = min(start + pages_per_file, total_pages) - 1
                output_path = os.path.join(
                    output_dir, f"split_{start // pages_per_file + 1}.pdf"
                )
                # fitz.open() without arguments creates a new empty PDF.
                with fitz.open() as chunk:
                    chunk.insert_pdf(doc, from_page=start, to_page=last)
                    chunk.save(output_path)
                output_files.append(output_path)
        return output_files
    except Exception as e:
        logger.error(f"Error splitting PDF: {e}")
        return []
def merge_pdfs(pdf_paths: List[str], output_path: str) -> bool:
    """Merge several PDF files into one, in the order given.

    Args:
        pdf_paths: Paths of the PDF files to append, first to last.
        output_path: Path of the merged PDF to write.

    Returns:
        True if the merged file was saved, False on any failure. Errors
        are logged, not raised.
    """
    try:
        # fitz.open() without arguments creates a new empty PDF.
        with fitz.open() as merger:
            for pdf_path in pdf_paths:
                # Open each source in its own context so its handle is
                # released as soon as its pages have been copied.
                with fitz.open(pdf_path) as source:
                    merger.insert_pdf(source)
            merger.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error merging PDFs: {e}")
        return False
def rotate_pdf_pages(pdf_path: str, output_path: str, rotation: int = 90) -> bool:
    """Apply a rotation value to every page of a PDF and save a copy.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the rotated PDF to write.
        rotation: Value passed to ``Page.set_rotation`` for each page.
            NOTE(review): PyMuPDF appears to treat this as the page's
            absolute rotation (a multiple of 90), not an increment on top
            of the existing rotation -- confirm against callers.

    Returns:
        True if the rotated file was saved, False on any failure. Errors
        are logged, not raised.
    """
    try:
        # Context manager guarantees the handle is released on any error.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page.set_rotation(rotation)
            doc.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error rotating PDF: {e}")
        return False
def compress_pdf(pdf_path: str, output_path: str, quality: int = 80) -> bool:
    """Write a size-optimised copy of a PDF.

    The copy is saved with unused-object garbage collection (``garbage=4``),
    stream compression (``deflate=True``) and content-stream clean-up
    (``clean=True``). Images are not re-encoded.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the compressed PDF to write.
        quality: Currently unused; kept so existing callers keep working.

    Returns:
        True if the compressed file was saved, False on any failure.
        Errors are logged, not raised.
    """
    try:
        # The previous per-page loop rendered each page twice and then called
        # Page.set_pixmap, which PyMuPDF does not provide, so every non-empty
        # document failed. The save options below do the actual compression.
        with fitz.open(pdf_path) as doc:
            doc.save(output_path, garbage=4, deflate=True, clean=True)
        return True
    except Exception as e:
        logger.error(f"Error compressing PDF: {e}")
        return False