File size: 3,313 Bytes
5e95e09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import logging
import os
from typing import List, Dict, Any

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the file could not be opened or read (the error is
        logged, not raised).
    """
    try:
        # The context manager closes the document even when a page fails
        # to yield its text, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}")
        return ""

def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Return the metadata of the PDF at ``pdf_path``.

    Any exception raised while opening or reading the document is logged
    and an empty dict is returned instead.
    """
    try:
        pdf = fitz.open(pdf_path)
        info = pdf.metadata
        pdf.close()
    except Exception as e:
        logger.error(f"Error getting PDF metadata: {e}")
        return {}
    else:
        return info

def count_pdf_pages(pdf_path: str) -> int:
    """Return how many pages the PDF at ``pdf_path`` contains.

    Any exception raised while opening the document is logged and 0 is
    returned instead.
    """
    try:
        pdf = fitz.open(pdf_path)
        total = len(pdf)
        pdf.close()
    except Exception as e:
        logger.error(f"Error counting PDF pages: {e}")
        return 0
    else:
        return total

def split_pdf(pdf_path: str, output_dir: str, pages_per_file: int = 1) -> List[str]:
    """Split a PDF into consecutive chunks of ``pages_per_file`` pages.

    Each chunk is written to ``output_dir`` as ``split_<n>.pdf`` where
    ``n`` starts at 1. The last chunk may contain fewer pages.

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Directory the chunk files are written to.
        pages_per_file: Maximum number of pages per chunk; must be >= 1.

    Returns:
        The paths of the written files in page order, or an empty list if
        anything failed (the error is logged, not raised). Files written
        before the failure are left on disk.
    """
    try:
        # A zero step makes range() raise and a negative step silently
        # produces no chunks; reject both with one clear message.
        if pages_per_file < 1:
            raise ValueError("pages_per_file must be at least 1")

        output_files: List[str] = []
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for start in range(0, total_pages, pages_per_file):
                # to_page is inclusive, so clamp to the last valid index.
                last = min(start + pages_per_file, total_pages) - 1
                output_path = os.path.join(
                    output_dir, f"split_{start // pages_per_file + 1}.pdf"
                )
                with fitz.open() as chunk:
                    chunk.insert_pdf(doc, from_page=start, to_page=last)
                    chunk.save(output_path)
                output_files.append(output_path)
        return output_files
    except Exception as e:
        logger.error(f"Error splitting PDF: {e}")
        return []

def merge_pdfs(pdf_paths: List[str], output_path: str) -> bool:
    """Merge several PDF files into one, in the order given.

    Args:
        pdf_paths: Paths of the PDFs to append, first to last.
        output_path: Path the merged PDF is saved to.

    Returns:
        True if the merged file was saved, False if any step failed (the
        error is logged, not raised).
    """
    try:
        with fitz.open() as merged:
            for pdf_path in pdf_paths:
                # Close every source as soon as its pages are copied so
                # merging many files does not accumulate open handles.
                with fitz.open(pdf_path) as source:
                    merged.insert_pdf(source)
            merged.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error merging PDFs: {e}")
        return False

def rotate_pdf_pages(pdf_path: str, output_path: str, rotation: int = 90) -> bool:
    """Set the rotation of every page in a PDF and save the result.

    ``rotation`` is passed to ``Page.set_rotation`` for each page, so it
    becomes the page's rotation value.
    NOTE(review): PyMuPDF documents this value as needing to be a multiple
    of 90 - confirm against the installed version.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the rotated PDF is saved to.
        rotation: Rotation in degrees applied to every page.

    Returns:
        True if the rotated file was saved, False if any step failed (the
        error is logged, not raised).
    """
    try:
        # The context manager closes the document even if set_rotation()
        # or save() raises.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page.set_rotation(rotation)
            doc.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error rotating PDF: {e}")
        return False

def compress_pdf(pdf_path: str, output_path: str, quality: int = 80) -> bool:
    """Save a compacted copy of a PDF.

    The document is re-saved with unused-object garbage collection
    (``garbage=4``), stream deflation (``deflate=True``) and content
    clean-up (``clean=True``). Page content is not re-rendered.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the compacted PDF is saved to.
        quality: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        True if the compacted file was saved, False if any step failed
        (the error is logged, not raised).
    """
    try:
        # The previous per-page loop rendered each page twice and then
        # called page.set_pixmap(), which is not a PyMuPDF Page method, so
        # every call failed. The save options below do the compaction.
        with fitz.open(pdf_path) as doc:
            doc.save(output_path, garbage=4, deflate=True, clean=True)
        return True
    except Exception as e:
        logger.error(f"Error compressing PDF: {e}")
        return False