hzaustingg committed on
Commit
5e95e09
·
verified ·
1 Parent(s): e3dde69

Upload utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. utils.py +104 -0
utils.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from typing import List, Dict, Any
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or ``""`` when the
        file cannot be opened or read (the error is logged, not raised).
    """
    try:
        # The context manager releases the document even when a page fails
        # to parse; the previous open()/close() pairing leaked the handle in
        # that case.
        with fitz.open(pdf_path) as doc:
            # join() avoids rebuilding the accumulated string on every page.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {e}")
        return ""
19
+
20
def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Return the metadata dictionary of a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The document's metadata mapping as reported by PyMuPDF, or an empty
        dict when the document exposes no metadata or cannot be opened
        (the error is logged, not raised).
    """
    try:
        # Context manager guarantees the document is closed on every path.
        with fitz.open(pdf_path) as doc:
            # PyMuPDF may report None instead of a dict (e.g. for documents
            # that still need a password); normalise so callers always get
            # the declared Dict return type.
            return doc.metadata or {}
    except Exception as e:
        logger.error(f"Error getting PDF metadata: {e}")
        return {}
30
+
31
def count_pdf_pages(pdf_path: str) -> int:
    """Return how many pages the PDF at *pdf_path* contains.

    Any failure while opening or inspecting the file is logged and
    reported as a page count of ``0``.
    """
    try:
        document = fitz.open(pdf_path)
        total = len(document)
        document.close()
    except Exception as e:
        logger.error(f"Error counting PDF pages: {e}")
        return 0
    else:
        return total
41
+
42
def split_pdf(pdf_path: str, output_dir: str, pages_per_file: int = 1) -> List[str]:
    """Split a PDF into consecutive chunks written as separate files.

    Chunks are saved in *output_dir* as ``split_1.pdf``, ``split_2.pdf``, ...
    The last chunk may hold fewer than *pages_per_file* pages.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory that receives the chunk files.
        pages_per_file: Maximum number of pages per chunk; must be >= 1.

    Returns:
        Paths of the files written, in page order, or ``[]`` when anything
        fails (the error is logged, not raised).
    """
    # Imported here because the module's import block does not provide `os`;
    # without it every call failed with a swallowed NameError and returned [].
    import os

    try:
        if pages_per_file < 1:
            # range() would raise for 0 and silently yield nothing for
            # negative steps; report both uniformly through the except path.
            raise ValueError(f"pages_per_file must be >= 1, got {pages_per_file}")

        output_files = []
        # Context managers close both documents even if insert/save fails.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
            for start in range(0, total_pages, pages_per_file):
                # from_page / to_page are inclusive, zero-based page indices.
                last = min(start + pages_per_file, total_pages) - 1
                output_path = os.path.join(
                    output_dir, f"split_{start // pages_per_file + 1}.pdf"
                )
                with fitz.open() as chunk:
                    chunk.insert_pdf(doc, from_page=start, to_page=last)
                    chunk.save(output_path)
                output_files.append(output_path)
        return output_files
    except Exception as e:
        logger.error(f"Error splitting PDF: {e}")
        return []
62
+
63
def merge_pdfs(pdf_paths: List[str], output_path: str) -> bool:
    """Concatenate several PDF files into one.

    Args:
        pdf_paths: Paths of the source PDFs, in the order their pages should
            appear in the result.
        output_path: Path of the merged PDF to write.

    Returns:
        ``True`` when the merged file was written, ``False`` when any step
        failed (the error is logged, not raised).
    """
    try:
        with fitz.open() as merged:
            for pdf_path in pdf_paths:
                # Each source is closed as soon as its pages are copied;
                # previously the inline fitz.open() handles were never closed.
                with fitz.open(pdf_path) as source:
                    merged.insert_pdf(source)
            merged.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error merging PDFs: {e}")
        return False
75
+
76
def rotate_pdf_pages(pdf_path: str, output_path: str, rotation: int = 90) -> bool:
    """Apply a rotation to every page of a PDF and save the result.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the rotated PDF to write.
        rotation: Value handed to ``page.set_rotation`` for each page.

    Returns:
        ``True`` when the rotated file was written, ``False`` when any step
        failed (the error is logged, not raised).
    """
    # NOTE(review): set_rotation appears to set the page's rotation value
    # rather than add to the existing one, so already-rotated pages would not
    # turn further - confirm this matches the intended "rotate by" behaviour.
    try:
        # Context manager closes the document even if rotation or save fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page.set_rotation(rotation)
            doc.save(output_path)
        return True
    except Exception as e:
        logger.error(f"Error rotating PDF: {e}")
        return False
88
+
89
def compress_pdf(pdf_path: str, output_path: str, quality: int = 80) -> bool:
    """Write a size-optimised copy of a PDF.

    The size reduction comes from the options passed to ``save``
    (``garbage=4``, ``deflate=True``, ``clean=True``). Page content is not
    re-rendered or re-encoded.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the compressed PDF to write.
        quality: Accepted for interface compatibility; currently unused
            because no image re-encoding is performed.

    Returns:
        ``True`` when the compressed file was written, ``False`` when any
        step failed (the error is logged, not raised).
    """
    # The earlier per-page loop rendered each page to a pixmap and then called
    # page.set_pixmap(), which is not part of the PyMuPDF Page API, so every
    # call raised and the function always returned False. The loop is removed.
    try:
        # Context manager closes the document even if saving fails.
        with fitz.open(pdf_path) as doc:
            doc.save(output_path, garbage=4, deflate=True, clean=True)
        return True
    except Exception as e:
        logger.error(f"Error compressing PDF: {e}")
        return False