Ilia Tambovtsev committed on
Commit
61053e3
·
1 Parent(s): c490faa

feat: add pipeline for one pdf

Browse files
Files changed (1) hide show
  1. src/pdf_utils/chains.py +154 -2
src/pdf_utils/chains.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List, Dict, Any, Sequence
2
  from pathlib import Path
3
  import logging
4
  import base64
@@ -10,14 +10,166 @@ from langchain.prompts import ChatPromptTemplate
10
  from langchain.schema.messages import HumanMessage
11
  from langchain.callbacks.manager import CallbackManagerForChainRun
12
  from langchain_core.output_parsers import StrOutputParser
13
- from langchain.pydantic_v1 import Extra
14
 
15
  import pdf2image
 
 
 
 
 
16
 
17
  from config.navigator import Navigator
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class PDFLoaderChain(Chain):
22
  """Chain for loading PDF paths from weird-slides directory"""
23
 
 
1
+ from typing import List, Dict, Any, Sequence, Optional
2
  from pathlib import Path
3
  import logging
4
  import base64
 
10
  from langchain.schema.messages import HumanMessage
11
  from langchain.callbacks.manager import CallbackManagerForChainRun
12
  from langchain_core.output_parsers import StrOutputParser
 
13
 
14
  import pdf2image
15
+ import fitz
16
+
17
+ from io import BytesIO
18
+ from PIL import Image
19
+ from .chain_funcs import get_param_or_default
20
 
21
  from config.navigator import Navigator
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
+
26
class Pdf2ImageChain(Chain):
    """Chain for converting PDF pages to PIL Images using PyMuPDF.

    Inputs:
        pdf_path: Path to the PDF file to render (required).
        selected_pages: Optional iterable of zero-based page indices; when it
            is not supplied, every page of the document is rendered.
        dpi: Optional per-call rendering resolution; falls back to the
            ``default_dpi`` given at construction time.

    Outputs:
        images: List of rendered PIL images, or ``None`` when ``paths_only``
            is enabled.
        image_paths: List of paths of the saved PNG files, or ``None`` when
            neither ``save_images`` nor ``paths_only`` is enabled.
    """

    navigator: Navigator = Navigator()

    def __init__(
        self,
        default_dpi: int = 72,
        save_images: bool = False,
        paths_only: bool = False,
        **kwargs
    ):
        """Initialize PDF to Image conversion chain

        Args:
            default_dpi: Resolution for PDF rendering, used when the call
                inputs do not provide their own ``dpi``
            save_images: Whether to save images to interim folder
            paths_only: When true, save images and return only paths to them
            **kwargs: Forwarded unchanged to the ``Chain`` constructor
        """
        super().__init__(**kwargs)
        self._default_dpi = default_dpi
        self._save_images = save_images
        self._paths_only = paths_only

    @property
    def input_keys(self) -> List[str]:
        """Names of the required input keys."""
        return ["pdf_path"]

    @property
    def output_keys(self) -> List[str]:
        """Names of the keys present in the output dictionary."""
        return ["images", "image_paths"]

    def _save_image(
        self,
        image: Image.Image,
        presentation_name: str,
        page_idx: int,
        dpi: Optional[int] = None
    ) -> Path:
        """Save PIL image to interim folder with standardized naming

        Args:
            image: PIL Image to save
            presentation_name: Name of the presentation (without extension)
            page_idx: Zero-based page number
            dpi: Resolution the image was rendered at, embedded in the file
                name; defaults to the chain's ``default_dpi`` when omitted

        Returns:
            Path to saved image
        """
        # The chain never defines a ``dpi`` attribute (only ``_default_dpi``),
        # so the resolution must come from the caller or from the default.
        effective_dpi = self._default_dpi if dpi is None else dpi

        interim_path = self.navigator.get_interim_path(presentation_name)
        output_path = interim_path / (
            f"{presentation_name}_page_{page_idx:03d}_dpi_{effective_dpi}.png"
        )
        image.save(output_path, "PNG")
        return output_path

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None
    ) -> Dict[str, Any]:
        """Convert PDF pages to PIL Images

        Args:
            inputs: Dictionary with pdf_path key and, optionally,
                selected_pages and dpi keys
            run_manager: Callback manager (unused)

        Returns:
            Dictionary with ``images`` and ``image_paths`` keys; each value is
            a list or ``None`` depending on the ``save_images`` /
            ``paths_only`` settings
        """
        pdf_path = Path(inputs["pdf_path"])
        keep_on_disk = self._save_images or self._paths_only
        images = []
        saved_paths = []

        # Open PDF document
        pdf_document = fitz.open(pdf_path)
        try:
            # Convert selected or all pages
            selected_pages = get_param_or_default(
                inputs, "selected_pages", range(len(pdf_document))
            )

            # The resolution does not change between pages, so resolve it once.
            # NOTE(review): assumes get_param_or_default is a plain lookup with
            # a fallback value - confirm against chain_funcs.
            dpi = get_param_or_default(inputs, "dpi", self._default_dpi)

            for page_num in selected_pages:
                # Convert pdf page to pixmap
                pix = pdf_document[page_num].get_pixmap(dpi=dpi)

                # Convert pixmap to PIL Image
                img = Image.frombytes(
                    "RGB",
                    (pix.width, pix.height),
                    pix.samples
                )

                if keep_on_disk:
                    # Pass the resolution actually used so the file name
                    # reflects a per-call dpi override.
                    saved_paths.append(
                        self._save_image(img, pdf_path.stem, page_num, dpi)
                    )

                # In paths-only mode the images are never returned, so do not
                # hold every rendered page in memory.
                if not self._paths_only:
                    images.append(img)
        finally:
            # Release the document even if rendering or saving fails.
            pdf_document.close()

        # Form the output dict
        result = dict(images=None, image_paths=None)
        if not self._paths_only:
            result["images"] = images
        if keep_on_disk:
            result["image_paths"] = saved_paths

        return result
135
+
136
+
137
class ImageEncodeChain(Chain):
    """Chain that turns a PIL Image into a base64 string of its PNG bytes."""

    @property
    def input_keys(self) -> List[str]:
        """Names of the required input keys."""
        return ["image"]

    @property
    def output_keys(self) -> List[str]:
        """Names of the keys present in the output dictionary."""
        return ["image_encoded"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None
    ) -> Dict[str, Any]:
        """Serialize the input image as PNG and base64-encode it

        Args:
            inputs: Dictionary holding the PIL Image under the "image" key
            run_manager: Callback manager (unused)

        Returns:
            Dictionary whose "image_encoded" value is the base64 text of the
            PNG-serialized image
        """
        picture: Image.Image = inputs["image"]

        # Write the PNG into an in-memory stream and grab its raw bytes
        with BytesIO() as png_stream:
            picture.save(png_stream, format="PNG")
            png_bytes = png_stream.getvalue()

        # base64 output is bytes; decode to get a plain string
        return {"image_encoded": base64.b64encode(png_bytes).decode("utf-8")}
172
+
173
  class PDFLoaderChain(Chain):
174
  """Chain for loading PDF paths from weird-slides directory"""
175