Spaces:
Build error
Build error
Ilia Tambovtsev committed on
Commit ·
42b8733
1
Parent(s): 80be769
feat: add page2image chain
Browse files- src/pdf_utils/chains.py +45 -0
src/pdf_utils/chains.py
CHANGED
|
@@ -23,6 +23,51 @@ from .pdf2image import page2image
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
class Pdf2ImageChain(Chain):
|
| 28 |
"""Chain for converting PDF pages to PIL Images using PyMuPDF"""
|
|
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
+
class Page2ImageChain(Chain):
    """Chain that turns a single PyMuPDF page into a PIL Image."""

    def __init__(self, default_dpi: int = 72, **kwargs):
        """Set up the page-to-image conversion chain.

        Args:
            default_dpi: Resolution stored on the chain and handed to
                ``get_param_or_default`` as the fallback for the "dpi" input.
            **kwargs: Forwarded unchanged to the base ``Chain`` constructor.
        """
        super().__init__(**kwargs)
        self._default_dpi = default_dpi

    @property
    def input_keys(self) -> List[str]:
        """Names of the inputs this chain requires."""
        return ["page"]

    @property
    def output_keys(self) -> List[str]:
        """Names of the outputs this chain produces."""
        return ["image"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Render the supplied page via ``page2image``.

        Args:
            inputs: Mapping that holds:
                - page: PyMuPDF page object
                - dpi: optional rendering resolution; presumably the stored
                  default is used when it is absent (resolved by
                  ``get_param_or_default``, defined elsewhere -- confirm)
            run_manager: Callback manager; not used by this method.

        Returns:
            Dictionary with the rendered image stored under "image".
        """
        source_page: fitz.Page = inputs["page"]
        resolution = get_param_or_default(inputs, "dpi", self._default_dpi)

        rendered = page2image(source_page, resolution)

        return {"image": rendered}
|
| 70 |
+
|
| 71 |
|
| 72 |
class Pdf2ImageChain(Chain):
|
| 73 |
"""Chain for converting PDF pages to PIL Images using PyMuPDF"""
|