Spaces:
Sleeping
Sleeping
| # pdf_classes.py | |
| from docarray import BaseDoc | |
| from docarray import DocList | |
| from docarray.typing import ImageTensor, NdArray | |
| from typing import Dict, Optional | |
class PDFSegment(BaseDoc):
    """A single segment extracted from one page of a PDF.

    The nullable fields (``content``, ``image``, ``embedding``) default to
    ``None`` explicitly. Pydantic v1 applies that default implicitly to
    ``Optional`` annotations, but pydantic v2 treats an ``Optional`` field
    without a default as required, so spelling the default out keeps the
    schema behaving the same under either version.
    """

    # Page this segment belongs to.
    # NOTE(review): 0- vs 1-based numbering is not visible here; confirm with the producer.
    page_number: int
    # Kind of segment: 'text', 'image', 'table', or 'hybrid'.
    segment_type: str
    # Textual payload of the segment; None when there is none.
    content: Optional[str] = None
    # Image payload of the segment; None when there is none.
    image: Optional[ImageTensor] = None
    # Bounding box on the page: {x, y, width, height}.
    # NOTE(review): units (pixels vs PDF points) are not established here.
    position: Dict[str, int]
    # Links to neighbouring segments: {'prev': id, 'next': id, 'parent': id};
    # each value may be None.
    relationships: Dict[str, Optional[str]]
    # 768-element embedding vector; None until one is assigned.
    embedding: Optional[NdArray[768]] = None
class PDFPage(BaseDoc):
    """Page-level record of a PDF: its number, a screenshot, and an optional embedding."""

    # Number of the page within the source document.
    page_number: int

    # Image tensor holding the page screenshot (required).
    screenshot: ImageTensor

    # 768-element embedding vector for the page; defaults to None.
    embedding: Optional[NdArray[768]] = None
class RichPDFDocument(BaseDoc):
    """Top-level document for a PDF file, holding its segments and pages."""

    # Path of the PDF file this document describes.
    file_path: str

    # Page count of the PDF.
    # NOTE(review): presumably equals len(pages); nothing here enforces it.
    num_pages: int

    # All segments of the document, as a DocList of PDFSegment.
    segments: DocList[PDFSegment]

    # All pages of the document, as a DocList of PDFPage.
    pages: DocList[PDFPage]