Spaces:
Sleeping
Sleeping
| import fitz | |
| from pathlib import Path | |
| from .parser_base import ParserBase | |
| from typing import Tuple, Dict | |
class PDFParser(ParserBase):
    """Parser that pulls plain text and per-page metadata out of a PDF file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Extract the text of every page plus summary metadata.

        Args:
            filepath: Path of the PDF file to open.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the text of each page
            in order, each followed by a newline. ``metadata`` contains:

            * ``"filetype"``: always ``"pdf"``.
            * ``"n_pages"``: the document's page count.
            * ``"pages"``: one dict per page with ``"page_num"`` (1-based),
              ``"length"`` (character count of the page text) and
              ``"first_100_chars"`` (leading slice of the page text).
            * ``"filename"``: the final path component of ``filepath``.

        Raises:
            Any exception raised by ``fitz.open`` or ``page.get_text``
            propagates unchanged; nothing is caught here.
        """
        chunks = []
        pages_metadata = []

        # The context manager guarantees the document (and its underlying
        # file handle) is released even if text extraction fails midway.
        with fitz.open(filepath) as doc:
            for i, page in enumerate(doc):
                page_text = page.get_text()
                chunks.append(page_text)
                chunks.append("\n")
                pages_metadata.append({
                    "page_num": i + 1,
                    "length": len(page_text),
                    'first_100_chars': page_text[:100],
                })
            # Read while the document is still open; a closed document
            # cannot be queried.
            n_pages = doc.page_count

        # Single join instead of repeated string concatenation in the loop.
        text = "".join(chunks)

        metadata = {
            "filetype": "pdf",
            "n_pages": n_pages,
            "pages": pages_metadata,
            "filename": str(Path(filepath).name),
        }
        return text, metadata