Spaces:
Sleeping
Sleeping
File size: 840 Bytes
63105da | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | import fitz
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict
class PDFParser(ParserBase):
    """Parser that extracts plain text and per-page metadata from PDF files.

    Uses PyMuPDF (imported as ``fitz``) to open the document and read each
    page's text.
    """

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the concatenated page text and a metadata dict for a PDF.

        Args:
            filepath: Path of the PDF file to open.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is every page's text, in
            page order, each followed by a newline. ``metadata`` contains:

            - ``"filetype"``: always ``"pdf"``.
            - ``"n_pages"``: the document's page count.
            - ``"pages"``: one dict per page with ``"page_num"`` (1-based),
              ``"length"`` (character count of the page text) and
              ``"first_100_chars"`` (leading slice of the page text).
            - ``"filename"``: the final path component of ``filepath``.

        Raises:
            Whatever ``fitz.open`` or ``page.get_text`` raise for unreadable
            or malformed files; errors are not swallowed here.
        """
        text_parts = []
        pages_metadata = []

        # The context manager releases the document handle even when text
        # extraction fails part-way through; previously it was never closed.
        with fitz.open(filepath) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text()
                text_parts.append(page_text + "\n")
                pages_metadata.append({
                    "page_num": page_num,
                    "length": len(page_text),
                    "first_100_chars": page_text[:100],
                })
            # Read the count while the document is still open.
            n_pages = doc.page_count

        # Join once instead of repeated ``+=`` to avoid quadratic copying on
        # large documents.
        text = "".join(text_parts)
        metadata = {
            "filetype": "pdf",
            "n_pages": n_pages,
            "pages": pages_metadata,
            "filename": str(Path(filepath).name),
        }
        return text, metadata
|