samithcs's picture
Pipeline added
63105da verified
import fitz
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict
class PDFParser(ParserBase):
    """Parser that extracts plain text and per-page metadata from PDF files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Read a PDF and return its concatenated text plus summary metadata.

        Args:
            filepath: Path to the PDF file, passed directly to ``fitz.open``.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is every page's text in
            page order, each page followed by a newline. ``metadata`` holds:

            * ``"filetype"``: always ``"pdf"``.
            * ``"n_pages"``: the document's page count.
            * ``"pages"``: one dict per page with ``"page_num"`` (1-based),
              ``"length"`` (characters of extracted text) and
              ``"first_100_chars"`` (leading slice of the page text).
            * ``"filename"``: the final path component of ``filepath``.

        Raises:
            Whatever ``fitz.open`` / ``page.get_text`` raise for a missing or
            unreadable file; errors are not caught here.
        """
        text_parts = []
        pages_metadata = []

        # Use the document as a context manager so the underlying file handle
        # is released even if text extraction fails part-way through.
        with fitz.open(filepath) as doc:
            for page_index, page in enumerate(doc):
                page_text = page.get_text()
                text_parts.append(page_text)
                text_parts.append("\n")
                pages_metadata.append({
                    "page_num": page_index + 1,
                    "length": len(page_text),
                    "first_100_chars": page_text[:100],
                })
            # Must be read before the document is closed.
            n_pages = doc.page_count

        metadata = {
            "filetype": "pdf",
            "n_pages": n_pages,
            "pages": pages_metadata,
            "filename": str(Path(filepath).name),
        }
        return "".join(text_parts), metadata