samithcs's picture
Pipeline added
63105da verified
raw
history blame contribute delete
603 Bytes
from docx import Document
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict
class DOCXParser(ParserBase):
def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
doc = Document(filepath)
text_list = []
for para in doc.paragraphs:
text_list.append(para.text)
text = "\n".join(text_list)
metadata = {
"filetype": "docx",
"filename": str(Path(filepath).name),
"num_paragraphs": len(doc.paragraphs)
}
return text, metadata