File size: 603 Bytes
63105da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from docx import Document
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict 

class DOCXParser(ParserBase):
    """Parser that pulls plain text and basic metadata out of a .docx file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the document's paragraph text and a small metadata dict.

        Args:
            filepath: Location of the .docx file; passed straight to
                ``docx.Document``.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is every paragraph's text
            joined with newlines (empty paragraphs contribute empty lines).
            ``metadata`` holds ``"filetype"`` (always ``"docx"``),
            ``"filename"`` (the final path component) and
            ``"num_paragraphs"`` (count of paragraphs read).

        Raises:
            Whatever ``docx.Document`` raises when the file is missing or is
            not a valid .docx package; nothing is caught here.
        """
        doc = Document(filepath)

        # `doc.paragraphs` is a property that builds a fresh list of
        # Paragraph proxies on each access, so read it once and reuse it
        # for both the text and the count.
        paragraphs = doc.paragraphs

        # NOTE(review): only what `doc.paragraphs` exposes is extracted; per
        # the python-docx docs that appears to be body-level paragraphs, so
        # text inside tables, headers and footers is presumably not included.
        text = "\n".join(para.text for para in paragraphs)

        metadata = {
            "filetype": "docx",
            "filename": Path(filepath).name,  # `.name` is already a str
            "num_paragraphs": len(paragraphs),
        }
        return text, metadata