File size: 747 Bytes
63105da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict 

class HTMLParser(ParserBase):
    """Parser that extracts text and basic metadata from an HTML file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the text of an HTML file together with a metadata dict.

        The file is parsed with BeautifulSoup's ``html.parser`` backend,
        every ``<script>`` and ``<style>`` element is removed, and the
        remaining strings are stripped and joined with newlines.

        Args:
            filepath: Path of the HTML file to read.

        Returns:
            A ``(text, metadata)`` tuple. ``metadata`` holds ``"filetype"``
            (always ``"html"``), ``"filename"`` (the final path component of
            ``filepath``) and ``"length"`` (number of characters in ``text``).

        Raises:
            OSError: If the file cannot be opened or read.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
            # byte-order mark, which would otherwise survive stripping and
            # show up as a stray U+FEFF chunk at the start of the text.
            with open(filepath, "r", encoding="utf-8-sig") as f:
                html = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 (e.g. a legacy page in another charset): hand
            # BeautifulSoup the raw bytes so it can detect the encoding
            # itself instead of aborting the whole extraction.
            html = Path(filepath).read_bytes()

        soup = BeautifulSoup(html, "html.parser")

        # Drop script/style elements so their source code is not reported
        # as document text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        metadata = {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text),
        }
        return text, metadata