samithcs's picture
Pipeline added
63105da verified
raw
history blame contribute delete
747 Bytes
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict
class HTMLParser(ParserBase):
    """Extract text and basic file metadata from HTML files via BeautifulSoup."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the text content of an HTML file together with its metadata.

        The file is first read as strict UTF-8. If it is not valid UTF-8, the
        raw bytes are handed to BeautifulSoup instead, which detects the
        encoding itself (BOM, ``<meta charset>`` declaration, fallbacks), so
        legacy-encoded pages no longer abort with ``UnicodeDecodeError``.

        Args:
            filepath: Path of the HTML file to parse.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the document text with
            ``<script>`` and ``<style>`` content removed, each text fragment
            stripped and the fragments joined by newlines. ``metadata`` holds
            ``"filetype"`` (always ``"html"``), ``"filename"`` (final path
            component) and ``"length"`` (number of characters in ``text``).

        Raises:
            OSError: If the file cannot be opened or read.
        """
        try:
            # Text mode keeps the original behaviour (including newline
            # translation) for every valid UTF-8 file.
            with open(filepath, "r", encoding="utf-8") as f:
                markup = f.read()
        except UnicodeDecodeError:
            # Not UTF-8: pass undecoded bytes so BeautifulSoup can work out
            # the real encoding instead of failing the whole parse.
            with open(filepath, "rb") as f:
                markup = f.read()

        soup = BeautifulSoup(markup, "html.parser")

        # Drop script and style elements so their contents do not end up in
        # the extracted text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        metadata = {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text),
        }
        return text, metadata