Spaces:
Sleeping
Sleeping
File size: 747 Bytes
63105da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict
class HTMLParser(ParserBase):
    """Parser that extracts visible text and basic metadata from an HTML file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the visible text of an HTML file plus a small metadata dict.

        Args:
            filepath: Path of the HTML file to read.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the document text with
            ``<script>`` and ``<style>`` elements removed; each text fragment
            is stripped and the fragments are joined with newlines.
            ``metadata`` contains ``"filetype"`` (always ``"html"``),
            ``"filename"`` (the final path component) and ``"length"``
            (the number of characters in ``text``).

        Raises:
            OSError: If the file cannot be opened or read.
        """
        try:
            # UTF-8 is tried first so files that already decoded cleanly
            # produce exactly the same text as before.
            with open(filepath, "r", encoding="utf-8") as f:
                markup = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: hand the raw bytes to BeautifulSoup, which
            # detects the encoding itself (BOM, <meta charset>, fallbacks)
            # instead of failing on legacy-encoded pages.
            with open(filepath, "rb") as f:
                markup = f.read()

        soup = BeautifulSoup(markup, "html.parser")

        # Drop non-visible content so only human-readable text remains.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        metadata = {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text),
        }
        return text, metadata