Spaces:

samithcs
/

RAG_Book_QA_System

Sleeping

File size: 747 Bytes

63105da

from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict 

class HTMLParser(ParserBase):
    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        with open(filepath, "r", encoding="utf-8") as f:
            html = f.read()
        soup = BeautifulSoup(html, "html.parser")
        # Extract all visible text (ignore script, style)
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        metadata = {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
        return text, metadata