File size: 1,023 Bytes
1161dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from bs4 import BeautifulSoup
import html2text
from server.logger.logger_config import my_logger as logger


class AsyncHtmlLoader:
    """Load a local HTML file and render its content as text.

    The file is parsed with BeautifulSoup (``html.parser``) and the result
    is converted to text with ``html2text.HTML2Text``.
    """

    def __init__(self, file_path: str) -> None:
        """Store the path of the HTML file to load.

        Args:
            file_path: Path of the HTML file that ``get_content`` will read.
        """
        logger.info(f"[FILE LOADER] init html, file_path: '{file_path}'")
        self.file_path = file_path

    async def get_content(self) -> str:
        """Read the HTML file and return its text rendering.

        The ``<body>`` element is converted when present; for documents or
        fragments without a ``<body>`` tag the whole parsed document is
        converted instead.

        NOTE(review): the body contains no ``await``; the file read and the
        parsing run synchronously inside this coroutine.

        Returns:
            The text produced by html2text, or ``''`` when reading, parsing
            or converting raised an exception (the error is logged, not
            propagated).
        """
        try:
            # Decode explicitly as UTF-8 so the result does not depend on
            # the platform's locale default encoding.
            with open(self.file_path, 'r', encoding='utf-8') as fd:
                html_text = fd.read()

            # Use BeautifulSoup to parse HTML content
            soup = BeautifulSoup(html_text, 'html.parser')

            # find() yields None when there is no <body> tag; str(None)
            # would then be converted as the literal text "None", so fall
            # back to the whole document in that case.
            body_content = soup.find('body')
            if body_content is None:
                body_content = soup

            # Create an html2text converter
            h = html2text.HTML2Text()
            content = h.handle(str(body_content))

            # Whitespace-only output carries no content either, so warn for
            # it too; the returned value itself is left untouched.
            if not content.strip():
                logger.warning(f"file_path: '{self.file_path}' is empty!")
            return content
        except Exception as e:
            # Best-effort loader: report the failure and hand back an empty
            # result rather than propagating to the caller.
            logger.error(f"get_content is failed, exception: {e}")
            return ''