Spaces:
Sleeping
Sleeping
| from bs4 import BeautifulSoup | |
| import html2text | |
| from server.logger.logger_config import my_logger as logger | |
| class AsyncHtmlLoader: | |
| def __init__(self, file_path: str) -> None: | |
| logger.info(f"[FILE LOADER] init html, file_path: '{file_path}'") | |
| self.file_path = file_path | |
| async def get_content(self) -> str: | |
| try: | |
| content = '' | |
| with open(self.file_path, 'r') as fd: | |
| html_text = fd.read() | |
| # Use BeautifulSoup to parse HTML content | |
| soup = BeautifulSoup(html_text, 'html.parser') | |
| body_content = soup.find('body') | |
| # Create an html2text converter | |
| h = html2text.HTML2Text() | |
| content = h.handle(str(body_content)) | |
| if not content: | |
| logger.warning(f"file_path: '{self.file_path}' is empty!") | |
| return content | |
| except Exception as e: | |
| logger.error(f"get_content is failed, exception: {e}") | |
| return '' | |