Spaces:

leekwoon
/

kyobody-chatbot-api

Sleeping

1161dd2 over 1 year ago

1.02 kB

	from bs4 import BeautifulSoup
	import html2text
	from server.logger.logger_config import my_logger as logger


	class AsyncHtmlLoader:
	def __init__(self, file_path: str) -> None:
	logger.info(f"[FILE LOADER] init html, file_path: '{file_path}'")
	self.file_path = file_path

	async def get_content(self) -> str:
	try:
	content = ''
	with open(self.file_path, 'r') as fd:
	html_text = fd.read()

	# Use BeautifulSoup to parse HTML content
	soup = BeautifulSoup(html_text, 'html.parser')
	body_content = soup.find('body')

	# Create an html2text converter
	h = html2text.HTML2Text()
	content = h.handle(str(body_content))

	if not content:
	logger.warning(f"file_path: '{self.file_path}' is empty!")
	return content
	except Exception as e:
	logger.error(f"get_content is failed, exception: {e}")
	return ''