Spaces:

OnlyTheTruth03
/

OTT_Bot

Sleeping

OTT_Bot / kb_builder /parser /html_parser.py

Initial RAG bot

709c859 about 2 months ago

428 Bytes

	from bs4 import BeautifulSoup


	def parse_lesson(html):
	    """Parse lesson HTML into a title and a list of text sections.

	    Args:
	        html: Lesson markup, handed to BeautifulSoup with the "lxml" parser.

	    Returns:
	        A tuple ``(lesson_title, sections)``.

	        ``lesson_title`` is the whitespace-stripped text of the first
	        ``<h1>`` in the document, or ``"Untitled"`` when there is no
	        ``<h1>`` at all.

	        ``sections`` is a list of dicts, one per ``<section>``/``<article>``
	        element that contains any text, each with the keys:

	        * ``"heading"`` - text of the first ``<h2>``/``<h3>``/``<h4>`` inside
	          the element, or ``"General"`` when it has none.
	        * ``"text"`` - all text inside the element, pieces joined with
	          newlines (this includes the heading text itself).
	    """
	    soup = BeautifulSoup(html, "lxml")

	    # find() returns None when nothing matches, so guard before calling
	    # get_text(); a lesson without an <h1> must not crash the whole parse.
	    title_tag = soup.find("h1")
	    lesson_title = (
	        title_tag.get_text(strip=True) if title_tag is not None else "Untitled"
	    )

	    sections = []
	    # NOTE(review): find_all() also returns matches nested inside other
	    # matches, so a <section> inside an <article> is presumably emitted twice
	    # (once within the parent's text, once on its own) - confirm whether
	    # downstream consumers expect that.
	    for sec in soup.find_all(["section", "article"]):
	        text = sec.get_text("\n", strip=True)
	        if not text:
	            # Skip containers that hold no text at all.
	            continue

	        header = sec.find(["h2", "h3", "h4"])
	        sections.append({
	            "heading": (
	                header.get_text(strip=True) if header is not None else "General"
	            ),
	            "text": text,
	        })

	    return lesson_title, sections