OTT_Bot / kb_builder /parser /html_parser.py
OnlyTheTruth03's picture
Initial RAG bot
709c859
raw
history blame contribute delete
428 Bytes
from bs4 import BeautifulSoup
def parse_lesson(html):
    """Parse a lesson HTML document into its title and text sections.

    Args:
        html: HTML markup of the lesson page, passed straight to
            ``BeautifulSoup`` with the ``lxml`` parser.

    Returns:
        A ``(lesson_title, sections)`` tuple. ``lesson_title`` is the
        stripped text of the first ``<h1>``, or ``"Untitled"`` when the
        document has no ``<h1>``. ``sections`` is a list of
        ``{"heading": ..., "text": ...}`` dicts, one for every
        ``<section>``/``<article>`` element that contains text.
        ``heading`` is the first ``<h2>``/``<h3>``/``<h4>`` found inside
        the element, or ``"General"`` when there is none; ``text`` is the
        element's full text (heading included), newline-separated.
    """
    soup = BeautifulSoup(html, "lxml")

    # find() returns None when no <h1> exists; fall back to a placeholder
    # title instead of raising AttributeError, mirroring the "General"
    # fallback used for sections without a heading.
    title_tag = soup.find("h1")
    if title_tag is not None:
        lesson_title = title_tag.get_text(strip=True)
    else:
        lesson_title = "Untitled"

    sections = []
    # NOTE(review): find_all() also matches nested <section>/<article>
    # elements, so text of a nested element appears both in its own entry
    # and in its ancestor's entry -- confirm this is intended for the KB.
    for sec in soup.find_all(["section", "article"]):
        text = sec.get_text("\n", strip=True)
        if not text:
            # Skip elements with no visible text.
            continue
        header = sec.find(["h2", "h3", "h4"])
        sections.append({
            "heading": header.get_text(strip=True) if header else "General",
            "text": text,
        })
    return lesson_title, sections