Spaces:

Manjiri
/

RAG_JioPay_BOT

Sleeping

RAG_JioPay_BOT / faq_extract.py

Added scraping

6f7653c verified 7 months ago

725 Bytes

	# faq_extract.py (expects HTML or innerText string `page_text`)
	import re

	def extract_faq_pairs(page_text, *, max_answer_lines=11):
		"""Extract question/answer pairs from the text of an FAQ page.

		A line is treated as a question when it starts (case-insensitively)
		with one of the stems ``what``, ``how``, ``why``, ``can``, ``does``,
		``do``, ``is`` or ``are`` followed by a word boundary. The answer is
		the run of non-blank lines that follow it, joined with single spaces,
		ending at the next question line or after ``max_answer_lines`` lines,
		whichever comes first.

		Args:
			page_text: Text to scan, one candidate question/answer line per
				text line. ``None`` or an empty string yields an empty list.
			max_answer_lines: Maximum number of lines collected into a single
				answer. Defaults to 11, the window the original code used.

		Returns:
			A list of ``{"question": ..., "answer": ...}`` dicts in document
			order. ``answer`` is ``""`` when a question is immediately
			followed by another question or by the end of the text.
		"""
		if not page_text:
			return []

		# Alternation must use a bare ``|``; an escaped ``\|`` would match a
		# literal pipe character and no real question would ever be found.
		question_re = re.compile(
			r"^(what|how|why|can|does|do|is|are)\b", re.IGNORECASE
		)

		# Blank lines are discarded up front, so they never terminate an answer.
		lines = [ln.strip() for ln in page_text.splitlines() if ln.strip()]

		pairs = []
		for i, line in enumerate(lines):
			if not question_re.match(line):
				continue
			answer_lines = []
			# Look only at a bounded window after the question so one FAQ
			# entry cannot swallow the rest of the page.
			for candidate in lines[i + 1:i + 1 + max_answer_lines]:
				if question_re.match(candidate):
					break
				answer_lines.append(candidate)
			pairs.append({"question": line, "answer": " ".join(answer_lines)})
		return pairs