Spaces:

snakeeee
/

scholar-rag-engine

Sleeping

Initial commit - Scholar RAG Engine

1505bbf about 1 month ago

1.26 kB

	import re

	def chunk_text(text, source, chunk_size=120):
	    """Split *text* into sentence-aligned chunks of about *chunk_size* words.

	    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
	    whitespace. Sentences that contain an exam-paper noise marker, or that
	    have fewer than five words, are dropped. The remaining sentences are
	    packed greedily: a chunk is closed as soon as adding the next sentence
	    would push its word count above *chunk_size*.

	    Args:
	        text: Raw document text.
	        source: Identifier stored unchanged under the ``"source"`` key of
	            every chunk.
	        chunk_size: Soft upper bound on words per chunk. A single sentence
	            longer than this is kept whole and becomes its own chunk.

	    Returns:
	        A list of ``{"source": source, "text": chunk}`` dicts, in document
	        order. Empty when no sentence survives filtering.
	    """
	    # NOTE: markers are matched as case-sensitive substrings, so "Unit"
	    # also drops sentences containing e.g. "United" or "Units".
	    noise_markers = (
	        "APRIL/MAY",
	        "CO1",
	        "Marks",
	        "Bloom",
	        "Unit",
	        "Semester",
	    )

	    chunks = []
	    current = []
	    length = 0

	    for sentence in re.split(r'(?<=[.!?])\s+', text):
	        sentence = sentence.strip()

	        # remove exam noise
	        if any(marker in sentence for marker in noise_markers):
	            continue

	        word_count = len(sentence.split())
	        if word_count < 5:
	            continue

	        # Flush only when something has been collected; otherwise an
	        # over-long first sentence would emit a chunk with empty text.
	        if current and length + word_count > chunk_size:
	            chunks.append({
	                "source": source,
	                "text": " ".join(current),
	            })
	            current = []
	            length = 0

	        current.append(sentence)
	        length += word_count

	    if current:
	        chunks.append({
	            "source": source,
	            "text": " ".join(current),
	        })

	    return chunks
	def compress_context(text, question, top_k=3):
	    """Return the *top_k* sentences of *text* most relevant to *question*.

	    The text is split on ``". "``. Each sentence is scored by how many
	    whitespace-separated, lower-cased words of *question* occur in it as
	    substrings. The highest-scoring sentences are joined with ``". "``,
	    best first; sentences with equal scores keep their original order.

	    Args:
	        text: Context to compress.
	        question: Query whose words are used as keywords.
	        top_k: Maximum number of sentences to keep (default 3). Values
	            below zero are treated as zero.

	    Returns:
	        The selected sentences joined by ``". "``, without a trailing
	        period; an empty string when *text* has no non-blank sentence.
	    """
	    keywords = question.lower().split()

	    scored = []
	    for fragment in text.split(". "):
	        # The last fragment keeps its own period after the split; drop it
	        # so that re-joining never produces "..".
	        if fragment.endswith("."):
	            fragment = fragment[:-1]
	        if not fragment.strip():
	            continue

	        lowered = fragment.lower()
	        score = sum(1 for keyword in keywords if keyword in lowered)
	        scored.append((score, fragment))

	    # Sort on the score only: the sort is stable, so ties stay in document
	    # order instead of being ordered by the sentence text.
	    scored.sort(key=lambda pair: pair[0], reverse=True)

	    return ". ".join(fragment for _, fragment in scored[:max(top_k, 0)])