scholar-rag-engine / chunking.py
snakeeee's picture
Initial commit - Scholar RAG Engine
1505bbf
import re
def chunk_text(text, source, chunk_size=120):
    """Split *text* into sentence-aligned chunks of about *chunk_size* words.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Sentences that contain any of the exam-paper noise markers, or
    that have fewer than five whitespace-separated words, are dropped. The
    remaining sentences are packed greedily: a chunk is closed as soon as
    adding the next sentence would push its word count above *chunk_size*.

    A single sentence longer than *chunk_size* is never split; it becomes a
    chunk of its own.

    Args:
        text: Raw document text.
        source: Value stored under the ``"source"`` key of every chunk.
        chunk_size: Soft upper bound on the number of words per chunk.

    Returns:
        A list of dicts, each with the keys ``"source"`` and ``"text"``
        (the chunk's sentences joined by single spaces). Empty when no
        sentence survives filtering.
    """
    # NOTE(review): markers are matched as case-sensitive substrings, so
    # "Unit" also drops sentences containing e.g. "United" -- confirm this
    # is acceptable for the corpus.
    noise_markers = ("APRIL/MAY", "CO1", "Marks", "Bloom", "Unit", "Semester")
    min_words = 5

    chunks = []
    current = []
    length = 0
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()

        # remove exam noise
        if any(marker in sentence for marker in noise_markers):
            continue

        word_count = len(sentence.split())
        if word_count < min_words:
            continue

        # Only flush when something has been collected; otherwise a sentence
        # that alone exceeds chunk_size would emit an empty-text chunk.
        if current and length + word_count > chunk_size:
            chunks.append({
                "source": source,
                "text": " ".join(current),
            })
            current = []
            length = 0

        current.append(sentence)
        length += word_count

    if current:
        chunks.append({
            "source": source,
            "text": " ".join(current),
        })
    return chunks
def compress_context(text, question, top_k=3):
    """Return the *top_k* sentences of *text* that best match *question*.

    *text* is split on the literal ``". "``. Each sentence is scored by how
    many whitespace-separated words of the lower-cased question occur in it
    as case-insensitive substrings. Sentences are returned in descending
    score order; sentences with equal scores keep their order of appearance
    in *text*.

    Args:
        text: Context text to compress.
        question: Query whose words are used as keywords.
        top_k: Maximum number of sentences to keep (defaults to 3).

    Returns:
        The selected sentences joined with ``". "``.
    """
    sentences = text.split(". ")
    keywords = question.lower().split()

    def relevance(sentence):
        # Lower-case once per sentence rather than once per keyword.
        lowered = sentence.lower()
        return sum(1 for keyword in keywords if keyword in lowered)

    # Sort on the score alone: sorted() is stable (also with reverse=True),
    # so ties resolve to document order instead of comparing sentence text.
    ranked = sorted(sentences, key=relevance, reverse=True)
    return ". ".join(ranked[:top_k])