Spaces:

snakeeee
/

scholar-rag-engine

Sleeping

File size: 1,259 Bytes

1505bbf

import re

# Case-sensitive substrings; any sentence containing one of these is dropped
# as exam-paper boilerplate.
_NOISE_MARKERS = (
    "APRIL/MAY",
    "CO1",
    "Marks",
    "Bloom",
    "Unit",
    "Semester",
)

# Split on whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

# Sentences with fewer words than this are discarded.
_MIN_SENTENCE_WORDS = 5


def chunk_text(text, source, chunk_size=120):
    """Group the sentences of *text* into chunks of roughly *chunk_size* words.

    The text is split into sentences after ``.``, ``!`` or ``?``.  Sentences
    that contain any noise marker, or that have fewer than five words, are
    skipped.  The remaining sentences are packed greedily, in order, into
    chunks; a new chunk is started whenever adding the next sentence would
    push the running word count above *chunk_size*.  Sentences are never
    split, so a single sentence longer than *chunk_size* forms its own chunk.

    Args:
        text: The raw text to chunk.
        source: Value stored under the ``"source"`` key of every chunk.
        chunk_size: Target maximum number of words per chunk.

    Returns:
        A list of ``{"source": source, "text": chunk_text}`` dicts.  The list
        is empty when no sentence survives filtering.
    """
    chunks = []
    current = []
    length = 0

    for sentence in _SENTENCE_BOUNDARY.split(text):
        sentence = sentence.strip()

        # Remove exam noise.
        if any(marker in sentence for marker in _NOISE_MARKERS):
            continue

        word_count = len(sentence.split())
        if word_count < _MIN_SENTENCE_WORDS:
            continue

        # Flush only when there is something to flush; otherwise an oversized
        # sentence arriving on an empty buffer would emit an empty chunk.
        if current and length + word_count > chunk_size:
            chunks.append({
                "source": source,
                "text": " ".join(current),
            })
            current = []
            length = 0

        current.append(sentence)
        length += word_count

    if current:
        chunks.append({
            "source": source,
            "text": " ".join(current),
        })

    return chunks
def compress_context(text, question, top_k=3):
    """Return the sentences of *text* that best match *question*.

    *text* is split on ``". "``.  Each sentence is scored by how many of the
    whitespace-separated, lower-cased words of *question* occur in it as
    substrings (case-insensitive).  The *top_k* highest-scoring sentences are
    joined back together with ``". "``, highest score first.

    Args:
        text: The context to compress.
        question: The query whose words are used as keywords.
        top_k: Maximum number of sentences to keep.

    Returns:
        A string of at most *top_k* sentences joined by ``". "``.
    """
    sentences = text.split(". ")
    keywords = question.lower().split()

    def relevance(sentence):
        # Lower-case once per sentence rather than once per keyword.
        lowered = sentence.lower()
        return sum(1 for keyword in keywords if keyword in lowered)

    # Sort on the score alone: sorted() is stable (also with reverse=True), so
    # equally scored sentences keep their document order instead of being
    # ordered by comparing their text.
    ranked = sorted(sentences, key=relevance, reverse=True)

    return ". ".join(ranked[:max(top_k, 0)])