Spaces:
Sleeping
Sleeping
| import re | |
def chunk_text(text, source, chunk_size=120):
    """Split *text* into sentence-aligned chunks of at most ~*chunk_size* words.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. Sentences that contain an exam-boilerplate marker, or that
    have fewer than five words, are discarded. The remaining sentences are
    packed greedily into chunks; a chunk is closed as soon as adding the next
    sentence would push its word count above *chunk_size*.

    A single sentence longer than *chunk_size* is never split: it becomes a
    chunk on its own.

    Args:
        text: Raw text to chunk.
        source: Label stored unchanged under the ``"source"`` key of every
            chunk.
        chunk_size: Word budget per chunk.

    Returns:
        A list of ``{"source": source, "text": chunk_text}`` dicts, in the
        order the sentences appear in *text*. Empty when no sentence
        survives filtering.
    """
    # Case-sensitive substrings that mark exam noise rather than content.
    noise_markers = ("APRIL/MAY", "CO1", "Marks", "Bloom", "Unit", "Semester")
    min_words = 5

    chunks = []
    current = []
    length = 0

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()

        # remove exam noise
        if any(marker in sentence for marker in noise_markers):
            continue

        word_count = len(sentence.split())
        if word_count < min_words:
            continue

        # Flush only when something is buffered: an oversized sentence that
        # arrives while the buffer is empty must not produce an empty chunk.
        if current and length + word_count > chunk_size:
            chunks.append({
                "source": source,
                "text": " ".join(current),
            })
            current = []
            length = 0

        current.append(sentence)
        length += word_count

    if current:
        chunks.append({
            "source": source,
            "text": " ".join(current),
        })

    return chunks
def compress_context(text, question, max_sentences=3):
    """Return the sentences of *text* that best match the words of *question*.

    *text* is split on ``". "``. Each sentence is scored by how many question
    keywords occur in it as case-insensitive substrings. The highest-scoring
    sentences are joined back with ``". "``, best first; sentences with equal
    scores keep their original relative order.

    Args:
        text: Context to compress.
        question: Question whose whitespace-separated words act as keywords.
            Surrounding punctuation is stripped from each word, so the
            trailing ``?`` of a question does not prevent a match.
        max_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined with ``". "``.
    """
    sentences = text.split(". ")

    # Strip punctuation glued to the words ("osmosis?" -> "osmosis") and drop
    # tokens that were punctuation only.
    keywords = [
        word
        for word in (
            raw.strip(".,;:!?\"'()[]{}") for raw in question.lower().split()
        )
        if word
    ]

    def score(sentence):
        lowered = sentence.lower()  # lower-case once per sentence
        return sum(1 for keyword in keywords if keyword in lowered)

    # sorted() is stable even with reverse=True, so ties stay in document
    # order instead of being ordered by the sentence text.
    ranked = sorted(sentences, key=score, reverse=True)
    return ". ".join(ranked[:max_sentences])