Spaces:
Sleeping
Sleeping
| # indexer/chunker.py | |
| class Chunker: | |
| """ | |
| Splits extracted text into overlapping chunks using a sliding window. | |
| Each chunk will later be embedded as a separate vector. | |
| Why chunk at all? | |
| - Embedding models have a token limit (typically 256-512 tokens) | |
| - A 50-page PDF as one embedding would lose detail | |
| - Small chunks let us pinpoint the EXACT passage that matches a query | |
| Why overlap? | |
| - A sentence at the boundary might get cut in half | |
| - Overlap ensures every sentence appears fully in at least one chunk | |
| """ | |
| def __init__(self, chunk_size=500, overlap=50): | |
| """ | |
| Args: | |
| chunk_size (int) β max number of words per chunk | |
| overlap (int) β number of words shared between consecutive chunks | |
| TODO: | |
| - Store chunk_size and overlap as instance variables | |
| - Validate that overlap is less than chunk_size | |
| (if overlap >= chunk_size, chunks would never advance forward) | |
| """ | |
| self.chunk_size = chunk_size | |
| self.overlap = overlap | |
| if self.overlap >= self.chunk_size: | |
| raise ValueError("Overlap must be smaller than chunk_size") | |
| def chunk_text(self, text): | |
| """ | |
| Split a text string into overlapping chunks based on word count. | |
| Args: | |
| text (str) β the full extracted text from a file | |
| Returns: | |
| list[str] β list of text chunks | |
| Example with chunk_size=5, overlap=2: | |
| text = "The quick brown fox jumps over the lazy dog today" | |
| words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "today"] | |
| Chunk 0: words[0:5] β "The quick brown fox jumps" | |
| Chunk 1: words[3:8] β "fox jumps over the lazy" (step = 5-2 = 3) | |
| Chunk 2: words[6:11] β "the lazy dog today" (step = 3 again) | |
| TODO: | |
| - Split the text into a list of words using .split() | |
| - If the word list is empty, return an empty list | |
| - Calculate step size: step = chunk_size - overlap | |
| - Use a loop starting at 0, stepping by 'step', up to len(words) | |
| - At each position, take words[i : i + chunk_size] | |
| - Join each slice back into a string with " ".join() | |
| - Return the list of chunk strings | |
| HINT: | |
| words = text.split() | |
| step = self.chunk_size - self.overlap | |
| for i in range(0, len(words), step): | |
| chunk_words = words[i : i + self.chunk_size] | |
| """ | |
| words = text.split() | |
| if not words: | |
| return [] | |
| step = self.chunk_size - self.overlap | |
| chunks = [] | |
| for i in range(0, len(words), step): | |
| chunk_words = words[i:i+self.chunk_size] | |
| chunks.append(" ".join(chunk_words)) | |
| return chunks | |
| def chunk_file(self, text, filepath): | |
| """ | |
| Chunk a file's text and attach metadata to each chunk. | |
| This metadata will be stored in SQLite alongside the vectors. | |
| Args: | |
| text (str) β extracted text content | |
| filepath (str) β source file path (for metadata) | |
| Returns: | |
| list[dict] β each dict contains: | |
| { | |
| "text": "the chunk text...", | |
| "filepath": "/path/to/file.pdf", | |
| "chunk_index": 0, # position in the file | |
| "total_chunks": 5 # how many chunks this file produced | |
| } | |
| TODO: | |
| - Call self.chunk_text(text) to get the list of chunk strings | |
| - Build a list of dicts, one per chunk, with the fields shown above | |
| - chunk_index starts at 0 | |
| HINT: | |
| chunks = self.chunk_text(text) | |
| for i, chunk in enumerate(chunks): | |
| # build the dict here | |
| """ | |
| chunks = self.chunk_text(text) | |
| results = [] | |
| for i, chunk in enumerate(chunks): | |
| results.append({ | |
| "text": chunk, | |
| "filepath": filepath, | |
| "chunk_index": i, | |
| }) | |
| return results | |
| # --- Test it --- | |
| if __name__ == "__main__": | |
| chunker = Chunker(chunk_size=10, overlap=3) | |
| sample = ( | |
| "The quick brown fox jumps over the lazy dog. " | |
| "Semantic search finds files by meaning not just keywords. " | |
| "This is a test of the chunking system for our project." | |
| ) | |
| chunks = chunker.chunk_text(sample) | |
| print(f"Text has {len(sample.split())} words β {len(chunks)} chunks\n") | |
| for i, chunk in enumerate(chunks): | |
| print(f"Chunk {i}: {chunk}") | |
| print("\n--- With metadata ---") | |
| results = chunker.chunk_file(sample, "/test/sample.txt") | |
| for r in results: | |
| print(f"[{r['chunk_index']}] {r['text'][:60]}...") |