# Semantic_File / indexer / chunker.py
# JackSparrow89's picture
# Upload 65 files
# bb04c5f verified
# indexer/chunker.py
class Chunker:
    """Split extracted text into overlapping word-based chunks.

    A sliding window of ``chunk_size`` words moves over the text in steps of
    ``chunk_size - overlap`` words. Each chunk is meant to be embedded later
    as a separate vector.

    Why chunk at all?
        - Embedding models have a token limit (typically 256-512 tokens).
        - A 50-page PDF as one embedding would lose detail.
        - Small chunks let us pinpoint the exact passage that matches a query.

    Why overlap?
        - A sentence at a chunk boundary might get cut in half.
        - Overlap gives boundary sentences a chance to appear whole in a
          neighbouring chunk.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Configure the sliding window.

        Args:
            chunk_size: Maximum number of words per chunk. Must be positive.
            overlap: Number of words shared between consecutive chunks.
                Must be non-negative and smaller than ``chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, ``overlap`` is
                negative, or ``overlap >= chunk_size``.
        """
        if chunk_size <= 0:
            # A non-positive window would yield empty chunks.
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            # A negative overlap makes the step larger than the window, which
            # would silently drop the words between consecutive chunks.
            raise ValueError("overlap must not be negative")
        if overlap >= chunk_size:
            # With overlap >= chunk_size the window would never advance.
            raise ValueError("Overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> list:
        """Split a text string into overlapping chunks based on word count.

        Words are obtained with ``str.split()``, so any run of whitespace is a
        separator and chunks are re-joined with single spaces.

        Args:
            text: The full extracted text from a file. ``None`` or an
                empty/whitespace-only string yields no chunks.

        Returns:
            A list of chunk strings, in document order.

        Example with chunk_size=5, overlap=2 (step = 5 - 2 = 3):
            text = "The quick brown fox jumps over the lazy dog today"
            Chunk 0: words[0:5]  -> "The quick brown fox jumps"
            Chunk 1: words[3:8]  -> "fox jumps over the lazy"
            Chunk 2: words[6:11] -> "the lazy dog today"
        """
        words = text.split() if text else []
        if not words:
            return []

        step = self.chunk_size - self.overlap
        total_words = len(words)
        chunks = []
        for start in range(0, total_words, step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= total_words:
                # This window already reached the end of the text. Any later
                # window would only repeat a suffix of this chunk, so stop.
                break
        return chunks

    def chunk_file(self, text: str, filepath: str) -> list:
        """Chunk a file's text and attach metadata to each chunk.

        Args:
            text: Extracted text content.
            filepath: Source file path, copied verbatim into every record.

        Returns:
            A list of dicts, one per chunk, each of the form::

                {
                    "text": "the chunk text...",
                    "filepath": "/path/to/file.pdf",
                    "chunk_index": 0,   # zero-based position in the file
                    "total_chunks": 5,  # how many chunks this file produced
                }

            The list is empty when ``chunk_text`` produces no chunks.
        """
        chunks = self.chunk_text(text)
        total_chunks = len(chunks)
        return [
            {
                "text": chunk,
                "filepath": filepath,
                "chunk_index": index,
                "total_chunks": total_chunks,
            }
            for index, chunk in enumerate(chunks)
        ]
# --- Test it ---
# Manual smoke test: run this module directly to print the chunks produced
# for a short sample text, first as plain strings and then with metadata.
if __name__ == "__main__":
    # Small window values so the overlap is easy to see in the output.
    chunker = Chunker(chunk_size=10, overlap=3)
    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Semantic search finds files by meaning not just keywords. "
        "This is a test of the chunking system for our project."
    )

    chunks = chunker.chunk_text(sample)
    # The arrow was mis-encoded ("β†’") in the original string literal.
    print(f"Text has {len(sample.split())} words → {len(chunks)} chunks\n")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {chunk}")

    print("\n--- With metadata ---")
    results = chunker.chunk_file(sample, "/test/sample.txt")
    for r in results:
        # Show only the first 60 characters of each chunk to keep lines short.
        print(f"[{r['chunk_index']}] {r['text'][:60]}...")