Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

Semantic_File / indexer /chunker.py

JackSparrow89

Upload 65 files

bb04c5f verified about 2 months ago

raw

history blame contribute delete

4.96 kB

	# indexer/chunker.py


	class Chunker:
	    """
	    Split extracted text into overlapping word-based chunks (sliding window).

	    Each chunk is meant to be embedded later as a separate vector.

	    Why chunk at all?
	    - Embedding models have a token limit (typically 256-512 tokens)
	    - A 50-page PDF as one embedding would lose detail
	    - Small chunks let us pinpoint the EXACT passage that matches a query

	    Why overlap?
	    - A sentence at the boundary might get cut in half
	    - Overlap gives boundary sentences a second chance to appear whole
	      in the neighbouring chunk
	    """

	    def __init__(self, chunk_size: int = 500, overlap: int = 50):
	        """
	        Configure the sliding window.

	        Args:
	            chunk_size (int): max number of words per chunk; must be > 0.
	            overlap (int): number of words shared between consecutive
	                chunks; must satisfy 0 <= overlap < chunk_size.

	        Raises:
	            ValueError: if chunk_size is not positive, if overlap is
	                negative, or if overlap >= chunk_size.
	        """
	        if chunk_size <= 0:
	            raise ValueError("chunk_size must be a positive integer")
	        if overlap < 0:
	            # A negative overlap would make the step larger than the window
	            # and silently drop the words that fall between two chunks.
	            raise ValueError("overlap must not be negative")
	        if overlap >= chunk_size:
	            # The window would never advance (step <= 0).
	            raise ValueError("Overlap must be smaller than chunk_size")
	        self.chunk_size = chunk_size
	        self.overlap = overlap

	    def chunk_text(self, text: str) -> list:
	        """
	        Split a text string into overlapping chunks based on word count.

	        Words are whatever ``str.split()`` yields, so runs of whitespace
	        (including newlines) are collapsed to single spaces in the output.

	        Args:
	            text (str): the full extracted text from a file. ``None`` and
	                empty / whitespace-only strings produce no chunks.

	        Returns:
	            list of str: the chunks, in document order. The last chunk may
	            be shorter than chunk_size.

	        Example with chunk_size=5, overlap=2 (step = 5 - 2 = 3):
	            text = "The quick brown fox jumps over the lazy dog today"

	            Chunk 0: words[0:5]  → "The quick brown fox jumps"
	            Chunk 1: words[3:8]  → "fox jumps over the lazy"
	            Chunk 2: words[6:11] → "the lazy dog today"
	        """
	        if not text:
	            return []
	        words = text.split()
	        if not words:
	            return []

	        step = self.chunk_size - self.overlap
	        total_words = len(words)
	        chunks = []
	        for start in range(0, total_words, step):
	            end = start + self.chunk_size
	            chunks.append(" ".join(words[start:end]))
	            if end >= total_words:
	                # This window already reached the last word. Any further
	                # window would contain only words this chunk already has,
	                # producing a redundant duplicate tail chunk.
	                break
	        return chunks

	    def chunk_file(self, text: str, filepath: str) -> list:
	        """
	        Chunk a file's text and attach metadata to each chunk.

	        Args:
	            text (str): extracted text content.
	            filepath (str): source file path, copied verbatim into every
	                record.

	        Returns:
	            list of dict: one record per chunk, in order, shaped as
	                {
	                    "text": "the chunk text...",
	                    "filepath": "/path/to/file.pdf",
	                    "chunk_index": 0,   # 0-based position in the file
	                    "total_chunks": 5   # how many chunks this file produced
	                }
	            An empty list when the text yields no chunks.
	        """
	        chunks = self.chunk_text(text)
	        total = len(chunks)
	        return [
	            {
	                "text": chunk,
	                "filepath": filepath,
	                "chunk_index": index,
	                "total_chunks": total,
	            }
	            for index, chunk in enumerate(chunks)
	        ]


	# --- Manual smoke test: runs only when this file is executed directly ---
	if __name__ == "__main__":
	    # Three sentences joined by single spaces form the demo document.
	    sample = " ".join(
	        [
	            "The quick brown fox jumps over the lazy dog.",
	            "Semantic search finds files by meaning not just keywords.",
	            "This is a test of the chunking system for our project.",
	        ]
	    )
	    chunker = Chunker(chunk_size=10, overlap=3)

	    # Plain chunking: report the word count, then list every chunk.
	    pieces = chunker.chunk_text(sample)
	    word_count = len(sample.split())
	    print(f"Text has {word_count} words → {len(pieces)} chunks\n")
	    for position, piece in enumerate(pieces):
	        print(f"Chunk {position}: {piece}")

	    # Chunking with metadata: show each record's index and a short preview.
	    print("\n--- With metadata ---")
	    for record in chunker.chunk_file(sample, "/test/sample.txt"):
	        preview = record["text"][:60]
	        print(f"[{record['chunk_index']}] {preview}...")