# indexer/chunker.py


class Chunker:
    """
    Splits extracted text into overlapping chunks using a sliding window.
    Each chunk will later be embedded as a separate vector.

    Why chunk at all?
    - Embedding models have a token limit (typically 256-512 tokens)
    - A 50-page PDF as one embedding would lose detail
    - Small chunks let us pinpoint the EXACT passage that matches a query

    Why overlap?
    - A sentence at the boundary might get cut in half
    - With overlap, text cut at the end of one chunk is repeated at the
      start of the next, so a sentence shorter than the overlap still
      appears whole in at least one chunk
    """

    def __init__(self, chunk_size=500, overlap=50):
        """
        Configure the sliding window.

        Args:
            chunk_size (int) — max number of words per chunk; must be > 0
            overlap (int) — number of words shared between consecutive
                chunks; must satisfy 0 <= overlap < chunk_size

        Raises:
            ValueError — if chunk_size is not positive, if overlap is
                negative, or if overlap >= chunk_size
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            # A negative overlap would make the step larger than the window
            # and silently drop words between consecutive chunks.
            raise ValueError("Overlap must not be negative")
        if overlap >= chunk_size:
            # The window would never advance (step <= 0).
            raise ValueError("Overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text):
        """
        Split a text string into overlapping chunks based on word count.

        Words are whatever str.split() yields (runs of non-whitespace), and
        each chunk is its words re-joined with single spaces, so original
        whitespace/newlines are not preserved.

        Args:
            text (str) — the full extracted text from a file

        Returns:
            list[str] — list of text chunks; empty if the text contains
                no words

        Example with chunk_size=5, overlap=2:
            text = "The quick brown fox jumps over the lazy dog today"
            words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "today"]

            Chunk 0: words[0:5]  → "The quick brown fox jumps"
            Chunk 1: words[3:8]  → "fox jumps over the lazy"      (step = 5-2 = 3)
            Chunk 2: words[6:11] → "the lazy dog today"           (step = 3 again)

            No further chunk is produced: chunk 2 already reaches the last
            word, so a window starting at 9 would only repeat "today".
        """
        words = text.split()
        if not words:
            return []

        step = self.chunk_size - self.overlap
        word_count = len(words)
        chunks = []
        for start in range(0, word_count, step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= word_count:
                # This window already covers the final word; any later
                # window would be a pure subset of it (at most `overlap`
                # trailing words), i.e. a redundant duplicate embedding.
                break
        return chunks

    def chunk_file(self, text, filepath):
        """
        Chunk a file's text and attach metadata to each chunk.
        This metadata will be stored in SQLite alongside the vectors.

        Args:
            text (str) — extracted text content
            filepath (str) — source file path (for metadata)

        Returns:
            list[dict] — one dict per chunk, in order, each containing:
                {
                    "text": "the chunk text...",
                    "filepath": "/path/to/file.pdf",
                    "chunk_index": 0,     # position in the file, from 0
                    "total_chunks": 5     # how many chunks this file produced
                }
            An empty list is returned when the text contains no words.
        """
        chunks = self.chunk_text(text)
        total_chunks = len(chunks)
        return [
            {
                "text": chunk,
                "filepath": filepath,
                "chunk_index": index,
                "total_chunks": total_chunks,
            }
            for index, chunk in enumerate(chunks)
        ]


# --- Test it ---
if __name__ == "__main__":
    # Manual smoke test: chunk a short three-sentence sample and print the
    # result, first as plain chunk strings and then as metadata records.
    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Semantic search finds files by meaning not just keywords. "
        "This is a test of the chunking system for our project."
    )
    chunker = Chunker(chunk_size=10, overlap=3)

    # Plain chunks: report the word count and list every chunk in order.
    word_count = len(sample.split())
    pieces = chunker.chunk_text(sample)
    print(f"Text has {word_count} words → {len(pieces)} chunks\n")
    for position, piece in enumerate(pieces):
        print(f"Chunk {position}: {piece}")

    # Metadata records: show each chunk's index and a 60-character preview.
    print("\n--- With metadata ---")
    for record in chunker.chunk_file(sample, "/test/sample.txt"):
        preview = record["text"][:60]
        print(f"[{record['chunk_index']}] {preview}...")