Spaces:

joelg
/

discover_rag

Sleeping

App Files Files Community

joelg committed on Oct 8

Commit

e17ceac

1 Parent(s): a8115f1

ADD better chunking visualisation

Browse files

Files changed (1) hide show

rag_system.py +91 -10

rag_system.py CHANGED Viewed

@@ -14,6 +14,7 @@ import spaces
 class RAGSystem:
     def __init__(self):
         self.chunks = []
         self.embeddings = None
         self.index = None
         self.embedding_model = None
@@ -66,10 +67,8 @@ class RAGSystem:
             self.ready = True
-            # Format chunks for display
-            chunks_display = "### Processed Chunks\n\n"
-            for i, chunk in enumerate(self.chunks, 1):
-                chunks_display += f"**Chunk {i}** ({len(chunk)} chars)\n```\n{chunk[:200]}{'...' if len(chunk) > 200 else ''}\n```\n\n"
             status = f"✅ Success! Processed {len(pdf_files)} documents into {len(self.chunks)} chunks."
             return status, chunks_display, corpus_summary
@@ -88,14 +87,17 @@ class RAGSystem:
         return text
     def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
-        """Split text into overlapping chunks"""
         chunks = []
         start = 0
         text_length = len(text)
         while start < text_length:
             end = start + chunk_size
             chunk = text[start:end]
             # Try to break at sentence boundary
             if end < text_length:
@@ -107,11 +109,33 @@ class RAGSystem:
                 if break_point > chunk_size * 0.5:  # Only break if we're past halfway
                     chunk = chunk[:break_point + 1]
                     end = start + break_point + 1
             chunks.append(chunk.strip())
             start = end - overlap
-        return [c for c in chunks if len(c) > 50]  # Filter out very small chunks
     @spaces.GPU
     def create_embeddings(self, texts: List[str]) -> np.ndarray:
@@ -158,10 +182,8 @@ class RAGSystem:
             self.ready = True
-            # Format chunks for display
-            chunks_display = "### Processed Chunks\n\n"
-            for i, chunk in enumerate(self.chunks, 1):
-                chunks_display += f"**Chunk {i}** ({len(chunk)} chars)\n```\n{chunk}\n```\n\n"
             status = f"✅ Success! Processed {len(self.chunks)} chunks from the document."
             return status, chunks_display, text[:5000]  # Return first 5000 chars of original text
@@ -170,6 +192,65 @@ class RAGSystem:
             self.ready = False
             return f"Error processing document: {str(e)}", "", ""
     def set_embedding_model(self, model_name: str):
         """Set or change the embedding model"""
         if self.embedding_model_name != model_name:

 class RAGSystem:
     def __init__(self):
         self.chunks = []
+        self.chunk_metadata = []  # Store chunk positions for overlap visualization
         self.embeddings = None
         self.index = None
         self.embedding_model = None
             self.ready = True
+            # Format chunks for display with overlap highlighting
+            chunks_display = self._format_chunks_with_overlap()
             status = f"✅ Success! Processed {len(pdf_files)} documents into {len(self.chunks)} chunks."
             return status, chunks_display, corpus_summary
         return text
     def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
+        """Split text into overlapping chunks and store metadata"""
         chunks = []
+        self.chunk_metadata = []  # Reset metadata
         start = 0
         text_length = len(text)
+        previous_end = 0
         while start < text_length:
             end = start + chunk_size
             chunk = text[start:end]
+            original_end = end
             # Try to break at sentence boundary
             if end < text_length:
                 if break_point > chunk_size * 0.5:  # Only break if we're past halfway
                     chunk = chunk[:break_point + 1]
                     end = start + break_point + 1
+                    original_end = end
+            # Calculate overlap with previous chunk
+            overlap_start = max(0, start - previous_end) if previous_end > 0 else 0
+            overlap_length = min(overlap, previous_end - start) if start < previous_end else 0
             chunks.append(chunk.strip())
+            self.chunk_metadata.append({
+                'start': start,
+                'end': original_end,
+                'overlap_with_previous': overlap_length,
+                'text': chunk
+            })
+            previous_end = original_end
             start = end - overlap
+        # Filter out very small chunks and update metadata accordingly
+        filtered_chunks = []
+        filtered_metadata = []
+        for i, c in enumerate(chunks):
+            if len(c) > 50:
+                filtered_chunks.append(c)
+                filtered_metadata.append(self.chunk_metadata[i])
+        self.chunk_metadata = filtered_metadata
+        return filtered_chunks
     @spaces.GPU
     def create_embeddings(self, texts: List[str]) -> np.ndarray:
             self.ready = True
+            # Format chunks for display with overlap highlighting
+            chunks_display = self._format_chunks_with_overlap()
             status = f"✅ Success! Processed {len(self.chunks)} chunks from the document."
             return status, chunks_display, text[:5000]  # Return first 5000 chars of original text
             self.ready = False
             return f"Error processing document: {str(e)}", "", ""
+    def _format_chunks_with_overlap(self) -> str:
+        """Format chunks with overlap highlighting for pedagogical display"""
+        if not self.chunks or not self.chunk_metadata:
+            return "No chunks available"
+        display = "### 📑 Processed Chunks\n\n"
+        display += "*Overlapping parts are shown separately with a yellow marker (⚠️)*\n\n"
+        display += "---\n\n"
+        for i, (chunk, metadata) in enumerate(zip(self.chunks, self.chunk_metadata), 1):
+            # Calculate which part is overlapping with previous chunk
+            if i == 1:
+                # First chunk has no overlap
+                display += f"#### 📄 Chunk {i}\n"
+                display += f"**{len(chunk)} characters** | 🆕 No overlap (first chunk)\n\n"
+                display += f"```text\n{chunk}\n```\n\n"
+                display += "---\n\n"
+            else:
+                # Find overlap with previous chunk
+                prev_chunk = self.chunks[i-2]
+                # Find common substring at the beginning of current chunk
+                overlap_length = 0
+                for j in range(1, min(len(chunk), len(prev_chunk)) + 1):
+                    if prev_chunk[-j:] == chunk[:j]:
+                        overlap_length = j
+                if overlap_length > 0:
+                    overlap_text = chunk[:overlap_length]
+                    remaining_text = chunk[overlap_length:]
+                    display += f"#### 📄 Chunk {i}\n"
+                    display += f"**{len(chunk)} characters** | ⚠️ **{overlap_length} characters overlap** with previous chunk\n\n"
+                    # Show overlap
+                    display += f"> **⚠️ OVERLAP ({overlap_length} chars) - Repeated from Chunk {i-1}:**\n"
+                    display += f"> ```text\n"
+                    for line in overlap_text.split('\n'):
+                        display += f"> {line}\n"
+                    display += f"> ```\n\n"
+                    # Show the new content
+                    display += f"**🆕 NEW CONTENT ({len(remaining_text)} chars):**\n"
+                    display += f"```text\n{remaining_text}\n```\n\n"
+                    # Show full chunk for reference
+                    display += f"<details>\n<summary>📋 Click to view complete chunk (overlap + new)</summary>\n\n"
+                    display += f"```text\n{chunk}\n```\n\n"
+                    display += f"</details>\n\n"
+                else:
+                    # No overlap found (shouldn't happen normally)
+                    display += f"#### 📄 Chunk {i}\n"
+                    display += f"**{len(chunk)} characters** | No overlap detected\n\n"
+                    display += f"```text\n{chunk}\n```\n\n"
+                display += "---\n\n"
+        return display
     def set_embedding_model(self, model_name: str):
         """Set or change the embedding model"""
         if self.embedding_model_name != model_name: