LvMAC committed on
Commit
ae42893
·
verified ·
1 Parent(s): 45ff068

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +202 -242
src/streamlit_app.py CHANGED
@@ -8,15 +8,11 @@ import pandas as pd
8
  import io
9
  import time
10
  from typing import List, Dict, Any
11
- import PyPDF2
12
- import openpyxl
13
- from docx import Document
14
- import csv
15
 
16
  # Safe model loading without cache permission issues
17
  @st.cache_resource
18
  def load_sentence_transformer():
19
- st.info("⚠️ Semantic chunking disabled in HuggingFace environment")
20
  return None
21
 
22
  @st.cache_resource
@@ -29,12 +25,12 @@ def load_nltk():
29
  try:
30
  nltk.download('punkt', quiet=True)
31
  except:
32
- pass # Skip if download fails
33
  return nltk
34
  except ImportError:
35
  return None
36
 
37
- class ProductionChunkVisualizer:
38
  def __init__(self):
39
  self.colors = [
40
  '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
@@ -54,7 +50,7 @@ class ProductionChunkVisualizer:
54
  def extract_text_from_pdf(self, pdf_file):
55
  """Extract text from PDF file"""
56
  try:
57
- # Reset file pointer to beginning
58
  pdf_file.seek(0)
59
  pdf_reader = PyPDF2.PdfReader(pdf_file)
60
  text = ""
@@ -70,21 +66,19 @@ class ProductionChunkVisualizer:
70
  st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
71
 
72
  if not text.strip():
73
- st.warning("PDF appears to be image-based or empty. No text extracted.")
74
  return "No extractable text found in PDF document."
75
 
76
  return text.strip()
77
  except Exception as e:
78
  st.error(f"Error reading PDF: {str(e)}")
79
- return f"PDF processing error: {str(e)}"
80
 
81
  def extract_text_from_excel(self, excel_file):
82
  """Extract text from Excel file"""
83
  try:
84
- # Reset file pointer to beginning
85
  excel_file.seek(0)
86
 
87
- # Try different engines
88
  try:
89
  xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
90
  except:
@@ -101,13 +95,11 @@ class ProductionChunkVisualizer:
101
  text += f"\n=== Sheet: {sheet_name} ===\n"
102
 
103
  if not df.empty:
104
- # Add column headers
105
  headers = " | ".join(str(col) for col in df.columns)
106
  text += f"Headers: {headers}\n"
107
  text += "-" * 50 + "\n"
108
 
109
- # Add data rows (limit to prevent massive output)
110
- max_rows = min(100, len(df)) # Limit to 100 rows per sheet
111
  for idx, row in df.head(max_rows).iterrows():
112
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
113
  text += row_text + "\n"
@@ -116,21 +108,18 @@ class ProductionChunkVisualizer:
116
  text += f"... ({len(df) - max_rows} more rows)\n"
117
  else:
118
  text += "Empty sheet\n"
119
-
120
  text += "\n"
121
 
122
  return text.strip()
123
  except Exception as e:
124
  st.error(f"Error reading Excel file: {str(e)}")
125
- return f"Excel processing error: {str(e)}"
126
 
127
  def extract_text_from_csv(self, csv_file):
128
  """Extract text from CSV file"""
129
  try:
130
- # Reset file pointer to beginning
131
  csv_file.seek(0)
132
 
133
- # Try different encodings
134
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
135
  try:
136
  csv_file.seek(0)
@@ -139,20 +128,18 @@ class ProductionChunkVisualizer:
139
  except UnicodeDecodeError:
140
  continue
141
  else:
142
- df = pd.read_csv(csv_file) # Default encoding
143
 
144
  if df.empty:
145
  return "Empty CSV file"
146
 
147
  st.write(f"πŸ“‹ Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
148
 
149
- # Create readable text format
150
  text = "=== CSV Data ===\n"
151
  headers = " | ".join(str(col) for col in df.columns)
152
  text += f"Headers: {headers}\n"
153
  text += "-" * 50 + "\n"
154
 
155
- # Limit rows to prevent massive output
156
  max_rows = min(100, len(df))
157
  for _, row in df.head(max_rows).iterrows():
158
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
@@ -164,11 +151,13 @@ class ProductionChunkVisualizer:
164
  return text.strip()
165
  except Exception as e:
166
  st.error(f"Error reading CSV file: {str(e)}")
167
- return f"CSV processing error: {str(e)}"
168
 
169
  def extract_text_from_docx(self, docx_file):
170
  """Extract text from Word document"""
171
  try:
 
 
172
  doc = Document(docx_file)
173
  text = ""
174
 
@@ -176,7 +165,6 @@ class ProductionChunkVisualizer:
176
  if paragraph.text.strip():
177
  text += paragraph.text + "\n"
178
 
179
- # Also extract text from tables
180
  for table in doc.tables:
181
  text += "\n=== Table ===\n"
182
  for row in table.rows:
@@ -215,7 +203,6 @@ class ProductionChunkVisualizer:
215
  chunk = text[start:]
216
  else:
217
  chunk = text[start:end]
218
- # Find last complete word
219
  if not text[end].isspace():
220
  last_space = chunk.rfind(' ')
221
  if last_space > chunk_size * 0.7:
@@ -248,7 +235,6 @@ class ProductionChunkVisualizer:
248
  chunk_sentences = sentences[i:i + sentences_per_chunk]
249
  chunk_text = ' '.join(chunk_sentences)
250
 
251
- # Find actual position in original text
252
  start_pos = text.find(chunk_sentences[0], current_pos)
253
  if start_pos == -1:
254
  start_pos = current_pos
@@ -297,11 +283,6 @@ class ProductionChunkVisualizer:
297
 
298
  return chunks
299
 
300
- def semantic_chunking(self, text: str, similarity_threshold: float = 0.5) -> List[Dict]:
301
- """Disabled semantic chunking - fallback to sentence-based"""
302
- st.warning("Semantic chunking unavailable in this environment. Using sentence-based fallback.")
303
- return self.sentence_chunking(text, 3)
304
-
305
  def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
306
  """Hierarchical text splitting with multiple separators"""
307
  separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
@@ -368,7 +349,7 @@ class ProductionChunkVisualizer:
368
 
369
  return chunks
370
 
371
- def calculate_advanced_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
372
  """Calculate comprehensive chunk metrics"""
373
  if not chunks:
374
  return {}
@@ -384,7 +365,6 @@ class ProductionChunkVisualizer:
384
  overlap_ratio = max(0, (total_chars - text_length) / text_length)
385
 
386
  char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
387
- word_cv = np.std(word_counts) / np.mean(word_counts) if np.mean(word_counts) > 0 else 0
388
 
389
  return {
390
  'total_chunks': len(chunks),
@@ -395,25 +375,22 @@ class ProductionChunkVisualizer:
395
  'avg_words': np.mean(word_counts),
396
  'std_words': np.std(word_counts),
397
  'char_cv': char_cv,
398
- 'word_cv': word_cv,
399
  'overlap_ratio': overlap_ratio,
400
  'size_consistency': 1 - char_cv,
401
  'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
402
  }
403
 
404
- def visualize_chunks_advanced(self, text: str, chunks: List[Dict]):
405
- """Advanced chunk visualization"""
406
  if not chunks:
407
  st.write("No chunks to display")
408
  return
409
 
410
- st.markdown("### 🎨 Interactive Chunk Visualization")
411
 
412
  for i, chunk in enumerate(chunks):
413
  color = self.colors[i % len(self.colors)]
414
 
415
- words_per_sentence = chunk['word_count'] / max(1, chunk.get('sentence_count', 1))
416
-
417
  st.markdown(f"""
418
  <div style='background: linear-gradient(135deg, {color}15, {color}25);
419
  border-left: 5px solid {color};
@@ -432,13 +409,10 @@ class ProductionChunkVisualizer:
432
  <div style='color: #333; line-height: 1.6; font-size: 14px;'>
433
  {chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
434
  </div>
435
- <div style='margin-top: 8px; color: #888; font-size: 11px;'>
436
- Quality: {words_per_sentence:.1f} words/sentence
437
- </div>
438
  </div>
439
  """, unsafe_allow_html=True)
440
 
441
- def create_comprehensive_charts(self, all_results: Dict[str, List[Dict]]):
442
  """Create detailed analysis charts"""
443
  if not all_results:
444
  return
@@ -447,7 +421,7 @@ class ProductionChunkVisualizer:
447
  size_data = []
448
 
449
  for method, chunks in all_results.items():
450
- metrics = self.calculate_advanced_metrics(chunks)
451
  metrics_data.append({
452
  'Method': method,
453
  'Chunks': metrics.get('total_chunks', 0),
@@ -518,16 +492,17 @@ class ProductionChunkVisualizer:
518
 
519
  def main():
520
  st.set_page_config(
521
- page_title="Multi-Format RAG Chunk Visualizer",
522
  page_icon="πŸ”",
523
  layout="wide",
524
  initial_sidebar_state="expanded"
525
  )
526
 
 
527
  col1, col2 = st.columns([3, 1])
528
  with col1:
529
- st.title("πŸ” Multi-Format RAG Chunk Visualizer")
530
- st.markdown("**Professional chunking analysis with support for PDF, Excel, CSV, Word & Text files**")
531
 
532
  with col2:
533
  if st.button("ℹ️ About", help="Learn about chunking strategies"):
@@ -537,122 +512,146 @@ def main():
537
  **Sentence-based**: Groups sentences together for semantic coherence
538
  **Paragraph-based**: Respects document structure and topic boundaries
539
  **Recursive**: Hierarchical splitting using multiple separators
540
-
541
- *Note: Semantic chunking disabled in this environment*
542
  """)
543
 
544
- visualizer = ProductionChunkVisualizer()
545
 
 
546
  with st.sidebar:
547
  st.header("βš™οΈ Configuration")
548
 
 
549
  input_method = st.radio(
550
  "Choose input method:",
551
- ["πŸ“ Sample Text", "πŸ“ Upload File", "✏️ Custom Input"],
552
  help="Select how you want to provide text for analysis"
553
  )
554
 
555
- sample_texts = {
556
- "Research Paper Abstract": """Machine learning has fundamentally transformed the landscape of artificial intelligence research. Recent advances in deep learning architectures, particularly transformer-based models, have demonstrated unprecedented capabilities in natural language understanding and generation. These models leverage attention mechanisms to capture long-range dependencies in sequential data, enabling more sophisticated reasoning and contextual understanding. The implications extend beyond traditional NLP tasks to multimodal applications, including vision-language models and cross-modal reasoning systems. However, significant challenges remain in terms of computational efficiency, interpretability, and robustness to adversarial inputs.""",
557
-
558
- "Technical Documentation": """Installation Prerequisites: Before beginning the installation process, ensure your system meets the following requirements. Python 3.8 or higher must be installed with pip package manager available. Node.js version 16.x or later is required for frontend dependencies. Git version control system should be accessible from command line.\n\nStep 1: Repository Setup\nClone the project repository using the following command: git clone https://github.com/company/rag-system.git. Navigate to the project directory and create a virtual environment: python -m venv rag-env. Activate the virtual environment using the appropriate command for your operating system.\n\nStep 2: Dependency Installation\nInstall Python dependencies by running pip install -r requirements.txt. This will install all necessary packages including transformers, sentence-transformers, and streamlit. For development dependencies, additionally run pip install -r requirements-dev.txt.""",
559
-
560
- "Business Report": """Executive Summary: Q4 2024 Performance Analysis\n\nOur organization achieved exceptional growth in the fourth quarter of 2024, with revenue increasing by 42% year-over-year to reach $3.8 million. This growth was primarily driven by our expanded product portfolio and successful market penetration strategies in the enterprise segment.\n\nKey Performance Indicators demonstrate strong momentum across all business units. Customer acquisition costs decreased by 18% while customer lifetime value increased by 35%, indicating improved operational efficiency and customer satisfaction. Our newly launched AI-powered features contributed significantly to user engagement, with daily active users increasing by 67%.\n\nStrategic Initiatives for 2025 focus on international expansion and technology innovation. We plan to establish operations in three new markets: Germany, Japan, and Australia. Additionally, our R&D investment will increase by 50% to accelerate development of next-generation AI capabilities."""
561
- }
562
 
563
- if input_method == "πŸ“ Sample Text":
564
- selected_sample = st.selectbox("Select sample text:", list(sample_texts.keys()))
565
- text = sample_texts[selected_sample]
566
- st.text_area("Preview:", value=text[:200] + "...", height=100, disabled=True)
567
 
568
- elif input_method == "πŸ“ Upload File":
569
  uploaded_file = st.file_uploader(
570
- "Upload document",
571
  type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
572
  help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
573
  )
574
 
575
- if uploaded_file:
576
- file_type = uploaded_file.type
 
 
 
 
 
 
 
 
 
577
 
578
  with st.spinner(f"Processing {uploaded_file.name}..."):
579
- if file_type == "text/plain":
580
- text = str(uploaded_file.read(), "utf-8")
581
- elif file_type == "application/pdf":
582
- text = visualizer.extract_text_from_pdf(uploaded_file)
583
- elif file_type == "text/csv":
584
- text = visualizer.extract_text_from_csv(uploaded_file)
585
- elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
586
- "application/vnd.ms-excel"]:
587
- text = visualizer.extract_text_from_excel(uploaded_file)
588
- elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
589
- text = visualizer.extract_text_from_docx(uploaded_file)
590
- else:
591
- st.error(f"Unsupported file type: {file_type}")
592
- text = sample_texts["Research Paper Abstract"]
 
 
 
 
 
 
 
 
 
 
 
593
 
 
594
  if text and len(text.strip()) > 0:
595
- st.success(f"βœ… Extracted {len(text)} characters from {uploaded_file.name}")
596
- if len(text) > 1000:
597
- st.text_area("Preview:", value=text[:500] + "...", height=100, disabled=True)
 
 
 
 
 
 
 
 
598
  else:
599
- st.error("No text could be extracted from the file")
600
- text = sample_texts["Research Paper Abstract"]
601
  else:
602
- text = sample_texts["Research Paper Abstract"]
603
- st.info("Using sample text until file is uploaded")
604
- else:
605
  text = st.text_area(
606
  "Enter your text:",
607
  height=200,
608
- value=sample_texts["Business Document"],
609
  help="Paste or type the text you want to analyze"
610
  )
611
 
612
- st.divider()
613
-
614
- st.subheader("πŸ”§ Chunking Methods")
615
-
616
- method_options = {
617
- 'Fixed Size': 'Character-based splitting with word boundaries',
618
- 'Sentence-based': 'Group by sentences for readability',
619
- 'Paragraph-based': 'Respect document structure',
620
- 'Recursive': 'Hierarchical splitting with multiple separators'
621
- }
622
-
623
- selected_methods = []
624
- for method, description in method_options.items():
625
- if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
626
- selected_methods.append(method)
627
-
628
- if not selected_methods:
629
- st.warning("⚠️ Select at least one chunking method")
630
-
631
- st.divider()
632
-
633
- st.subheader("βš™οΈ Parameters")
634
-
635
- params = {}
636
-
637
- if 'Fixed Size' in selected_methods:
638
- st.markdown("**Fixed Size Settings**")
639
- params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
640
- params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
641
-
642
- if 'Sentence-based' in selected_methods:
643
- st.markdown("**Sentence-based Settings**")
644
- params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
645
-
646
- if 'Recursive' in selected_methods:
647
- st.markdown("**Recursive Settings**")
648
- params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
649
-
650
- with st.expander("πŸ”¬ Advanced Options"):
651
- show_overlap_analysis = st.checkbox("Show overlap analysis", value=True)
652
- show_quality_metrics = st.checkbox("Show quality metrics", value=True)
653
- export_results = st.checkbox("Enable result export", value=False)
 
 
654
 
655
- if text and selected_methods:
 
 
656
  with st.spinner("Processing chunks..."):
657
  all_results = {}
658
 
@@ -674,14 +673,17 @@ def main():
674
 
675
  all_results[method] = chunks
676
 
677
- st.success(f"βœ… Processed {len(text)} characters with {len(selected_methods)} methods")
678
 
 
679
  tabs = st.tabs([f"πŸ“Š {method}" for method in selected_methods] + ["πŸ“ˆ Comparison"])
680
 
 
681
  for i, (method, chunks) in enumerate(all_results.items()):
682
  with tabs[i]:
683
- metrics = visualizer.calculate_advanced_metrics(chunks)
684
 
 
685
  col1, col2, col3, col4, col5 = st.columns(5)
686
  with col1:
687
  st.metric("Total Chunks", metrics.get('total_chunks', 0))
@@ -695,8 +697,10 @@ def main():
695
  overlap_pct = metrics.get('overlap_ratio', 0) * 100
696
  st.metric("Overlap", f"{overlap_pct:.1f}%")
697
 
698
- visualizer.visualize_chunks_advanced(text, chunks)
 
699
 
 
700
  if len(chunks) > 1:
701
  sizes = [chunk['char_count'] for chunk in chunks]
702
  fig = px.histogram(
@@ -705,18 +709,21 @@ def main():
705
  labels={'x': 'Characters', 'y': 'Count'}
706
  )
707
  fig.update_layout(height=300)
708
- st.plotly_chart(fig, use_container_width=True)
709
 
 
710
  with tabs[-1]:
711
  st.header("πŸ“ˆ Comprehensive Analysis")
712
 
713
- visualizer.create_comprehensive_charts(all_results)
 
714
 
 
715
  st.subheader("πŸ“Š Detailed Metrics Comparison")
716
 
717
  comparison_data = []
718
  for method, chunks in all_results.items():
719
- metrics = visualizer.calculate_advanced_metrics(chunks)
720
  comparison_data.append({
721
  'Method': method,
722
  'Chunks': metrics.get('total_chunks', 0),
@@ -727,28 +734,30 @@ def main():
727
  })
728
 
729
  df_comparison = pd.DataFrame(comparison_data)
730
- st.dataframe(df_comparison, use_container_width=True)
731
 
732
- st.subheader("πŸ€– Intelligent Recommendations")
 
733
 
734
  best_consistency = max(all_results.keys(),
735
- key=lambda m: visualizer.calculate_advanced_metrics(all_results[m]).get('size_consistency', 0))
736
 
737
  optimal_size_method = min(all_results.keys(),
738
- key=lambda m: abs(visualizer.calculate_advanced_metrics(all_results[m]).get('avg_chars', 1000) - 600))
739
 
740
  col1, col2 = st.columns(2)
741
 
742
  with col1:
743
  st.success(f"🎯 **Most Consistent**: {best_consistency}")
744
- consistency_score = visualizer.calculate_advanced_metrics(all_results[best_consistency]).get('size_consistency', 0)
745
  st.write(f"Consistency score: {consistency_score:.3f}")
746
 
747
  with col2:
748
  st.info(f"βš–οΈ **Optimal Size**: {optimal_size_method}")
749
- avg_size = visualizer.calculate_advanced_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
750
  st.write(f"Average size: {avg_size:.0f} characters")
751
 
 
752
  st.markdown("### πŸ’‘ Use Case Recommendations")
753
 
754
  recommendations = {
@@ -761,133 +770,84 @@ def main():
761
 
762
  for use_case, recommendation in recommendations.items():
763
  st.markdown(f"- {use_case}: {recommendation}")
764
-
765
- if export_results:
766
- st.subheader("πŸ“€ Export Results")
767
-
768
- report_data = {
769
- 'text_length': len(text),
770
- 'methods_used': list(all_results.keys()),
771
- 'parameters': params,
772
- 'results': {}
773
- }
774
-
775
- for method, chunks in all_results.items():
776
- metrics = visualizer.calculate_advanced_metrics(chunks)
777
- report_data['results'][method] = {
778
- 'chunks': len(chunks),
779
- 'metrics': metrics,
780
- 'chunk_details': chunks
781
- }
782
-
783
- import json
784
- report_json = json.dumps(report_data, indent=2, default=str)
785
-
786
- col1, col2 = st.columns(2)
787
-
788
- with col1:
789
- st.download_button(
790
- "πŸ“‹ Download Analysis Report (JSON)",
791
- data=report_json,
792
- file_name=f"chunk_analysis_{len(text)}_chars.json",
793
- mime="application/json"
794
- )
795
-
796
- with col2:
797
- markdown_report = f"""# Multi-Format Chunk Analysis Report
798
-
799
- ## Text Analysis
800
- - **Length**: {len(text):,} characters
801
- - **Methods**: {', '.join(all_results.keys())}
802
- - **Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
803
-
804
- ## Results Summary
805
- """
806
-
807
- for method, chunks in all_results.items():
808
- metrics = visualizer.calculate_advanced_metrics(chunks)
809
- markdown_report += f"""
810
- ### {method} Method
811
- - **Chunks**: {metrics.get('total_chunks', 0)}
812
- - **Average Size**: {metrics.get('avg_chars', 0):.0f} characters
813
- - **Consistency**: {metrics.get('size_consistency', 0):.3f}
814
- - **Overlap**: {metrics.get('overlap_ratio', 0)*100:.1f}%
815
- """
816
-
817
- st.download_button(
818
- "πŸ“„ Download Summary (Markdown)",
819
- data=markdown_report,
820
- file_name=f"chunk_summary_{len(text)}_chars.md",
821
- mime="text/markdown"
822
- )
823
 
824
  else:
 
825
  st.markdown("""
826
- ## πŸ‘‹ Welcome to the Multi-Format RAG Chunk Visualizer
827
 
828
  This tool analyzes how different chunking strategies split your documents for RAG systems.
829
 
830
- ### πŸš€ Supported File Formats
831
- - **πŸ“„ PDF**: Research papers, reports, documentation
832
- - **πŸ“Š Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
833
- - **πŸ“‹ CSV**: Data exports, logs, structured datasets
834
- - **πŸ“ Word (DOCX)**: Business documents, proposals, manuscripts
835
- - **πŸ“œ Text (TXT)**: Plain text files, code, notes
836
 
837
- ### 🎯 Key Features
838
- - **4 chunking strategies** with real-time comparison
839
- - **Advanced metrics** including consistency and overlap analysis
840
- - **Interactive visualizations** with detailed chunk inspection
841
- - **Export capabilities** for team collaboration
842
- - **Professional recommendations** for different use cases
843
 
844
- ### πŸ’‘ Quick Start
845
- 1. **Upload your file** or use sample text
846
- 2. **Select chunking methods** to compare (2-3 recommended)
847
- 3. **Adjust parameters** for each method
848
- 4. **Analyze results** with comprehensive metrics
 
 
849
 
850
- ### πŸ”§ Chunking Methods Available
851
  - **Fixed Size**: Consistent character-based chunks with word boundaries
852
- - **Sentence-based**: Natural language flow with sentence grouping
853
  - **Paragraph-based**: Document structure preservation
854
  - **Recursive**: Hierarchical splitting with multiple separators
855
 
856
- **Note**: Semantic chunking temporarily disabled in this environment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857
 
858
- Select your settings in the sidebar to begin analysis! πŸ‘ˆ
859
  """)
860
 
861
- # Sample file format examples
862
- st.subheader("πŸ“ Example Use Cases")
863
 
864
  col1, col2, col3 = st.columns(3)
865
 
866
  with col1:
867
  st.markdown("""
868
- **πŸ“„ PDF Files**
869
- - Research papers
870
- - Technical manuals
871
- - Legal documents
872
- - Reports and presentations
873
  """)
874
 
875
  with col2:
876
  st.markdown("""
877
- **πŸ“Š Excel/CSV Files**
878
- - Data tables
879
- - Survey results
880
- - Financial reports
881
- - Product catalogs
882
  """)
883
 
884
  with col3:
885
  st.markdown("""
886
- **πŸ“ Text/Word Files**
887
- - Articles and blogs
888
- - Meeting notes
889
- - Technical documentation
890
- - Business proposals
891
  """)
892
 
893
  if __name__ == "__main__":
 
8
  import io
9
  import time
10
  from typing import List, Dict, Any
 
 
 
 
11
 
12
  # Safe model loading without cache permission issues
13
  @st.cache_resource
14
  def load_sentence_transformer():
15
+ st.info("⚠️ Semantic chunking disabled in this environment")
16
  return None
17
 
18
  @st.cache_resource
 
25
  try:
26
  nltk.download('punkt', quiet=True)
27
  except:
28
+ pass
29
  return nltk
30
  except ImportError:
31
  return None
32
 
33
+ class ChunkVisualizer:
34
  def __init__(self):
35
  self.colors = [
36
  '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
 
50
  def extract_text_from_pdf(self, pdf_file):
51
  """Extract text from PDF file"""
52
  try:
53
+ import PyPDF2
54
  pdf_file.seek(0)
55
  pdf_reader = PyPDF2.PdfReader(pdf_file)
56
  text = ""
 
66
  st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
67
 
68
  if not text.strip():
69
+ st.warning("PDF appears to be image-based or empty.")
70
  return "No extractable text found in PDF document."
71
 
72
  return text.strip()
73
  except Exception as e:
74
  st.error(f"Error reading PDF: {str(e)}")
75
+ return ""
76
 
77
  def extract_text_from_excel(self, excel_file):
78
  """Extract text from Excel file"""
79
  try:
 
80
  excel_file.seek(0)
81
 
 
82
  try:
83
  xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
84
  except:
 
95
  text += f"\n=== Sheet: {sheet_name} ===\n"
96
 
97
  if not df.empty:
 
98
  headers = " | ".join(str(col) for col in df.columns)
99
  text += f"Headers: {headers}\n"
100
  text += "-" * 50 + "\n"
101
 
102
+ max_rows = min(100, len(df))
 
103
  for idx, row in df.head(max_rows).iterrows():
104
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
105
  text += row_text + "\n"
 
108
  text += f"... ({len(df) - max_rows} more rows)\n"
109
  else:
110
  text += "Empty sheet\n"
 
111
  text += "\n"
112
 
113
  return text.strip()
114
  except Exception as e:
115
  st.error(f"Error reading Excel file: {str(e)}")
116
+ return ""
117
 
118
  def extract_text_from_csv(self, csv_file):
119
  """Extract text from CSV file"""
120
  try:
 
121
  csv_file.seek(0)
122
 
 
123
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
124
  try:
125
  csv_file.seek(0)
 
128
  except UnicodeDecodeError:
129
  continue
130
  else:
131
+ df = pd.read_csv(csv_file)
132
 
133
  if df.empty:
134
  return "Empty CSV file"
135
 
136
  st.write(f"πŸ“‹ Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
137
 
 
138
  text = "=== CSV Data ===\n"
139
  headers = " | ".join(str(col) for col in df.columns)
140
  text += f"Headers: {headers}\n"
141
  text += "-" * 50 + "\n"
142
 
 
143
  max_rows = min(100, len(df))
144
  for _, row in df.head(max_rows).iterrows():
145
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
 
151
  return text.strip()
152
  except Exception as e:
153
  st.error(f"Error reading CSV file: {str(e)}")
154
+ return ""
155
 
156
  def extract_text_from_docx(self, docx_file):
157
  """Extract text from Word document"""
158
  try:
159
+ from docx import Document
160
+ docx_file.seek(0)
161
  doc = Document(docx_file)
162
  text = ""
163
 
 
165
  if paragraph.text.strip():
166
  text += paragraph.text + "\n"
167
 
 
168
  for table in doc.tables:
169
  text += "\n=== Table ===\n"
170
  for row in table.rows:
 
203
  chunk = text[start:]
204
  else:
205
  chunk = text[start:end]
 
206
  if not text[end].isspace():
207
  last_space = chunk.rfind(' ')
208
  if last_space > chunk_size * 0.7:
 
235
  chunk_sentences = sentences[i:i + sentences_per_chunk]
236
  chunk_text = ' '.join(chunk_sentences)
237
 
 
238
  start_pos = text.find(chunk_sentences[0], current_pos)
239
  if start_pos == -1:
240
  start_pos = current_pos
 
283
 
284
  return chunks
285
 
 
 
 
 
 
286
  def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
287
  """Hierarchical text splitting with multiple separators"""
288
  separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
 
349
 
350
  return chunks
351
 
352
+ def calculate_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
353
  """Calculate comprehensive chunk metrics"""
354
  if not chunks:
355
  return {}
 
365
  overlap_ratio = max(0, (total_chars - text_length) / text_length)
366
 
367
  char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
 
368
 
369
  return {
370
  'total_chunks': len(chunks),
 
375
  'avg_words': np.mean(word_counts),
376
  'std_words': np.std(word_counts),
377
  'char_cv': char_cv,
 
378
  'overlap_ratio': overlap_ratio,
379
  'size_consistency': 1 - char_cv,
380
  'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
381
  }
382
 
383
+ def visualize_chunks(self, chunks: List[Dict]):
384
+ """Display chunks with color coding"""
385
  if not chunks:
386
  st.write("No chunks to display")
387
  return
388
 
389
+ st.markdown("### 🎨 Chunk Visualization")
390
 
391
  for i, chunk in enumerate(chunks):
392
  color = self.colors[i % len(self.colors)]
393
 
 
 
394
  st.markdown(f"""
395
  <div style='background: linear-gradient(135deg, {color}15, {color}25);
396
  border-left: 5px solid {color};
 
409
  <div style='color: #333; line-height: 1.6; font-size: 14px;'>
410
  {chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
411
  </div>
 
 
 
412
  </div>
413
  """, unsafe_allow_html=True)
414
 
415
+ def create_comparison_charts(self, all_results: Dict[str, List[Dict]]):
416
  """Create detailed analysis charts"""
417
  if not all_results:
418
  return
 
421
  size_data = []
422
 
423
  for method, chunks in all_results.items():
424
+ metrics = self.calculate_metrics(chunks)
425
  metrics_data.append({
426
  'Method': method,
427
  'Chunks': metrics.get('total_chunks', 0),
 
492
 
493
  def main():
494
  st.set_page_config(
495
+ page_title="RAG Chunk Visualizer",
496
  page_icon="πŸ”",
497
  layout="wide",
498
  initial_sidebar_state="expanded"
499
  )
500
 
501
+ # Header
502
  col1, col2 = st.columns([3, 1])
503
  with col1:
504
+ st.title("πŸ” RAG Chunk Visualizer")
505
+ st.markdown("**Professional chunking analysis for RAG systems**")
506
 
507
  with col2:
508
  if st.button("ℹ️ About", help="Learn about chunking strategies"):
 
512
  **Sentence-based**: Groups sentences together for semantic coherence
513
  **Paragraph-based**: Respects document structure and topic boundaries
514
  **Recursive**: Hierarchical splitting using multiple separators
 
 
515
  """)
516
 
517
+ visualizer = ChunkVisualizer()
518
 
519
+ # Sidebar for configuration
520
  with st.sidebar:
521
  st.header("βš™οΈ Configuration")
522
 
523
+ # Input method selection
524
  input_method = st.radio(
525
  "Choose input method:",
526
+ ["πŸ“ Upload File", "✏️ Custom Input"],
527
  help="Select how you want to provide text for analysis"
528
  )
529
 
530
+ # File upload or text input
531
+ text = ""
 
 
 
 
 
532
 
533
+ if input_method == "πŸ“ Upload File":
534
+ st.markdown("**File Upload**")
 
 
535
 
 
536
  uploaded_file = st.file_uploader(
537
+ "Choose a file",
538
  type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
539
  help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
540
  )
541
 
542
+ if uploaded_file is not None:
543
+ st.success(f"πŸ“ File loaded: **{uploaded_file.name}**")
544
+
545
+ # Show file info
546
+ with st.expander("File Details", expanded=False):
547
+ st.write(f"**Name:** {uploaded_file.name}")
548
+ st.write(f"**Size:** {len(uploaded_file.getvalue()):,} bytes")
549
+ st.write(f"**Type:** {uploaded_file.type}")
550
+
551
+ # Process the file
552
+ file_name = uploaded_file.name.lower()
553
 
554
  with st.spinner(f"Processing {uploaded_file.name}..."):
555
+ try:
556
+ if file_name.endswith('.txt'):
557
+ uploaded_file.seek(0)
558
+ text = str(uploaded_file.read(), "utf-8")
559
+
560
+ elif file_name.endswith('.pdf'):
561
+ text = visualizer.extract_text_from_pdf(uploaded_file)
562
+
563
+ elif file_name.endswith('.csv'):
564
+ text = visualizer.extract_text_from_csv(uploaded_file)
565
+
566
+ elif file_name.endswith(('.xlsx', '.xls')):
567
+ text = visualizer.extract_text_from_excel(uploaded_file)
568
+
569
+ elif file_name.endswith('.docx'):
570
+ text = visualizer.extract_text_from_docx(uploaded_file)
571
+
572
+ else:
573
+ st.warning("Unsupported file type - trying as text...")
574
+ uploaded_file.seek(0)
575
+ text = str(uploaded_file.read(), "utf-8")
576
+
577
+ except Exception as e:
578
+ st.error(f"Error processing file: {str(e)}")
579
+ text = ""
580
 
581
+ # Show processing results
582
  if text and len(text.strip()) > 0:
583
+ st.success(f"βœ… Extracted {len(text):,} characters")
584
+
585
+ # Show preview
586
+ preview_text = text[:300] + "..." if len(text) > 300 else text
587
+ st.text_area(
588
+ "Content Preview:",
589
+ value=preview_text,
590
+ height=100,
591
+ disabled=True,
592
+ help="First 300 characters of extracted text"
593
+ )
594
  else:
595
+ st.error("❌ No text could be extracted from the file")
 
596
  else:
597
+ st.info("πŸ‘† Choose a file to upload")
598
+
599
+ else: # Custom Input
600
  text = st.text_area(
601
  "Enter your text:",
602
  height=200,
603
+ placeholder="Paste or type your text here to analyze different chunking strategies...",
604
  help="Paste or type the text you want to analyze"
605
  )
606
 
607
+ # Only show chunking options if we have text
608
+ if text and len(text.strip()) > 0:
609
+ st.divider()
610
+
611
+ # Method selection
612
+ st.subheader("πŸ”§ Chunking Methods")
613
+
614
+ method_options = {
615
+ 'Fixed Size': 'Character-based splitting with word boundaries',
616
+ 'Sentence-based': 'Group by sentences for readability',
617
+ 'Paragraph-based': 'Respect document structure',
618
+ 'Recursive': 'Hierarchical splitting with multiple separators'
619
+ }
620
+
621
+ selected_methods = []
622
+ for method, description in method_options.items():
623
+ if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
624
+ selected_methods.append(method)
625
+
626
+ if not selected_methods:
627
+ st.warning("⚠️ Select at least one chunking method")
628
+
629
+ st.divider()
630
+
631
+ # Parameters
632
+ st.subheader("βš™οΈ Parameters")
633
+
634
+ params = {}
635
+
636
+ if 'Fixed Size' in selected_methods:
637
+ st.markdown("**Fixed Size Settings**")
638
+ params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
639
+ params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
640
+
641
+ if 'Sentence-based' in selected_methods:
642
+ st.markdown("**Sentence-based Settings**")
643
+ params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
644
+
645
+ if 'Recursive' in selected_methods:
646
+ st.markdown("**Recursive Settings**")
647
+ params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
648
+ else:
649
+ selected_methods = []
650
+ params = {}
651
 
652
+ # Main content area
653
+ if text and len(text.strip()) > 0 and selected_methods:
654
+ # Process text with selected methods
655
  with st.spinner("Processing chunks..."):
656
  all_results = {}
657
 
 
673
 
674
  all_results[method] = chunks
675
 
676
+ st.success(f"βœ… Processed {len(text):,} characters with {len(selected_methods)} methods")
677
 
678
+ # Display results in tabs
679
  tabs = st.tabs([f"πŸ“Š {method}" for method in selected_methods] + ["πŸ“ˆ Comparison"])
680
 
681
+ # Individual method tabs
682
  for i, (method, chunks) in enumerate(all_results.items()):
683
  with tabs[i]:
684
+ metrics = visualizer.calculate_metrics(chunks)
685
 
686
+ # Metrics display
687
  col1, col2, col3, col4, col5 = st.columns(5)
688
  with col1:
689
  st.metric("Total Chunks", metrics.get('total_chunks', 0))
 
697
  overlap_pct = metrics.get('overlap_ratio', 0) * 100
698
  st.metric("Overlap", f"{overlap_pct:.1f}%")
699
 
700
+ # Visualize chunks
701
+ visualizer.visualize_chunks(chunks)
702
 
703
+ # Size distribution chart
704
  if len(chunks) > 1:
705
  sizes = [chunk['char_count'] for chunk in chunks]
706
  fig = px.histogram(
 
709
  labels={'x': 'Characters', 'y': 'Count'}
710
  )
711
  fig.update_layout(height=300)
712
+ st.plotly_chart(fig, width='stretch')
713
 
714
+ # Comparison tab
715
  with tabs[-1]:
716
  st.header("πŸ“ˆ Comprehensive Analysis")
717
 
718
+ # Comparison charts
719
+ visualizer.create_comparison_charts(all_results)
720
 
721
+ # Metrics table
722
  st.subheader("πŸ“Š Detailed Metrics Comparison")
723
 
724
  comparison_data = []
725
  for method, chunks in all_results.items():
726
+ metrics = visualizer.calculate_metrics(chunks)
727
  comparison_data.append({
728
  'Method': method,
729
  'Chunks': metrics.get('total_chunks', 0),
 
734
  })
735
 
736
  df_comparison = pd.DataFrame(comparison_data)
737
+ st.dataframe(df_comparison, width='stretch')
738
 
739
+ # Recommendations
740
+ st.subheader("πŸ’‘ Recommendations")
741
 
742
  best_consistency = max(all_results.keys(),
743
+ key=lambda m: visualizer.calculate_metrics(all_results[m]).get('size_consistency', 0))
744
 
745
  optimal_size_method = min(all_results.keys(),
746
+ key=lambda m: abs(visualizer.calculate_metrics(all_results[m]).get('avg_chars', 1000) - 600))
747
 
748
  col1, col2 = st.columns(2)
749
 
750
  with col1:
751
  st.success(f"🎯 **Most Consistent**: {best_consistency}")
752
+ consistency_score = visualizer.calculate_metrics(all_results[best_consistency]).get('size_consistency', 0)
753
  st.write(f"Consistency score: {consistency_score:.3f}")
754
 
755
  with col2:
756
  st.info(f"βš–οΈ **Optimal Size**: {optimal_size_method}")
757
+ avg_size = visualizer.calculate_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
758
  st.write(f"Average size: {avg_size:.0f} characters")
759
 
760
+ # Use case recommendations
761
  st.markdown("### πŸ’‘ Use Case Recommendations")
762
 
763
  recommendations = {
 
770
 
771
  for use_case, recommendation in recommendations.items():
772
  st.markdown(f"- {use_case}: {recommendation}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
 
774
  else:
775
+ # Welcome screen when no text is provided
776
  st.markdown("""
777
+ ## πŸ‘‹ Welcome to the RAG Chunk Visualizer
778
 
779
  This tool analyzes how different chunking strategies split your documents for RAG systems.
780
 
781
+ ### πŸš€ Getting Started
 
 
 
 
 
782
 
783
+ **Step 1:** Choose your input method in the sidebar:
784
+ - **πŸ“ Upload File**: Support for PDF, Excel, CSV, Word, and text files
785
+ - **✏️ Custom Input**: Paste or type your own text
 
 
 
786
 
787
+ **Step 2:** Select chunking methods to compare (2-3 recommended)
788
+
789
+ **Step 3:** Adjust parameters for each method
790
+
791
+ **Step 4:** Analyze results with comprehensive metrics and visualizations
792
+
793
+ ### πŸ”§ Available Chunking Methods
794
 
 
795
  - **Fixed Size**: Consistent character-based chunks with word boundaries
796
+ - **Sentence-based**: Natural language flow with sentence grouping
797
  - **Paragraph-based**: Document structure preservation
798
  - **Recursive**: Hierarchical splitting with multiple separators
799
 
800
+ ### 🎯 Key Features
801
+
802
+ - **Real-time comparison** of different chunking strategies
803
+ - **Advanced metrics** including consistency scores and overlap analysis
804
+ - **Interactive visualizations** with detailed chunk inspection
805
+ - **Professional recommendations** for different use cases
806
+ - **Multi-format support** for various document types
807
+
808
+ ### πŸ“ Supported File Formats
809
+
810
+ - **πŸ“„ PDF**: Research papers, reports, documentation
811
+ - **πŸ“Š Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
812
+ - **πŸ“‹ CSV**: Data exports, logs, structured datasets
813
+ - **πŸ“ Word (DOCX)**: Business documents, proposals, manuscripts
814
+ - **πŸ“œ Text (TXT)**: Plain text files, code, notes
815
+
816
+ ---
817
 
818
+ **Ready to begin?** Select your input method in the sidebar! πŸ‘ˆ
819
  """)
820
 
821
+ # Show example use cases
822
+ st.subheader("πŸ’‘ Example Use Cases")
823
 
824
  col1, col2, col3 = st.columns(3)
825
 
826
  with col1:
827
  st.markdown("""
828
+ **πŸ” RAG Optimization**
829
+ - Find optimal chunk sizes
830
+ - Minimize overlap issues
831
+ - Improve retrieval accuracy
832
+ - Balance context vs precision
833
  """)
834
 
835
  with col2:
836
  st.markdown("""
837
+ **πŸ“š Document Processing**
838
+ - Preserve document structure
839
+ - Handle different file formats
840
+ - Maintain readability
841
+ - Process large documents
842
  """)
843
 
844
  with col3:
845
  st.markdown("""
846
+ **πŸ€– LLM Integration**
847
+ - Manage token limits
848
+ - Optimize context windows
849
+ - Improve response quality
850
+ - Reduce processing costs
851
  """)
852
 
853
  if __name__ == "__main__":