LvMAC committed on
Commit
ae42893
·
verified ·
1 Parent(s): 45ff068

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +202 -242
src/streamlit_app.py CHANGED
@@ -8,15 +8,11 @@ import pandas as pd
8
  import io
9
  import time
10
  from typing import List, Dict, Any
11
- import PyPDF2
12
- import openpyxl
13
- from docx import Document
14
- import csv
15
 
16
  # Safe model loading without cache permission issues
17
  @st.cache_resource
18
  def load_sentence_transformer():
19
- st.info("⚠️ Semantic chunking disabled in HuggingFace environment")
20
  return None
21
 
22
  @st.cache_resource
@@ -29,12 +25,12 @@ def load_nltk():
29
  try:
30
  nltk.download('punkt', quiet=True)
31
  except:
32
- pass # Skip if download fails
33
  return nltk
34
  except ImportError:
35
  return None
36
 
37
- class ProductionChunkVisualizer:
38
  def __init__(self):
39
  self.colors = [
40
  '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
@@ -54,7 +50,7 @@ class ProductionChunkVisualizer:
54
  def extract_text_from_pdf(self, pdf_file):
55
  """Extract text from PDF file"""
56
  try:
57
- # Reset file pointer to beginning
58
  pdf_file.seek(0)
59
  pdf_reader = PyPDF2.PdfReader(pdf_file)
60
  text = ""
@@ -70,21 +66,19 @@ class ProductionChunkVisualizer:
70
  st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
71
 
72
  if not text.strip():
73
- st.warning("PDF appears to be image-based or empty. No text extracted.")
74
  return "No extractable text found in PDF document."
75
 
76
  return text.strip()
77
  except Exception as e:
78
  st.error(f"Error reading PDF: {str(e)}")
79
- return f"PDF processing error: {str(e)}"
80
 
81
  def extract_text_from_excel(self, excel_file):
82
  """Extract text from Excel file"""
83
  try:
84
- # Reset file pointer to beginning
85
  excel_file.seek(0)
86
 
87
- # Try different engines
88
  try:
89
  xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
90
  except:
@@ -101,13 +95,11 @@ class ProductionChunkVisualizer:
101
  text += f"\n=== Sheet: {sheet_name} ===\n"
102
 
103
  if not df.empty:
104
- # Add column headers
105
  headers = " | ".join(str(col) for col in df.columns)
106
  text += f"Headers: {headers}\n"
107
  text += "-" * 50 + "\n"
108
 
109
- # Add data rows (limit to prevent massive output)
110
- max_rows = min(100, len(df)) # Limit to 100 rows per sheet
111
  for idx, row in df.head(max_rows).iterrows():
112
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
113
  text += row_text + "\n"
@@ -116,21 +108,18 @@ class ProductionChunkVisualizer:
116
  text += f"... ({len(df) - max_rows} more rows)\n"
117
  else:
118
  text += "Empty sheet\n"
119
-
120
  text += "\n"
121
 
122
  return text.strip()
123
  except Exception as e:
124
  st.error(f"Error reading Excel file: {str(e)}")
125
- return f"Excel processing error: {str(e)}"
126
 
127
  def extract_text_from_csv(self, csv_file):
128
  """Extract text from CSV file"""
129
  try:
130
- # Reset file pointer to beginning
131
  csv_file.seek(0)
132
 
133
- # Try different encodings
134
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
135
  try:
136
  csv_file.seek(0)
@@ -139,20 +128,18 @@ class ProductionChunkVisualizer:
139
  except UnicodeDecodeError:
140
  continue
141
  else:
142
- df = pd.read_csv(csv_file) # Default encoding
143
 
144
  if df.empty:
145
  return "Empty CSV file"
146
 
147
  st.write(f"πŸ“‹ Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
148
 
149
- # Create readable text format
150
  text = "=== CSV Data ===\n"
151
  headers = " | ".join(str(col) for col in df.columns)
152
  text += f"Headers: {headers}\n"
153
  text += "-" * 50 + "\n"
154
 
155
- # Limit rows to prevent massive output
156
  max_rows = min(100, len(df))
157
  for _, row in df.head(max_rows).iterrows():
158
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
@@ -164,11 +151,13 @@ class ProductionChunkVisualizer:
164
  return text.strip()
165
  except Exception as e:
166
  st.error(f"Error reading CSV file: {str(e)}")
167
- return f"CSV processing error: {str(e)}"
168
 
169
  def extract_text_from_docx(self, docx_file):
170
  """Extract text from Word document"""
171
  try:
 
 
172
  doc = Document(docx_file)
173
  text = ""
174
 
@@ -176,7 +165,6 @@ class ProductionChunkVisualizer:
176
  if paragraph.text.strip():
177
  text += paragraph.text + "\n"
178
 
179
- # Also extract text from tables
180
  for table in doc.tables:
181
  text += "\n=== Table ===\n"
182
  for row in table.rows:
@@ -215,7 +203,6 @@ class ProductionChunkVisualizer:
215
  chunk = text[start:]
216
  else:
217
  chunk = text[start:end]
218
- # Find last complete word
219
  if not text[end].isspace():
220
  last_space = chunk.rfind(' ')
221
  if last_space > chunk_size * 0.7:
@@ -248,7 +235,6 @@ class ProductionChunkVisualizer:
248
  chunk_sentences = sentences[i:i + sentences_per_chunk]
249
  chunk_text = ' '.join(chunk_sentences)
250
 
251
- # Find actual position in original text
252
  start_pos = text.find(chunk_sentences[0], current_pos)
253
  if start_pos == -1:
254
  start_pos = current_pos
@@ -297,11 +283,6 @@ class ProductionChunkVisualizer:
297
 
298
  return chunks
299
 
300
- def semantic_chunking(self, text: str, similarity_threshold: float = 0.5) -> List[Dict]:
301
- """Disabled semantic chunking - fallback to sentence-based"""
302
- st.warning("Semantic chunking unavailable in this environment. Using sentence-based fallback.")
303
- return self.sentence_chunking(text, 3)
304
-
305
  def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
306
  """Hierarchical text splitting with multiple separators"""
307
  separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
@@ -368,7 +349,7 @@ class ProductionChunkVisualizer:
368
 
369
  return chunks
370
 
371
- def calculate_advanced_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
372
  """Calculate comprehensive chunk metrics"""
373
  if not chunks:
374
  return {}
@@ -384,7 +365,6 @@ class ProductionChunkVisualizer:
384
  overlap_ratio = max(0, (total_chars - text_length) / text_length)
385
 
386
  char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
387
- word_cv = np.std(word_counts) / np.mean(word_counts) if np.mean(word_counts) > 0 else 0
388
 
389
  return {
390
  'total_chunks': len(chunks),
@@ -395,25 +375,22 @@ class ProductionChunkVisualizer:
395
  'avg_words': np.mean(word_counts),
396
  'std_words': np.std(word_counts),
397
  'char_cv': char_cv,
398
- 'word_cv': word_cv,
399
  'overlap_ratio': overlap_ratio,
400
  'size_consistency': 1 - char_cv,
401
  'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
402
  }
403
 
404
- def visualize_chunks_advanced(self, text: str, chunks: List[Dict]):
405
- """Advanced chunk visualization"""
406
  if not chunks:
407
  st.write("No chunks to display")
408
  return
409
 
410
- st.markdown("### 🎨 Interactive Chunk Visualization")
411
 
412
  for i, chunk in enumerate(chunks):
413
  color = self.colors[i % len(self.colors)]
414
 
415
- words_per_sentence = chunk['word_count'] / max(1, chunk.get('sentence_count', 1))
416
-
417
  st.markdown(f"""
418
  <div style='background: linear-gradient(135deg, {color}15, {color}25);
419
  border-left: 5px solid {color};
@@ -432,13 +409,10 @@ class ProductionChunkVisualizer:
432
  <div style='color: #333; line-height: 1.6; font-size: 14px;'>
433
  {chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
434
  </div>
435
- <div style='margin-top: 8px; color: #888; font-size: 11px;'>
436
- Quality: {words_per_sentence:.1f} words/sentence
437
- </div>
438
  </div>
439
  """, unsafe_allow_html=True)
440
 
441
- def create_comprehensive_charts(self, all_results: Dict[str, List[Dict]]):
442
  """Create detailed analysis charts"""
443
  if not all_results:
444
  return
@@ -447,7 +421,7 @@ class ProductionChunkVisualizer:
447
  size_data = []
448
 
449
  for method, chunks in all_results.items():
450
- metrics = self.calculate_advanced_metrics(chunks)
451
  metrics_data.append({
452
  'Method': method,
453
  'Chunks': metrics.get('total_chunks', 0),
@@ -518,16 +492,17 @@ class ProductionChunkVisualizer:
518
 
519
  def main():
520
  st.set_page_config(
521
- page_title="Multi-Format RAG Chunk Visualizer",
522
  page_icon="πŸ”",
523
  layout="wide",
524
  initial_sidebar_state="expanded"
525
  )
526
 
 
527
  col1, col2 = st.columns([3, 1])
528
  with col1:
529
- st.title("πŸ” Multi-Format RAG Chunk Visualizer")
530
- st.markdown("**Professional chunking analysis with support for PDF, Excel, CSV, Word & Text files**")
531
 
532
  with col2:
533
  if st.button("ℹ️ About", help="Learn about chunking strategies"):
@@ -537,122 +512,146 @@ def main():
537
  **Sentence-based**: Groups sentences together for semantic coherence
538
  **Paragraph-based**: Respects document structure and topic boundaries
539
  **Recursive**: Hierarchical splitting using multiple separators
540
-
541
- *Note: Semantic chunking disabled in this environment*
542
  """)
543
 
544
- visualizer = ProductionChunkVisualizer()
545
 
 
546
  with st.sidebar:
547
  st.header("βš™οΈ Configuration")
548
 
 
549
  input_method = st.radio(
550
  "Choose input method:",
551
- ["πŸ“ Sample Text", "πŸ“ Upload File", "✏️ Custom Input"],
552
  help="Select how you want to provide text for analysis"
553
  )
554
 
555
- sample_texts = {
556
- "Research Paper Abstract": """Machine learning has fundamentally transformed the landscape of artificial intelligence research. Recent advances in deep learning architectures, particularly transformer-based models, have demonstrated unprecedented capabilities in natural language understanding and generation. These models leverage attention mechanisms to capture long-range dependencies in sequential data, enabling more sophisticated reasoning and contextual understanding. The implications extend beyond traditional NLP tasks to multimodal applications, including vision-language models and cross-modal reasoning systems. However, significant challenges remain in terms of computational efficiency, interpretability, and robustness to adversarial inputs.""",
557
-
558
- "Technical Documentation": """Installation Prerequisites: Before beginning the installation process, ensure your system meets the following requirements. Python 3.8 or higher must be installed with pip package manager available. Node.js version 16.x or later is required for frontend dependencies. Git version control system should be accessible from command line.\n\nStep 1: Repository Setup\nClone the project repository using the following command: git clone https://github.com/company/rag-system.git. Navigate to the project directory and create a virtual environment: python -m venv rag-env. Activate the virtual environment using the appropriate command for your operating system.\n\nStep 2: Dependency Installation\nInstall Python dependencies by running pip install -r requirements.txt. This will install all necessary packages including transformers, sentence-transformers, and streamlit. For development dependencies, additionally run pip install -r requirements-dev.txt.""",
559
-
560
- "Business Report": """Executive Summary: Q4 2024 Performance Analysis\n\nOur organization achieved exceptional growth in the fourth quarter of 2024, with revenue increasing by 42% year-over-year to reach $3.8 million. This growth was primarily driven by our expanded product portfolio and successful market penetration strategies in the enterprise segment.\n\nKey Performance Indicators demonstrate strong momentum across all business units. Customer acquisition costs decreased by 18% while customer lifetime value increased by 35%, indicating improved operational efficiency and customer satisfaction. Our newly launched AI-powered features contributed significantly to user engagement, with daily active users increasing by 67%.\n\nStrategic Initiatives for 2025 focus on international expansion and technology innovation. We plan to establish operations in three new markets: Germany, Japan, and Australia. Additionally, our R&D investment will increase by 50% to accelerate development of next-generation AI capabilities."""
561
- }
562
 
563
- if input_method == "πŸ“ Sample Text":
564
- selected_sample = st.selectbox("Select sample text:", list(sample_texts.keys()))
565
- text = sample_texts[selected_sample]
566
- st.text_area("Preview:", value=text[:200] + "...", height=100, disabled=True)
567
 
568
- elif input_method == "πŸ“ Upload File":
569
  uploaded_file = st.file_uploader(
570
- "Upload document",
571
  type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
572
  help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
573
  )
574
 
575
- if uploaded_file:
576
- file_type = uploaded_file.type
 
 
 
 
 
 
 
 
 
577
 
578
  with st.spinner(f"Processing {uploaded_file.name}..."):
579
- if file_type == "text/plain":
580
- text = str(uploaded_file.read(), "utf-8")
581
- elif file_type == "application/pdf":
582
- text = visualizer.extract_text_from_pdf(uploaded_file)
583
- elif file_type == "text/csv":
584
- text = visualizer.extract_text_from_csv(uploaded_file)
585
- elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
586
- "application/vnd.ms-excel"]:
587
- text = visualizer.extract_text_from_excel(uploaded_file)
588
- elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
589
- text = visualizer.extract_text_from_docx(uploaded_file)
590
- else:
591
- st.error(f"Unsupported file type: {file_type}")
592
- text = sample_texts["Research Paper Abstract"]
 
 
 
 
 
 
 
 
 
 
 
593
 
 
594
  if text and len(text.strip()) > 0:
595
- st.success(f"βœ… Extracted {len(text)} characters from {uploaded_file.name}")
596
- if len(text) > 1000:
597
- st.text_area("Preview:", value=text[:500] + "...", height=100, disabled=True)
 
 
 
 
 
 
 
 
598
  else:
599
- st.error("No text could be extracted from the file")
600
- text = sample_texts["Research Paper Abstract"]
601
  else:
602
- text = sample_texts["Research Paper Abstract"]
603
- st.info("Using sample text until file is uploaded")
604
- else:
605
  text = st.text_area(
606
  "Enter your text:",
607
  height=200,
608
- value=sample_texts["Business Document"],
609
  help="Paste or type the text you want to analyze"
610
  )
611
 
612
- st.divider()
613
-
614
- st.subheader("πŸ”§ Chunking Methods")
615
-
616
- method_options = {
617
- 'Fixed Size': 'Character-based splitting with word boundaries',
618
- 'Sentence-based': 'Group by sentences for readability',
619
- 'Paragraph-based': 'Respect document structure',
620
- 'Recursive': 'Hierarchical splitting with multiple separators'
621
- }
622
-
623
- selected_methods = []
624
- for method, description in method_options.items():
625
- if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
626
- selected_methods.append(method)
627
-
628
- if not selected_methods:
629
- st.warning("⚠️ Select at least one chunking method")
630
-
631
- st.divider()
632
-
633
- st.subheader("βš™οΈ Parameters")
634
-
635
- params = {}
636
-
637
- if 'Fixed Size' in selected_methods:
638
- st.markdown("**Fixed Size Settings**")
639
- params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
640
- params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
641
-
642
- if 'Sentence-based' in selected_methods:
643
- st.markdown("**Sentence-based Settings**")
644
- params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
645
-
646
- if 'Recursive' in selected_methods:
647
- st.markdown("**Recursive Settings**")
648
- params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
649
-
650
- with st.expander("πŸ”¬ Advanced Options"):
651
- show_overlap_analysis = st.checkbox("Show overlap analysis", value=True)
652
- show_quality_metrics = st.checkbox("Show quality metrics", value=True)
653
- export_results = st.checkbox("Enable result export", value=False)
 
 
654
 
655
- if text and selected_methods:
 
 
656
  with st.spinner("Processing chunks..."):
657
  all_results = {}
658
 
@@ -674,14 +673,17 @@ def main():
674
 
675
  all_results[method] = chunks
676
 
677
- st.success(f"βœ… Processed {len(text)} characters with {len(selected_methods)} methods")
678
 
 
679
  tabs = st.tabs([f"πŸ“Š {method}" for method in selected_methods] + ["πŸ“ˆ Comparison"])
680
 
 
681
  for i, (method, chunks) in enumerate(all_results.items()):
682
  with tabs[i]:
683
- metrics = visualizer.calculate_advanced_metrics(chunks)
684
 
 
685
  col1, col2, col3, col4, col5 = st.columns(5)
686
  with col1:
687
  st.metric("Total Chunks", metrics.get('total_chunks', 0))
@@ -695,8 +697,10 @@ def main():
695
  overlap_pct = metrics.get('overlap_ratio', 0) * 100
696
  st.metric("Overlap", f"{overlap_pct:.1f}%")
697
 
698
- visualizer.visualize_chunks_advanced(text, chunks)
 
699
 
 
700
  if len(chunks) > 1:
701
  sizes = [chunk['char_count'] for chunk in chunks]
702
  fig = px.histogram(
@@ -705,18 +709,21 @@ def main():
705
  labels={'x': 'Characters', 'y': 'Count'}
706
  )
707
  fig.update_layout(height=300)
708
- st.plotly_chart(fig, use_container_width=True)
709
 
 
710
  with tabs[-1]:
711
  st.header("πŸ“ˆ Comprehensive Analysis")
712
 
713
- visualizer.create_comprehensive_charts(all_results)
 
714
 
 
715
  st.subheader("πŸ“Š Detailed Metrics Comparison")
716
 
717
  comparison_data = []
718
  for method, chunks in all_results.items():
719
- metrics = visualizer.calculate_advanced_metrics(chunks)
720
  comparison_data.append({
721
  'Method': method,
722
  'Chunks': metrics.get('total_chunks', 0),
@@ -727,28 +734,30 @@ def main():
727
  })
728
 
729
  df_comparison = pd.DataFrame(comparison_data)
730
- st.dataframe(df_comparison, use_container_width=True)
731
 
732
- st.subheader("πŸ€– Intelligent Recommendations")
 
733
 
734
  best_consistency = max(all_results.keys(),
735
- key=lambda m: visualizer.calculate_advanced_metrics(all_results[m]).get('size_consistency', 0))
736
 
737
  optimal_size_method = min(all_results.keys(),
738
- key=lambda m: abs(visualizer.calculate_advanced_metrics(all_results[m]).get('avg_chars', 1000) - 600))
739
 
740
  col1, col2 = st.columns(2)
741
 
742
  with col1:
743
  st.success(f"🎯 **Most Consistent**: {best_consistency}")
744
- consistency_score = visualizer.calculate_advanced_metrics(all_results[best_consistency]).get('size_consistency', 0)
745
  st.write(f"Consistency score: {consistency_score:.3f}")
746
 
747
  with col2:
748
  st.info(f"βš–οΈ **Optimal Size**: {optimal_size_method}")
749
- avg_size = visualizer.calculate_advanced_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
750
  st.write(f"Average size: {avg_size:.0f} characters")
751
 
 
752
  st.markdown("### πŸ’‘ Use Case Recommendations")
753
 
754
  recommendations = {
@@ -761,133 +770,84 @@ def main():
761
 
762
  for use_case, recommendation in recommendations.items():
763
  st.markdown(f"- {use_case}: {recommendation}")
764
-
765
- if export_results:
766
- st.subheader("πŸ“€ Export Results")
767
-
768
- report_data = {
769
- 'text_length': len(text),
770
- 'methods_used': list(all_results.keys()),
771
- 'parameters': params,
772
- 'results': {}
773
- }
774
-
775
- for method, chunks in all_results.items():
776
- metrics = visualizer.calculate_advanced_metrics(chunks)
777
- report_data['results'][method] = {
778
- 'chunks': len(chunks),
779
- 'metrics': metrics,
780
- 'chunk_details': chunks
781
- }
782
-
783
- import json
784
- report_json = json.dumps(report_data, indent=2, default=str)
785
-
786
- col1, col2 = st.columns(2)
787
-
788
- with col1:
789
- st.download_button(
790
- "πŸ“‹ Download Analysis Report (JSON)",
791
- data=report_json,
792
- file_name=f"chunk_analysis_{len(text)}_chars.json",
793
- mime="application/json"
794
- )
795
-
796
- with col2:
797
- markdown_report = f"""# Multi-Format Chunk Analysis Report
798
-
799
- ## Text Analysis
800
- - **Length**: {len(text):,} characters
801
- - **Methods**: {', '.join(all_results.keys())}
802
- - **Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
803
-
804
- ## Results Summary
805
- """
806
-
807
- for method, chunks in all_results.items():
808
- metrics = visualizer.calculate_advanced_metrics(chunks)
809
- markdown_report += f"""
810
- ### {method} Method
811
- - **Chunks**: {metrics.get('total_chunks', 0)}
812
- - **Average Size**: {metrics.get('avg_chars', 0):.0f} characters
813
- - **Consistency**: {metrics.get('size_consistency', 0):.3f}
814
- - **Overlap**: {metrics.get('overlap_ratio', 0)*100:.1f}%
815
- """
816
-
817
- st.download_button(
818
- "πŸ“„ Download Summary (Markdown)",
819
- data=markdown_report,
820
- file_name=f"chunk_summary_{len(text)}_chars.md",
821
- mime="text/markdown"
822
- )
823
 
824
  else:
 
825
  st.markdown("""
826
- ## πŸ‘‹ Welcome to the Multi-Format RAG Chunk Visualizer
827
 
828
  This tool analyzes how different chunking strategies split your documents for RAG systems.
829
 
830
- ### πŸš€ Supported File Formats
831
- - **πŸ“„ PDF**: Research papers, reports, documentation
832
- - **πŸ“Š Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
833
- - **πŸ“‹ CSV**: Data exports, logs, structured datasets
834
- - **πŸ“ Word (DOCX)**: Business documents, proposals, manuscripts
835
- - **πŸ“œ Text (TXT)**: Plain text files, code, notes
836
 
837
- ### 🎯 Key Features
838
- - **4 chunking strategies** with real-time comparison
839
- - **Advanced metrics** including consistency and overlap analysis
840
- - **Interactive visualizations** with detailed chunk inspection
841
- - **Export capabilities** for team collaboration
842
- - **Professional recommendations** for different use cases
843
 
844
- ### πŸ’‘ Quick Start
845
- 1. **Upload your file** or use sample text
846
- 2. **Select chunking methods** to compare (2-3 recommended)
847
- 3. **Adjust parameters** for each method
848
- 4. **Analyze results** with comprehensive metrics
 
 
849
 
850
- ### πŸ”§ Chunking Methods Available
851
  - **Fixed Size**: Consistent character-based chunks with word boundaries
852
- - **Sentence-based**: Natural language flow with sentence grouping
853
  - **Paragraph-based**: Document structure preservation
854
  - **Recursive**: Hierarchical splitting with multiple separators
855
 
856
- **Note**: Semantic chunking temporarily disabled in this environment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857
 
858
- Select your settings in the sidebar to begin analysis! πŸ‘ˆ
859
  """)
860
 
861
- # Sample file format examples
862
- st.subheader("πŸ“ Example Use Cases")
863
 
864
  col1, col2, col3 = st.columns(3)
865
 
866
  with col1:
867
  st.markdown("""
868
- **πŸ“„ PDF Files**
869
- - Research papers
870
- - Technical manuals
871
- - Legal documents
872
- - Reports and presentations
873
  """)
874
 
875
  with col2:
876
  st.markdown("""
877
- **πŸ“Š Excel/CSV Files**
878
- - Data tables
879
- - Survey results
880
- - Financial reports
881
- - Product catalogs
882
  """)
883
 
884
  with col3:
885
  st.markdown("""
886
- **πŸ“ Text/Word Files**
887
- - Articles and blogs
888
- - Meeting notes
889
- - Technical documentation
890
- - Business proposals
891
  """)
892
 
893
  if __name__ == "__main__":
 
8
  import io
9
  import time
10
  from typing import List, Dict, Any
 
 
 
 
11
 
12
  # Safe model loading without cache permission issues
13
  @st.cache_resource
14
  def load_sentence_transformer():
15
+ st.info("⚠️ Semantic chunking disabled in this environment")
16
  return None
17
 
18
  @st.cache_resource
 
25
  try:
26
  nltk.download('punkt', quiet=True)
27
  except:
28
+ pass
29
  return nltk
30
  except ImportError:
31
  return None
32
 
33
+ class ChunkVisualizer:
34
  def __init__(self):
35
  self.colors = [
36
  '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
 
50
  def extract_text_from_pdf(self, pdf_file):
51
  """Extract text from PDF file"""
52
  try:
53
+ import PyPDF2
54
  pdf_file.seek(0)
55
  pdf_reader = PyPDF2.PdfReader(pdf_file)
56
  text = ""
 
66
  st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
67
 
68
  if not text.strip():
69
+ st.warning("PDF appears to be image-based or empty.")
70
  return "No extractable text found in PDF document."
71
 
72
  return text.strip()
73
  except Exception as e:
74
  st.error(f"Error reading PDF: {str(e)}")
75
+ return ""
76
 
77
  def extract_text_from_excel(self, excel_file):
78
  """Extract text from Excel file"""
79
  try:
 
80
  excel_file.seek(0)
81
 
 
82
  try:
83
  xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
84
  except:
 
95
  text += f"\n=== Sheet: {sheet_name} ===\n"
96
 
97
  if not df.empty:
 
98
  headers = " | ".join(str(col) for col in df.columns)
99
  text += f"Headers: {headers}\n"
100
  text += "-" * 50 + "\n"
101
 
102
+ max_rows = min(100, len(df))
 
103
  for idx, row in df.head(max_rows).iterrows():
104
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
105
  text += row_text + "\n"
 
108
  text += f"... ({len(df) - max_rows} more rows)\n"
109
  else:
110
  text += "Empty sheet\n"
 
111
  text += "\n"
112
 
113
  return text.strip()
114
  except Exception as e:
115
  st.error(f"Error reading Excel file: {str(e)}")
116
+ return ""
117
 
118
  def extract_text_from_csv(self, csv_file):
119
  """Extract text from CSV file"""
120
  try:
 
121
  csv_file.seek(0)
122
 
 
123
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
124
  try:
125
  csv_file.seek(0)
 
128
  except UnicodeDecodeError:
129
  continue
130
  else:
131
+ df = pd.read_csv(csv_file)
132
 
133
  if df.empty:
134
  return "Empty CSV file"
135
 
136
  st.write(f"πŸ“‹ Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
137
 
 
138
  text = "=== CSV Data ===\n"
139
  headers = " | ".join(str(col) for col in df.columns)
140
  text += f"Headers: {headers}\n"
141
  text += "-" * 50 + "\n"
142
 
 
143
  max_rows = min(100, len(df))
144
  for _, row in df.head(max_rows).iterrows():
145
  row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
 
151
  return text.strip()
152
  except Exception as e:
153
  st.error(f"Error reading CSV file: {str(e)}")
154
+ return ""
155
 
156
  def extract_text_from_docx(self, docx_file):
157
  """Extract text from Word document"""
158
  try:
159
+ from docx import Document
160
+ docx_file.seek(0)
161
  doc = Document(docx_file)
162
  text = ""
163
 
 
165
  if paragraph.text.strip():
166
  text += paragraph.text + "\n"
167
 
 
168
  for table in doc.tables:
169
  text += "\n=== Table ===\n"
170
  for row in table.rows:
 
203
  chunk = text[start:]
204
  else:
205
  chunk = text[start:end]
 
206
  if not text[end].isspace():
207
  last_space = chunk.rfind(' ')
208
  if last_space > chunk_size * 0.7:
 
235
  chunk_sentences = sentences[i:i + sentences_per_chunk]
236
  chunk_text = ' '.join(chunk_sentences)
237
 
 
238
  start_pos = text.find(chunk_sentences[0], current_pos)
239
  if start_pos == -1:
240
  start_pos = current_pos
 
283
 
284
  return chunks
285
 
 
 
 
 
 
286
  def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
287
  """Hierarchical text splitting with multiple separators"""
288
  separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
 
349
 
350
  return chunks
351
 
352
+ def calculate_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
353
  """Calculate comprehensive chunk metrics"""
354
  if not chunks:
355
  return {}
 
365
  overlap_ratio = max(0, (total_chars - text_length) / text_length)
366
 
367
  char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
 
368
 
369
  return {
370
  'total_chunks': len(chunks),
 
375
  'avg_words': np.mean(word_counts),
376
  'std_words': np.std(word_counts),
377
  'char_cv': char_cv,
 
378
  'overlap_ratio': overlap_ratio,
379
  'size_consistency': 1 - char_cv,
380
  'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
381
  }
382
 
383
+ def visualize_chunks(self, chunks: List[Dict]):
384
+ """Display chunks with color coding"""
385
  if not chunks:
386
  st.write("No chunks to display")
387
  return
388
 
389
+ st.markdown("### 🎨 Chunk Visualization")
390
 
391
  for i, chunk in enumerate(chunks):
392
  color = self.colors[i % len(self.colors)]
393
 
 
 
394
  st.markdown(f"""
395
  <div style='background: linear-gradient(135deg, {color}15, {color}25);
396
  border-left: 5px solid {color};
 
409
  <div style='color: #333; line-height: 1.6; font-size: 14px;'>
410
  {chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
411
  </div>
 
 
 
412
  </div>
413
  """, unsafe_allow_html=True)
414
 
415
+ def create_comparison_charts(self, all_results: Dict[str, List[Dict]]):
416
  """Create detailed analysis charts"""
417
  if not all_results:
418
  return
 
421
  size_data = []
422
 
423
  for method, chunks in all_results.items():
424
+ metrics = self.calculate_metrics(chunks)
425
  metrics_data.append({
426
  'Method': method,
427
  'Chunks': metrics.get('total_chunks', 0),
 
492
 
493
  def main():
494
  st.set_page_config(
495
+ page_title="RAG Chunk Visualizer",
496
  page_icon="πŸ”",
497
  layout="wide",
498
  initial_sidebar_state="expanded"
499
  )
500
 
501
+ # Header
502
  col1, col2 = st.columns([3, 1])
503
  with col1:
504
+ st.title("πŸ” RAG Chunk Visualizer")
505
+ st.markdown("**Professional chunking analysis for RAG systems**")
506
 
507
  with col2:
508
  if st.button("ℹ️ About", help="Learn about chunking strategies"):
 
512
  **Sentence-based**: Groups sentences together for semantic coherence
513
  **Paragraph-based**: Respects document structure and topic boundaries
514
  **Recursive**: Hierarchical splitting using multiple separators
 
 
515
  """)
516
 
517
+ visualizer = ChunkVisualizer()
518
 
519
+ # Sidebar for configuration
520
  with st.sidebar:
521
  st.header("βš™οΈ Configuration")
522
 
523
+ # Input method selection
524
  input_method = st.radio(
525
  "Choose input method:",
526
+ ["πŸ“ Upload File", "✏️ Custom Input"],
527
  help="Select how you want to provide text for analysis"
528
  )
529
 
530
+ # File upload or text input
531
+ text = ""
 
 
 
 
 
532
 
533
+ if input_method == "πŸ“ Upload File":
534
+ st.markdown("**File Upload**")
 
 
535
 
 
536
  uploaded_file = st.file_uploader(
537
+ "Choose a file",
538
  type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
539
  help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
540
  )
541
 
542
+ if uploaded_file is not None:
543
+ st.success(f"πŸ“ File loaded: **{uploaded_file.name}**")
544
+
545
+ # Show file info
546
+ with st.expander("File Details", expanded=False):
547
+ st.write(f"**Name:** {uploaded_file.name}")
548
+ st.write(f"**Size:** {len(uploaded_file.getvalue()):,} bytes")
549
+ st.write(f"**Type:** {uploaded_file.type}")
550
+
551
+ # Process the file
552
+ file_name = uploaded_file.name.lower()
553
 
554
  with st.spinner(f"Processing {uploaded_file.name}..."):
555
+ try:
556
+ if file_name.endswith('.txt'):
557
+ uploaded_file.seek(0)
558
+ text = str(uploaded_file.read(), "utf-8")
559
+
560
+ elif file_name.endswith('.pdf'):
561
+ text = visualizer.extract_text_from_pdf(uploaded_file)
562
+
563
+ elif file_name.endswith('.csv'):
564
+ text = visualizer.extract_text_from_csv(uploaded_file)
565
+
566
+ elif file_name.endswith(('.xlsx', '.xls')):
567
+ text = visualizer.extract_text_from_excel(uploaded_file)
568
+
569
+ elif file_name.endswith('.docx'):
570
+ text = visualizer.extract_text_from_docx(uploaded_file)
571
+
572
+ else:
573
+ st.warning("Unsupported file type - trying as text...")
574
+ uploaded_file.seek(0)
575
+ text = str(uploaded_file.read(), "utf-8")
576
+
577
+ except Exception as e:
578
+ st.error(f"Error processing file: {str(e)}")
579
+ text = ""
580
 
581
+ # Show processing results
582
  if text and len(text.strip()) > 0:
583
+ st.success(f"βœ… Extracted {len(text):,} characters")
584
+
585
+ # Show preview
586
+ preview_text = text[:300] + "..." if len(text) > 300 else text
587
+ st.text_area(
588
+ "Content Preview:",
589
+ value=preview_text,
590
+ height=100,
591
+ disabled=True,
592
+ help="First 300 characters of extracted text"
593
+ )
594
  else:
595
+ st.error("❌ No text could be extracted from the file")
 
596
  else:
597
+ st.info("πŸ‘† Choose a file to upload")
598
+
599
+ else: # Custom Input
600
  text = st.text_area(
601
  "Enter your text:",
602
  height=200,
603
+ placeholder="Paste or type your text here to analyze different chunking strategies...",
604
  help="Paste or type the text you want to analyze"
605
  )
606
 
607
+ # Only show chunking options if we have text
608
+ if text and len(text.strip()) > 0:
609
+ st.divider()
610
+
611
+ # Method selection
612
+ st.subheader("πŸ”§ Chunking Methods")
613
+
614
+ method_options = {
615
+ 'Fixed Size': 'Character-based splitting with word boundaries',
616
+ 'Sentence-based': 'Group by sentences for readability',
617
+ 'Paragraph-based': 'Respect document structure',
618
+ 'Recursive': 'Hierarchical splitting with multiple separators'
619
+ }
620
+
621
+ selected_methods = []
622
+ for method, description in method_options.items():
623
+ if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
624
+ selected_methods.append(method)
625
+
626
+ if not selected_methods:
627
+ st.warning("⚠️ Select at least one chunking method")
628
+
629
+ st.divider()
630
+
631
+ # Parameters
632
+ st.subheader("βš™οΈ Parameters")
633
+
634
+ params = {}
635
+
636
+ if 'Fixed Size' in selected_methods:
637
+ st.markdown("**Fixed Size Settings**")
638
+ params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
639
+ params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
640
+
641
+ if 'Sentence-based' in selected_methods:
642
+ st.markdown("**Sentence-based Settings**")
643
+ params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
644
+
645
+ if 'Recursive' in selected_methods:
646
+ st.markdown("**Recursive Settings**")
647
+ params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
648
+ else:
649
+ selected_methods = []
650
+ params = {}
651
 
652
+ # Main content area
653
+ if text and len(text.strip()) > 0 and selected_methods:
654
+ # Process text with selected methods
655
  with st.spinner("Processing chunks..."):
656
  all_results = {}
657
 
 
673
 
674
  all_results[method] = chunks
675
 
676
+ st.success(f"βœ… Processed {len(text):,} characters with {len(selected_methods)} methods")
677
 
678
+ # Display results in tabs
679
  tabs = st.tabs([f"πŸ“Š {method}" for method in selected_methods] + ["πŸ“ˆ Comparison"])
680
 
681
+ # Individual method tabs
682
  for i, (method, chunks) in enumerate(all_results.items()):
683
  with tabs[i]:
684
+ metrics = visualizer.calculate_metrics(chunks)
685
 
686
+ # Metrics display
687
  col1, col2, col3, col4, col5 = st.columns(5)
688
  with col1:
689
  st.metric("Total Chunks", metrics.get('total_chunks', 0))
 
697
  overlap_pct = metrics.get('overlap_ratio', 0) * 100
698
  st.metric("Overlap", f"{overlap_pct:.1f}%")
699
 
700
+ # Visualize chunks
701
+ visualizer.visualize_chunks(chunks)
702
 
703
+ # Size distribution chart
704
  if len(chunks) > 1:
705
  sizes = [chunk['char_count'] for chunk in chunks]
706
  fig = px.histogram(
 
709
  labels={'x': 'Characters', 'y': 'Count'}
710
  )
711
  fig.update_layout(height=300)
712
+ st.plotly_chart(fig, width='stretch')
713
 
714
+ # Comparison tab
715
  with tabs[-1]:
716
  st.header("πŸ“ˆ Comprehensive Analysis")
717
 
718
+ # Comparison charts
719
+ visualizer.create_comparison_charts(all_results)
720
 
721
+ # Metrics table
722
  st.subheader("πŸ“Š Detailed Metrics Comparison")
723
 
724
  comparison_data = []
725
  for method, chunks in all_results.items():
726
+ metrics = visualizer.calculate_metrics(chunks)
727
  comparison_data.append({
728
  'Method': method,
729
  'Chunks': metrics.get('total_chunks', 0),
 
734
  })
735
 
736
  df_comparison = pd.DataFrame(comparison_data)
737
+ st.dataframe(df_comparison, width='stretch')
738
 
739
+ # Recommendations
740
+ st.subheader("πŸ’‘ Recommendations")
741
 
742
  best_consistency = max(all_results.keys(),
743
+ key=lambda m: visualizer.calculate_metrics(all_results[m]).get('size_consistency', 0))
744
 
745
  optimal_size_method = min(all_results.keys(),
746
+ key=lambda m: abs(visualizer.calculate_metrics(all_results[m]).get('avg_chars', 1000) - 600))
747
 
748
  col1, col2 = st.columns(2)
749
 
750
  with col1:
751
  st.success(f"🎯 **Most Consistent**: {best_consistency}")
752
+ consistency_score = visualizer.calculate_metrics(all_results[best_consistency]).get('size_consistency', 0)
753
  st.write(f"Consistency score: {consistency_score:.3f}")
754
 
755
  with col2:
756
  st.info(f"βš–οΈ **Optimal Size**: {optimal_size_method}")
757
+ avg_size = visualizer.calculate_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
758
  st.write(f"Average size: {avg_size:.0f} characters")
759
 
760
+ # Use case recommendations
761
  st.markdown("### πŸ’‘ Use Case Recommendations")
762
 
763
  recommendations = {
 
770
 
771
  for use_case, recommendation in recommendations.items():
772
  st.markdown(f"- {use_case}: {recommendation}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
 
774
  else:
775
+ # Welcome screen when no text is provided
776
  st.markdown("""
777
+ ## πŸ‘‹ Welcome to the RAG Chunk Visualizer
778
 
779
  This tool analyzes how different chunking strategies split your documents for RAG systems.
780
 
781
+ ### πŸš€ Getting Started
 
 
 
 
 
782
 
783
+ **Step 1:** Choose your input method in the sidebar:
784
+ - **πŸ“ Upload File**: Support for PDF, Excel, CSV, Word, and text files
785
+ - **✏️ Custom Input**: Paste or type your own text
 
 
 
786
 
787
+ **Step 2:** Select chunking methods to compare (2-3 recommended)
788
+
789
+ **Step 3:** Adjust parameters for each method
790
+
791
+ **Step 4:** Analyze results with comprehensive metrics and visualizations
792
+
793
+ ### πŸ”§ Available Chunking Methods
794
 
 
795
  - **Fixed Size**: Consistent character-based chunks with word boundaries
796
+ - **Sentence-based**: Natural language flow with sentence grouping
797
  - **Paragraph-based**: Document structure preservation
798
  - **Recursive**: Hierarchical splitting with multiple separators
799
 
800
+ ### 🎯 Key Features
801
+
802
+ - **Real-time comparison** of different chunking strategies
803
+ - **Advanced metrics** including consistency scores and overlap analysis
804
+ - **Interactive visualizations** with detailed chunk inspection
805
+ - **Professional recommendations** for different use cases
806
+ - **Multi-format support** for various document types
807
+
808
+ ### πŸ“ Supported File Formats
809
+
810
+ - **πŸ“„ PDF**: Research papers, reports, documentation
811
+ - **πŸ“Š Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
812
+ - **πŸ“‹ CSV**: Data exports, logs, structured datasets
813
+ - **πŸ“ Word (DOCX)**: Business documents, proposals, manuscripts
814
+ - **πŸ“œ Text (TXT)**: Plain text files, code, notes
815
+
816
+ ---
817
 
818
+ **Ready to begin?** Select your input method in the sidebar! πŸ‘ˆ
819
  """)
820
 
821
+ # Show example use cases
822
+ st.subheader("πŸ’‘ Example Use Cases")
823
 
824
  col1, col2, col3 = st.columns(3)
825
 
826
  with col1:
827
  st.markdown("""
828
+ **πŸ” RAG Optimization**
829
+ - Find optimal chunk sizes
830
+ - Minimize overlap issues
831
+ - Improve retrieval accuracy
832
+ - Balance context vs precision
833
  """)
834
 
835
  with col2:
836
  st.markdown("""
837
+ **πŸ“š Document Processing**
838
+ - Preserve document structure
839
+ - Handle different file formats
840
+ - Maintain readability
841
+ - Process large documents
842
  """)
843
 
844
  with col3:
845
  st.markdown("""
846
+ **πŸ€– LLM Integration**
847
+ - Manage token limits
848
+ - Optimize context windows
849
+ - Improve response quality
850
+ - Reduce processing costs
851
  """)
852
 
853
  if __name__ == "__main__":