Spaces:
Runtime error
Runtime error
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +885 -31
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,894 @@
|
|
| 1 |
-
import
|
|
|
|
| 2 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import re
|
| 3 |
import numpy as np
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
from plotly.subplots import make_subplots
|
| 7 |
import pandas as pd
|
| 8 |
+
import io
|
| 9 |
+
import time
|
| 10 |
+
from typing import List, Dict, Any
|
| 11 |
+
import PyPDF2
|
| 12 |
+
import openpyxl
|
| 13 |
+
from docx import Document
|
| 14 |
+
import csv
|
| 15 |
|
| 16 |
+
# Safe model loading without cache permission issues
@st.cache_resource
def load_sentence_transformer():
    """Stub loader for the sentence-transformer model.

    Embedding models cannot be cached in this environment, so semantic
    chunking is disabled; the caller receives None and must fall back.
    """
    st.info("β οΈ Semantic chunking disabled in HuggingFace environment")
    return None
| 22 |
+
@st.cache_resource
def load_nltk():
    """Import NLTK and make sure the 'punkt' tokenizer data is available.

    Returns:
        The imported ``nltk`` module, or ``None`` when NLTK is not installed.

    Download failures are deliberately tolerated: sentence splitting falls
    back to a regex-based splitter when punkt is unavailable.
    """
    try:
        import nltk
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            try:
                nltk.download('punkt', quiet=True)
            # Narrowed from a bare `except:` — only swallow download errors
            # (offline host, read-only cache), never KeyboardInterrupt etc.
            except Exception:
                pass  # Skip if download fails
        return nltk
    except ImportError:
        return None
| 37 |
+
class ProductionChunkVisualizer:
|
| 38 |
+
def __init__(self):
|
| 39 |
+
self.colors = [
|
| 40 |
+
'#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
|
| 41 |
+
'#FD79A8', '#A29BFE', '#6C5CE7', '#74B9FF', '#00B894'
|
| 42 |
+
]
|
| 43 |
+
self.model = None
|
| 44 |
+
self.nltk = None
|
| 45 |
+
|
| 46 |
+
def initialize_models(self):
|
| 47 |
+
"""Lazy load models only when needed"""
|
| 48 |
+
if self.model is None:
|
| 49 |
+
self.model = load_sentence_transformer()
|
| 50 |
+
|
| 51 |
+
if self.nltk is None:
|
| 52 |
+
self.nltk = load_nltk()
|
| 53 |
+
|
| 54 |
+
def extract_text_from_pdf(self, pdf_file):
|
| 55 |
+
"""Extract text from PDF file"""
|
| 56 |
+
try:
|
| 57 |
+
# Reset file pointer to beginning
|
| 58 |
+
pdf_file.seek(0)
|
| 59 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 60 |
+
text = ""
|
| 61 |
+
|
| 62 |
+
st.write(f"π Processing PDF with {len(pdf_reader.pages)} pages...")
|
| 63 |
+
|
| 64 |
+
for page_num, page in enumerate(pdf_reader.pages):
|
| 65 |
+
try:
|
| 66 |
+
page_text = page.extract_text()
|
| 67 |
+
if page_text.strip():
|
| 68 |
+
text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
|
| 69 |
+
except Exception as e:
|
| 70 |
+
st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
|
| 71 |
+
|
| 72 |
+
if not text.strip():
|
| 73 |
+
st.warning("PDF appears to be image-based or empty. No text extracted.")
|
| 74 |
+
return "No extractable text found in PDF document."
|
| 75 |
+
|
| 76 |
+
return text.strip()
|
| 77 |
+
except Exception as e:
|
| 78 |
+
st.error(f"Error reading PDF: {str(e)}")
|
| 79 |
+
return f"PDF processing error: {str(e)}"
|
| 80 |
+
|
| 81 |
+
def extract_text_from_excel(self, excel_file):
|
| 82 |
+
"""Extract text from Excel file"""
|
| 83 |
+
try:
|
| 84 |
+
# Reset file pointer to beginning
|
| 85 |
+
excel_file.seek(0)
|
| 86 |
+
|
| 87 |
+
# Try different engines
|
| 88 |
+
try:
|
| 89 |
+
xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
|
| 90 |
+
except:
|
| 91 |
+
try:
|
| 92 |
+
xl_data = pd.read_excel(excel_file, sheet_name=None, engine='xlrd')
|
| 93 |
+
except:
|
| 94 |
+
xl_data = pd.read_excel(excel_file, sheet_name=None)
|
| 95 |
+
|
| 96 |
+
text = ""
|
| 97 |
+
sheet_count = len(xl_data)
|
| 98 |
+
st.write(f"π Processing Excel file with {sheet_count} sheet(s)...")
|
| 99 |
+
|
| 100 |
+
for sheet_name, df in xl_data.items():
|
| 101 |
+
text += f"\n=== Sheet: {sheet_name} ===\n"
|
| 102 |
+
|
| 103 |
+
if not df.empty:
|
| 104 |
+
# Add column headers
|
| 105 |
+
headers = " | ".join(str(col) for col in df.columns)
|
| 106 |
+
text += f"Headers: {headers}\n"
|
| 107 |
+
text += "-" * 50 + "\n"
|
| 108 |
+
|
| 109 |
+
# Add data rows (limit to prevent massive output)
|
| 110 |
+
max_rows = min(100, len(df)) # Limit to 100 rows per sheet
|
| 111 |
+
for idx, row in df.head(max_rows).iterrows():
|
| 112 |
+
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
| 113 |
+
text += row_text + "\n"
|
| 114 |
+
|
| 115 |
+
if len(df) > max_rows:
|
| 116 |
+
text += f"... ({len(df) - max_rows} more rows)\n"
|
| 117 |
+
else:
|
| 118 |
+
text += "Empty sheet\n"
|
| 119 |
+
|
| 120 |
+
text += "\n"
|
| 121 |
+
|
| 122 |
+
return text.strip()
|
| 123 |
+
except Exception as e:
|
| 124 |
+
st.error(f"Error reading Excel file: {str(e)}")
|
| 125 |
+
return f"Excel processing error: {str(e)}"
|
| 126 |
+
|
| 127 |
+
def extract_text_from_csv(self, csv_file):
|
| 128 |
+
"""Extract text from CSV file"""
|
| 129 |
+
try:
|
| 130 |
+
# Reset file pointer to beginning
|
| 131 |
+
csv_file.seek(0)
|
| 132 |
+
|
| 133 |
+
# Try different encodings
|
| 134 |
+
for encoding in ['utf-8', 'latin-1', 'cp1252']:
|
| 135 |
+
try:
|
| 136 |
+
csv_file.seek(0)
|
| 137 |
+
df = pd.read_csv(csv_file, encoding=encoding)
|
| 138 |
+
break
|
| 139 |
+
except UnicodeDecodeError:
|
| 140 |
+
continue
|
| 141 |
+
else:
|
| 142 |
+
df = pd.read_csv(csv_file) # Default encoding
|
| 143 |
+
|
| 144 |
+
if df.empty:
|
| 145 |
+
return "Empty CSV file"
|
| 146 |
+
|
| 147 |
+
st.write(f"π Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
|
| 148 |
+
|
| 149 |
+
# Create readable text format
|
| 150 |
+
text = "=== CSV Data ===\n"
|
| 151 |
+
headers = " | ".join(str(col) for col in df.columns)
|
| 152 |
+
text += f"Headers: {headers}\n"
|
| 153 |
+
text += "-" * 50 + "\n"
|
| 154 |
+
|
| 155 |
+
# Limit rows to prevent massive output
|
| 156 |
+
max_rows = min(100, len(df))
|
| 157 |
+
for _, row in df.head(max_rows).iterrows():
|
| 158 |
+
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
| 159 |
+
text += row_text + "\n"
|
| 160 |
+
|
| 161 |
+
if len(df) > max_rows:
|
| 162 |
+
text += f"... ({len(df) - max_rows} more rows)\n"
|
| 163 |
+
|
| 164 |
+
return text.strip()
|
| 165 |
+
except Exception as e:
|
| 166 |
+
st.error(f"Error reading CSV file: {str(e)}")
|
| 167 |
+
return f"CSV processing error: {str(e)}"
|
| 168 |
+
|
| 169 |
+
def extract_text_from_docx(self, docx_file):
|
| 170 |
+
"""Extract text from Word document"""
|
| 171 |
+
try:
|
| 172 |
+
doc = Document(docx_file)
|
| 173 |
+
text = ""
|
| 174 |
+
|
| 175 |
+
for paragraph in doc.paragraphs:
|
| 176 |
+
if paragraph.text.strip():
|
| 177 |
+
text += paragraph.text + "\n"
|
| 178 |
+
|
| 179 |
+
# Also extract text from tables
|
| 180 |
+
for table in doc.tables:
|
| 181 |
+
text += "\n=== Table ===\n"
|
| 182 |
+
for row in table.rows:
|
| 183 |
+
row_text = " | ".join(cell.text.strip() for cell in row.cells)
|
| 184 |
+
text += row_text + "\n"
|
| 185 |
+
text += "\n"
|
| 186 |
+
|
| 187 |
+
return text.strip()
|
| 188 |
+
except Exception as e:
|
| 189 |
+
st.error(f"Error reading Word document: {str(e)}")
|
| 190 |
+
return ""
|
| 191 |
+
|
| 192 |
+
def simple_sentence_split(self, text: str) -> List[str]:
|
| 193 |
+
"""Fallback sentence splitting without NLTK"""
|
| 194 |
+
sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
|
| 195 |
+
return [s.strip() for s in sentences if s.strip()]
|
| 196 |
+
|
| 197 |
+
def robust_sentence_split(self, text: str) -> List[str]:
|
| 198 |
+
"""Use NLTK if available, fallback to regex"""
|
| 199 |
+
if self.nltk:
|
| 200 |
+
try:
|
| 201 |
+
return self.nltk.sent_tokenize(text)
|
| 202 |
+
except:
|
| 203 |
+
pass
|
| 204 |
+
return self.simple_sentence_split(text)
|
| 205 |
+
|
| 206 |
+
def fixed_size_chunking(self, text: str, chunk_size: int, overlap_size: int = 0) -> List[Dict]:
|
| 207 |
+
"""Split text into fixed-size chunks with word boundary respect"""
|
| 208 |
+
chunks = []
|
| 209 |
+
start = 0
|
| 210 |
+
|
| 211 |
+
while start < len(text):
|
| 212 |
+
end = start + chunk_size
|
| 213 |
+
|
| 214 |
+
if end >= len(text):
|
| 215 |
+
chunk = text[start:]
|
| 216 |
+
else:
|
| 217 |
+
chunk = text[start:end]
|
| 218 |
+
# Find last complete word
|
| 219 |
+
if not text[end].isspace():
|
| 220 |
+
last_space = chunk.rfind(' ')
|
| 221 |
+
if last_space > chunk_size * 0.7:
|
| 222 |
+
chunk = chunk[:last_space]
|
| 223 |
+
end = start + last_space
|
| 224 |
+
|
| 225 |
+
if chunk.strip():
|
| 226 |
+
chunks.append({
|
| 227 |
+
'text': chunk.strip(),
|
| 228 |
+
'start': start,
|
| 229 |
+
'end': end if end < len(text) else len(text),
|
| 230 |
+
'method': 'Fixed Size',
|
| 231 |
+
'word_count': len(chunk.split()),
|
| 232 |
+
'char_count': len(chunk.strip())
|
| 233 |
+
})
|
| 234 |
+
|
| 235 |
+
start = end - overlap_size
|
| 236 |
+
if start >= len(text):
|
| 237 |
+
break
|
| 238 |
+
|
| 239 |
+
return chunks
|
| 240 |
+
|
| 241 |
+
def sentence_chunking(self, text: str, sentences_per_chunk: int = 3) -> List[Dict]:
|
| 242 |
+
"""Split text into sentence-based chunks"""
|
| 243 |
+
sentences = self.robust_sentence_split(text)
|
| 244 |
+
chunks = []
|
| 245 |
+
current_pos = 0
|
| 246 |
+
|
| 247 |
+
for i in range(0, len(sentences), sentences_per_chunk):
|
| 248 |
+
chunk_sentences = sentences[i:i + sentences_per_chunk]
|
| 249 |
+
chunk_text = ' '.join(chunk_sentences)
|
| 250 |
+
|
| 251 |
+
# Find actual position in original text
|
| 252 |
+
start_pos = text.find(chunk_sentences[0], current_pos)
|
| 253 |
+
if start_pos == -1:
|
| 254 |
+
start_pos = current_pos
|
| 255 |
+
|
| 256 |
+
end_pos = start_pos + len(chunk_text)
|
| 257 |
+
current_pos = end_pos
|
| 258 |
+
|
| 259 |
+
chunks.append({
|
| 260 |
+
'text': chunk_text,
|
| 261 |
+
'start': start_pos,
|
| 262 |
+
'end': min(end_pos, len(text)),
|
| 263 |
+
'method': 'Sentence-based',
|
| 264 |
+
'sentence_count': len(chunk_sentences),
|
| 265 |
+
'word_count': len(chunk_text.split()),
|
| 266 |
+
'char_count': len(chunk_text)
|
| 267 |
+
})
|
| 268 |
+
|
| 269 |
+
return chunks
|
| 270 |
+
|
| 271 |
+
def paragraph_chunking(self, text: str) -> List[Dict]:
|
| 272 |
+
"""Split text by paragraph boundaries"""
|
| 273 |
+
paragraphs = re.split(r'\n\s*\n', text)
|
| 274 |
+
chunks = []
|
| 275 |
+
current_pos = 0
|
| 276 |
+
|
| 277 |
+
for para in paragraphs:
|
| 278 |
+
para = para.strip()
|
| 279 |
+
if para:
|
| 280 |
+
start_pos = text.find(para, current_pos)
|
| 281 |
+
if start_pos == -1:
|
| 282 |
+
start_pos = current_pos
|
| 283 |
+
|
| 284 |
+
end_pos = start_pos + len(para)
|
| 285 |
+
|
| 286 |
+
chunks.append({
|
| 287 |
+
'text': para,
|
| 288 |
+
'start': start_pos,
|
| 289 |
+
'end': end_pos,
|
| 290 |
+
'method': 'Paragraph-based',
|
| 291 |
+
'paragraph_length': len(para),
|
| 292 |
+
'word_count': len(para.split()),
|
| 293 |
+
'char_count': len(para)
|
| 294 |
+
})
|
| 295 |
+
|
| 296 |
+
current_pos = end_pos
|
| 297 |
+
|
| 298 |
+
return chunks
|
| 299 |
+
|
| 300 |
+
def semantic_chunking(self, text: str, similarity_threshold: float = 0.5) -> List[Dict]:
|
| 301 |
+
"""Disabled semantic chunking - fallback to sentence-based"""
|
| 302 |
+
st.warning("Semantic chunking unavailable in this environment. Using sentence-based fallback.")
|
| 303 |
+
return self.sentence_chunking(text, 3)
|
| 304 |
+
|
| 305 |
+
def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
|
| 306 |
+
"""Hierarchical text splitting with multiple separators"""
|
| 307 |
+
separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
|
| 308 |
+
|
| 309 |
+
def _recursive_split(text: str, separators: List[str], max_size: int, depth: int = 0) -> List[str]:
|
| 310 |
+
if len(text) <= max_size or depth > len(separators):
|
| 311 |
+
return [text]
|
| 312 |
+
|
| 313 |
+
separator = separators[0] if separators else " "
|
| 314 |
+
|
| 315 |
+
if separator not in text:
|
| 316 |
+
if len(separators) > 1:
|
| 317 |
+
return _recursive_split(text, separators[1:], max_size, depth + 1)
|
| 318 |
+
else:
|
| 319 |
+
return [text[i:i+max_size] for i in range(0, len(text), max_size)]
|
| 320 |
+
|
| 321 |
+
parts = text.split(separator)
|
| 322 |
+
result = []
|
| 323 |
+
current_chunk = ""
|
| 324 |
+
|
| 325 |
+
for part in parts:
|
| 326 |
+
potential_chunk = current_chunk + part + separator
|
| 327 |
+
|
| 328 |
+
if len(potential_chunk) <= max_size:
|
| 329 |
+
current_chunk = potential_chunk
|
| 330 |
+
else:
|
| 331 |
+
if current_chunk:
|
| 332 |
+
result.append(current_chunk.rstrip(separator))
|
| 333 |
+
|
| 334 |
+
if len(part) > max_size:
|
| 335 |
+
result.extend(_recursive_split(part, separators[1:], max_size, depth + 1))
|
| 336 |
+
current_chunk = ""
|
| 337 |
+
else:
|
| 338 |
+
current_chunk = part + separator
|
| 339 |
+
|
| 340 |
+
if current_chunk:
|
| 341 |
+
result.append(current_chunk.rstrip(separator))
|
| 342 |
+
|
| 343 |
+
return result
|
| 344 |
+
|
| 345 |
+
split_texts = _recursive_split(text, separators, max_chunk_size)
|
| 346 |
+
chunks = []
|
| 347 |
+
current_pos = 0
|
| 348 |
+
|
| 349 |
+
for chunk_text in split_texts:
|
| 350 |
+
if chunk_text.strip():
|
| 351 |
+
start_pos = text.find(chunk_text, current_pos)
|
| 352 |
+
if start_pos == -1:
|
| 353 |
+
start_pos = current_pos
|
| 354 |
+
|
| 355 |
+
end_pos = start_pos + len(chunk_text)
|
| 356 |
+
|
| 357 |
+
chunks.append({
|
| 358 |
+
'text': chunk_text,
|
| 359 |
+
'start': start_pos,
|
| 360 |
+
'end': end_pos,
|
| 361 |
+
'method': 'Recursive',
|
| 362 |
+
'max_size': max_chunk_size,
|
| 363 |
+
'word_count': len(chunk_text.split()),
|
| 364 |
+
'char_count': len(chunk_text)
|
| 365 |
+
})
|
| 366 |
+
|
| 367 |
+
current_pos = end_pos
|
| 368 |
+
|
| 369 |
+
return chunks
|
| 370 |
+
|
| 371 |
+
def calculate_advanced_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
|
| 372 |
+
"""Calculate comprehensive chunk metrics"""
|
| 373 |
+
if not chunks:
|
| 374 |
+
return {}
|
| 375 |
+
|
| 376 |
+
char_counts = [chunk['char_count'] for chunk in chunks]
|
| 377 |
+
word_counts = [chunk['word_count'] for chunk in chunks]
|
| 378 |
+
|
| 379 |
+
overlap_ratio = 0
|
| 380 |
+
if len(chunks) > 1:
|
| 381 |
+
total_chars = sum(char_counts)
|
| 382 |
+
text_length = max(chunk['end'] for chunk in chunks)
|
| 383 |
+
if text_length > 0:
|
| 384 |
+
overlap_ratio = max(0, (total_chars - text_length) / text_length)
|
| 385 |
+
|
| 386 |
+
char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
|
| 387 |
+
word_cv = np.std(word_counts) / np.mean(word_counts) if np.mean(word_counts) > 0 else 0
|
| 388 |
+
|
| 389 |
+
return {
|
| 390 |
+
'total_chunks': len(chunks),
|
| 391 |
+
'avg_chars': np.mean(char_counts),
|
| 392 |
+
'std_chars': np.std(char_counts),
|
| 393 |
+
'min_chars': min(char_counts),
|
| 394 |
+
'max_chars': max(char_counts),
|
| 395 |
+
'avg_words': np.mean(word_counts),
|
| 396 |
+
'std_words': np.std(word_counts),
|
| 397 |
+
'char_cv': char_cv,
|
| 398 |
+
'word_cv': word_cv,
|
| 399 |
+
'overlap_ratio': overlap_ratio,
|
| 400 |
+
'size_consistency': 1 - char_cv,
|
| 401 |
+
'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
def visualize_chunks_advanced(self, text: str, chunks: List[Dict]):
|
| 405 |
+
"""Advanced chunk visualization"""
|
| 406 |
+
if not chunks:
|
| 407 |
+
st.write("No chunks to display")
|
| 408 |
+
return
|
| 409 |
+
|
| 410 |
+
st.markdown("### π¨ Interactive Chunk Visualization")
|
| 411 |
+
|
| 412 |
+
for i, chunk in enumerate(chunks):
|
| 413 |
+
color = self.colors[i % len(self.colors)]
|
| 414 |
+
|
| 415 |
+
words_per_sentence = chunk['word_count'] / max(1, chunk.get('sentence_count', 1))
|
| 416 |
+
|
| 417 |
+
st.markdown(f"""
|
| 418 |
+
<div style='background: linear-gradient(135deg, {color}15, {color}25);
|
| 419 |
+
border-left: 5px solid {color};
|
| 420 |
+
padding: 15px;
|
| 421 |
+
margin: 10px 0;
|
| 422 |
+
border-radius: 8px;
|
| 423 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
|
| 424 |
+
<div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;'>
|
| 425 |
+
<div style='color: {color}; font-weight: bold; font-size: 14px;'>
|
| 426 |
+
CHUNK {i+1} β’ Position {chunk['start']}-{chunk['end']}
|
| 427 |
+
</div>
|
| 428 |
+
<div style='color: #666; font-size: 12px;'>
|
| 429 |
+
{chunk['char_count']} chars β’ {chunk['word_count']} words
|
| 430 |
+
</div>
|
| 431 |
+
</div>
|
| 432 |
+
<div style='color: #333; line-height: 1.6; font-size: 14px;'>
|
| 433 |
+
{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
|
| 434 |
+
</div>
|
| 435 |
+
<div style='margin-top: 8px; color: #888; font-size: 11px;'>
|
| 436 |
+
Quality: {words_per_sentence:.1f} words/sentence
|
| 437 |
+
</div>
|
| 438 |
+
</div>
|
| 439 |
+
""", unsafe_allow_html=True)
|
| 440 |
+
|
| 441 |
+
def create_comprehensive_charts(self, all_results: Dict[str, List[Dict]]):
|
| 442 |
+
"""Create detailed analysis charts"""
|
| 443 |
+
if not all_results:
|
| 444 |
+
return
|
| 445 |
+
|
| 446 |
+
metrics_data = []
|
| 447 |
+
size_data = []
|
| 448 |
+
|
| 449 |
+
for method, chunks in all_results.items():
|
| 450 |
+
metrics = self.calculate_advanced_metrics(chunks)
|
| 451 |
+
metrics_data.append({
|
| 452 |
+
'Method': method,
|
| 453 |
+
'Chunks': metrics.get('total_chunks', 0),
|
| 454 |
+
'Avg Size': metrics.get('avg_chars', 0),
|
| 455 |
+
'Consistency': metrics.get('size_consistency', 0),
|
| 456 |
+
'Overlap': metrics.get('overlap_ratio', 0)
|
| 457 |
+
})
|
| 458 |
+
|
| 459 |
+
for chunk in chunks:
|
| 460 |
+
size_data.append({
|
| 461 |
+
'Method': method,
|
| 462 |
+
'Size': chunk['char_count'],
|
| 463 |
+
'Words': chunk['word_count']
|
| 464 |
+
})
|
| 465 |
+
|
| 466 |
+
fig = make_subplots(
|
| 467 |
+
rows=2, cols=2,
|
| 468 |
+
subplot_titles=(
|
| 469 |
+
'Chunk Count Comparison',
|
| 470 |
+
'Size Consistency',
|
| 471 |
+
'Size Distribution by Method',
|
| 472 |
+
'Words vs Characters'
|
| 473 |
+
),
|
| 474 |
+
specs=[
|
| 475 |
+
[{"type": "bar"}, {"type": "bar"}],
|
| 476 |
+
[{"type": "box"}, {"type": "scatter"}]
|
| 477 |
+
]
|
| 478 |
+
)
|
| 479 |
+
|
| 480 |
+
df_metrics = pd.DataFrame(metrics_data)
|
| 481 |
+
df_sizes = pd.DataFrame(size_data)
|
| 482 |
+
|
| 483 |
+
# Chart 1: Chunk counts
|
| 484 |
+
fig.add_trace(
|
| 485 |
+
go.Bar(x=df_metrics['Method'], y=df_metrics['Chunks'],
|
| 486 |
+
name='Chunk Count', marker_color='lightblue'),
|
| 487 |
+
row=1, col=1
|
| 488 |
+
)
|
| 489 |
+
|
| 490 |
+
# Chart 2: Consistency scores
|
| 491 |
+
fig.add_trace(
|
| 492 |
+
go.Bar(x=df_metrics['Method'], y=df_metrics['Consistency'],
|
| 493 |
+
name='Consistency', marker_color='lightgreen'),
|
| 494 |
+
row=1, col=2
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
# Chart 3: Size distribution box plots
|
| 498 |
+
for method in df_sizes['Method'].unique():
|
| 499 |
+
method_data = df_sizes[df_sizes['Method'] == method]
|
| 500 |
+
fig.add_trace(
|
| 501 |
+
go.Box(y=method_data['Size'], name=method, boxpoints='outliers'),
|
| 502 |
+
row=2, col=1
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
+
# Chart 4: Words vs Characters scatter
|
| 506 |
+
for method in df_sizes['Method'].unique():
|
| 507 |
+
method_data = df_sizes[df_sizes['Method'] == method]
|
| 508 |
+
fig.add_trace(
|
| 509 |
+
go.Scatter(x=method_data['Words'], y=method_data['Size'],
|
| 510 |
+
mode='markers', name=method, opacity=0.7),
|
| 511 |
+
row=2, col=2
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
fig.update_layout(height=800, showlegend=True)
|
| 515 |
+
fig.update_xaxes(tickangle=45)
|
| 516 |
+
|
| 517 |
+
st.plotly_chart(fig, width='stretch')
|
| 518 |
|
| 519 |
+
def main():
    """Streamlit entry point: configure the page, collect input (sample text,
    uploaded file, or custom text), run the selected chunking methods via
    ProductionChunkVisualizer, and render per-method and comparative analyses.

    Bug fix: the Custom Input branch previously defaulted to
    sample_texts["Business Document"], a key that does not exist
    (the dict defines "Business Report") and raised KeyError at runtime.
    """
    st.set_page_config(
        page_title="Multi-Format RAG Chunk Visualizer",
        page_icon="π",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # --- Header -----------------------------------------------------------
    col1, col2 = st.columns([3, 1])
    with col1:
        st.title("π Multi-Format RAG Chunk Visualizer")
        st.markdown("**Professional chunking analysis with support for PDF, Excel, CSV, Word & Text files**")

    with col2:
        if st.button("βΉοΈ About", help="Learn about chunking strategies"):
            with st.expander("Chunking Methods Explained", expanded=True):
                st.markdown("""
**Fixed Size**: Splits text at character boundaries with word respect
**Sentence-based**: Groups sentences together for semantic coherence
**Paragraph-based**: Respects document structure and topic boundaries
**Recursive**: Hierarchical splitting using multiple separators

*Note: Semantic chunking disabled in this environment*
""")

    visualizer = ProductionChunkVisualizer()

    # --- Sidebar: input selection and parameters --------------------------
    with st.sidebar:
        st.header("βοΈ Configuration")

        input_method = st.radio(
            "Choose input method:",
            ["π Sample Text", "π Upload File", "βοΈ Custom Input"],
            help="Select how you want to provide text for analysis"
        )

        sample_texts = {
            "Research Paper Abstract": """Machine learning has fundamentally transformed the landscape of artificial intelligence research. Recent advances in deep learning architectures, particularly transformer-based models, have demonstrated unprecedented capabilities in natural language understanding and generation. These models leverage attention mechanisms to capture long-range dependencies in sequential data, enabling more sophisticated reasoning and contextual understanding. The implications extend beyond traditional NLP tasks to multimodal applications, including vision-language models and cross-modal reasoning systems. However, significant challenges remain in terms of computational efficiency, interpretability, and robustness to adversarial inputs.""",

            "Technical Documentation": """Installation Prerequisites: Before beginning the installation process, ensure your system meets the following requirements. Python 3.8 or higher must be installed with pip package manager available. Node.js version 16.x or later is required for frontend dependencies. Git version control system should be accessible from command line.\n\nStep 1: Repository Setup\nClone the project repository using the following command: git clone https://github.com/company/rag-system.git. Navigate to the project directory and create a virtual environment: python -m venv rag-env. Activate the virtual environment using the appropriate command for your operating system.\n\nStep 2: Dependency Installation\nInstall Python dependencies by running pip install -r requirements.txt. This will install all necessary packages including transformers, sentence-transformers, and streamlit. For development dependencies, additionally run pip install -r requirements-dev.txt.""",

            "Business Report": """Executive Summary: Q4 2024 Performance Analysis\n\nOur organization achieved exceptional growth in the fourth quarter of 2024, with revenue increasing by 42% year-over-year to reach $3.8 million. This growth was primarily driven by our expanded product portfolio and successful market penetration strategies in the enterprise segment.\n\nKey Performance Indicators demonstrate strong momentum across all business units. Customer acquisition costs decreased by 18% while customer lifetime value increased by 35%, indicating improved operational efficiency and customer satisfaction. Our newly launched AI-powered features contributed significantly to user engagement, with daily active users increasing by 67%.\n\nStrategic Initiatives for 2025 focus on international expansion and technology innovation. We plan to establish operations in three new markets: Germany, Japan, and Australia. Additionally, our R&D investment will increase by 50% to accelerate development of next-generation AI capabilities."""
        }

        if input_method == "π Sample Text":
            selected_sample = st.selectbox("Select sample text:", list(sample_texts.keys()))
            text = sample_texts[selected_sample]
            st.text_area("Preview:", value=text[:200] + "...", height=100, disabled=True)

        elif input_method == "π Upload File":
            uploaded_file = st.file_uploader(
                "Upload document",
                type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
                help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
            )

            if uploaded_file:
                # Dispatch on the browser-reported MIME type; unknown types
                # fall back to sample text so the app keeps working.
                file_type = uploaded_file.type

                with st.spinner(f"Processing {uploaded_file.name}..."):
                    if file_type == "text/plain":
                        text = str(uploaded_file.read(), "utf-8")
                    elif file_type == "application/pdf":
                        text = visualizer.extract_text_from_pdf(uploaded_file)
                    elif file_type == "text/csv":
                        text = visualizer.extract_text_from_csv(uploaded_file)
                    elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                                       "application/vnd.ms-excel"]:
                        text = visualizer.extract_text_from_excel(uploaded_file)
                    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                        text = visualizer.extract_text_from_docx(uploaded_file)
                    else:
                        st.error(f"Unsupported file type: {file_type}")
                        text = sample_texts["Research Paper Abstract"]

                if text and len(text.strip()) > 0:
                    st.success(f"β Extracted {len(text)} characters from {uploaded_file.name}")
                    if len(text) > 1000:
                        st.text_area("Preview:", value=text[:500] + "...", height=100, disabled=True)
                else:
                    # Extraction yielded nothing usable; fall back to a sample.
                    st.error("No text could be extracted from the file")
                    text = sample_texts["Research Paper Abstract"]
            else:
                text = sample_texts["Research Paper Abstract"]
                st.info("Using sample text until file is uploaded")
        else:
            # Custom Input branch. FIX: key was "Business Document" (absent
            # from sample_texts) which raised KeyError; use "Business Report".
            text = st.text_area(
                "Enter your text:",
                height=200,
                value=sample_texts["Business Report"],
                help="Paste or type the text you want to analyze"
            )

        st.divider()

        st.subheader("π§ Chunking Methods")

        method_options = {
            'Fixed Size': 'Character-based splitting with word boundaries',
            'Sentence-based': 'Group by sentences for readability',
            'Paragraph-based': 'Respect document structure',
            'Recursive': 'Hierarchical splitting with multiple separators'
        }

        selected_methods = []
        for method, description in method_options.items():
            if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
                selected_methods.append(method)

        if not selected_methods:
            st.warning("β οΈ Select at least one chunking method")

        st.divider()

        st.subheader("βοΈ Parameters")

        # Only show parameter widgets relevant to the chosen methods.
        params = {}

        if 'Fixed Size' in selected_methods:
            st.markdown("**Fixed Size Settings**")
            params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
            params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)

        if 'Sentence-based' in selected_methods:
            st.markdown("**Sentence-based Settings**")
            params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)

        if 'Recursive' in selected_methods:
            st.markdown("**Recursive Settings**")
            params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)

        with st.expander("π¬ Advanced Options"):
            show_overlap_analysis = st.checkbox("Show overlap analysis", value=True)
            show_quality_metrics = st.checkbox("Show quality metrics", value=True)
            export_results = st.checkbox("Enable result export", value=False)

    # --- Main analysis ----------------------------------------------------
    if text and selected_methods:
        with st.spinner("Processing chunks..."):
            all_results = {}

            for method in selected_methods:
                if method == 'Fixed Size':
                    chunks = visualizer.fixed_size_chunking(
                        text, params.get('chunk_size', 800), params.get('overlap', 100)
                    )
                elif method == 'Sentence-based':
                    chunks = visualizer.sentence_chunking(
                        text, params.get('sentences_per_chunk', 4)
                    )
                elif method == 'Paragraph-based':
                    chunks = visualizer.paragraph_chunking(text)
                elif method == 'Recursive':
                    chunks = visualizer.recursive_chunking(
                        text, params.get('max_recursive_size', 1200)
                    )

                all_results[method] = chunks

        st.success(f"β Processed {len(text)} characters with {len(selected_methods)} methods")

        # One tab per method plus a final cross-method comparison tab.
        tabs = st.tabs([f"π {method}" for method in selected_methods] + ["π Comparison"])

        for i, (method, chunks) in enumerate(all_results.items()):
            with tabs[i]:
                metrics = visualizer.calculate_advanced_metrics(chunks)

                col1, col2, col3, col4, col5 = st.columns(5)
                with col1:
                    st.metric("Total Chunks", metrics.get('total_chunks', 0))
                with col2:
                    st.metric("Avg Characters", f"{metrics.get('avg_chars', 0):.0f}")
                with col3:
                    st.metric("Avg Words", f"{metrics.get('avg_words', 0):.0f}")
                with col4:
                    st.metric("Consistency", f"{metrics.get('size_consistency', 0):.2f}")
                with col5:
                    overlap_pct = metrics.get('overlap_ratio', 0) * 100
                    st.metric("Overlap", f"{overlap_pct:.1f}%")

                visualizer.visualize_chunks_advanced(text, chunks)

                if len(chunks) > 1:
                    sizes = [chunk['char_count'] for chunk in chunks]
                    fig = px.histogram(
                        x=sizes, nbins=min(20, len(chunks)),
                        title=f"{method} - Chunk Size Distribution",
                        labels={'x': 'Characters', 'y': 'Count'}
                    )
                    fig.update_layout(height=300)
                    st.plotly_chart(fig, use_container_width=True)

        with tabs[-1]:
            st.header("π Comprehensive Analysis")

            visualizer.create_comprehensive_charts(all_results)

            st.subheader("π Detailed Metrics Comparison")

            comparison_data = []
            for method, chunks in all_results.items():
                metrics = visualizer.calculate_advanced_metrics(chunks)
                comparison_data.append({
                    'Method': method,
                    'Chunks': metrics.get('total_chunks', 0),
                    'Avg Size': f"{metrics.get('avg_chars', 0):.0f}",
                    'Size StdDev': f"{metrics.get('std_chars', 0):.0f}",
                    'Consistency': f"{metrics.get('size_consistency', 0):.3f}",
                    'Overlap %': f"{metrics.get('overlap_ratio', 0)*100:.1f}%"
                })

            df_comparison = pd.DataFrame(comparison_data)
            st.dataframe(df_comparison, use_container_width=True)

            st.subheader("π€ Intelligent Recommendations")

            # Highest size-consistency score across methods.
            best_consistency = max(all_results.keys(),
                key=lambda m: visualizer.calculate_advanced_metrics(all_results[m]).get('size_consistency', 0))

            # Method whose average chunk size is closest to 600 chars.
            optimal_size_method = min(all_results.keys(),
                key=lambda m: abs(visualizer.calculate_advanced_metrics(all_results[m]).get('avg_chars', 1000) - 600))

            col1, col2 = st.columns(2)

            with col1:
                st.success(f"π― **Most Consistent**: {best_consistency}")
                consistency_score = visualizer.calculate_advanced_metrics(all_results[best_consistency]).get('size_consistency', 0)
                st.write(f"Consistency score: {consistency_score:.3f}")

            with col2:
                st.info(f"βοΈ **Optimal Size**: {optimal_size_method}")
                avg_size = visualizer.calculate_advanced_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
                st.write(f"Average size: {avg_size:.0f} characters")

            st.markdown("### π‘ Use Case Recommendations")

            recommendations = {
                "π **Search & Retrieval**": "Use Fixed Size (600-800 chars) for consistent embedding",
                "π **Document Processing**": "Use Paragraph-based to preserve structure",
                "π€ **LLM Input**": "Use Fixed Size (800-1200 chars) for token management",
                "π **Reading Comprehension**": "Use Sentence-based for natural flow",
                "π **Data Pipeline**": "Use Recursive for robust processing"
            }

            for use_case, recommendation in recommendations.items():
                st.markdown(f"- {use_case}: {recommendation}")

        # --- Optional export of the full analysis -------------------------
        if export_results:
            st.subheader("π€ Export Results")

            report_data = {
                'text_length': len(text),
                'methods_used': list(all_results.keys()),
                'parameters': params,
                'results': {}
            }

            for method, chunks in all_results.items():
                metrics = visualizer.calculate_advanced_metrics(chunks)
                report_data['results'][method] = {
                    'chunks': len(chunks),
                    'metrics': metrics,
                    'chunk_details': chunks
                }

            import json
            report_json = json.dumps(report_data, indent=2, default=str)

            col1, col2 = st.columns(2)

            with col1:
                st.download_button(
                    "π Download Analysis Report (JSON)",
                    data=report_json,
                    file_name=f"chunk_analysis_{len(text)}_chars.json",
                    mime="application/json"
                )

            with col2:
                markdown_report = f"""# Multi-Format Chunk Analysis Report

## Text Analysis
- **Length**: {len(text):,} characters
- **Methods**: {', '.join(all_results.keys())}
- **Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}

## Results Summary
"""

                for method, chunks in all_results.items():
                    metrics = visualizer.calculate_advanced_metrics(chunks)
                    markdown_report += f"""
### {method} Method
- **Chunks**: {metrics.get('total_chunks', 0)}
- **Average Size**: {metrics.get('avg_chars', 0):.0f} characters
- **Consistency**: {metrics.get('size_consistency', 0):.3f}
- **Overlap**: {metrics.get('overlap_ratio', 0)*100:.1f}%
"""

                st.download_button(
                    "π Download Summary (Markdown)",
                    data=markdown_report,
                    file_name=f"chunk_summary_{len(text)}_chars.md",
                    mime="text/markdown"
                )

    else:
        # --- Welcome / landing view when no analysis is running -----------
        st.markdown("""
## π Welcome to the Multi-Format RAG Chunk Visualizer

This tool analyzes how different chunking strategies split your documents for RAG systems.

### π Supported File Formats
- **π PDF**: Research papers, reports, documentation
- **π Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
- **π CSV**: Data exports, logs, structured datasets
- **π Word (DOCX)**: Business documents, proposals, manuscripts
- **π Text (TXT)**: Plain text files, code, notes

### π― Key Features
- **4 chunking strategies** with real-time comparison
- **Advanced metrics** including consistency and overlap analysis
- **Interactive visualizations** with detailed chunk inspection
- **Export capabilities** for team collaboration
- **Professional recommendations** for different use cases

### π‘ Quick Start
1. **Upload your file** or use sample text
2. **Select chunking methods** to compare (2-3 recommended)
3. **Adjust parameters** for each method
4. **Analyze results** with comprehensive metrics

### π§ Chunking Methods Available
- **Fixed Size**: Consistent character-based chunks with word boundaries
- **Sentence-based**: Natural language flow with sentence grouping
- **Paragraph-based**: Document structure preservation
- **Recursive**: Hierarchical splitting with multiple separators

**Note**: Semantic chunking temporarily disabled in this environment

Select your settings in the sidebar to begin analysis! π
""")

        # Sample file format examples
        st.subheader("π Example Use Cases")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("""
**π PDF Files**
- Research papers
- Technical manuals
- Legal documents
- Reports and presentations
""")

        with col2:
            st.markdown("""
**π Excel/CSV Files**
- Data tables
- Survey results
- Financial reports
- Product catalogs
""")

        with col3:
            st.markdown("""
**π Text/Word Files**
- Articles and blogs
- Meeting notes
- Technical documentation
- Business proposals
""")
|
| 893 |
+
# Script entry point: run the Streamlit app only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|