Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +202 -242
src/streamlit_app.py
CHANGED
|
@@ -8,15 +8,11 @@ import pandas as pd
|
|
| 8 |
import io
|
| 9 |
import time
|
| 10 |
from typing import List, Dict, Any
|
| 11 |
-
import PyPDF2
|
| 12 |
-
import openpyxl
|
| 13 |
-
from docx import Document
|
| 14 |
-
import csv
|
| 15 |
|
| 16 |
# Safe model loading without cache permission issues
|
| 17 |
@st.cache_resource
|
| 18 |
def load_sentence_transformer():
|
| 19 |
-
st.info("β οΈ Semantic chunking disabled in
|
| 20 |
return None
|
| 21 |
|
| 22 |
@st.cache_resource
|
|
@@ -29,12 +25,12 @@ def load_nltk():
|
|
| 29 |
try:
|
| 30 |
nltk.download('punkt', quiet=True)
|
| 31 |
except:
|
| 32 |
-
pass
|
| 33 |
return nltk
|
| 34 |
except ImportError:
|
| 35 |
return None
|
| 36 |
|
| 37 |
-
class
|
| 38 |
def __init__(self):
|
| 39 |
self.colors = [
|
| 40 |
'#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
|
|
@@ -54,7 +50,7 @@ class ProductionChunkVisualizer:
|
|
| 54 |
def extract_text_from_pdf(self, pdf_file):
|
| 55 |
"""Extract text from PDF file"""
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
pdf_file.seek(0)
|
| 59 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 60 |
text = ""
|
|
@@ -70,21 +66,19 @@ class ProductionChunkVisualizer:
|
|
| 70 |
st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
|
| 71 |
|
| 72 |
if not text.strip():
|
| 73 |
-
st.warning("PDF appears to be image-based or empty.
|
| 74 |
return "No extractable text found in PDF document."
|
| 75 |
|
| 76 |
return text.strip()
|
| 77 |
except Exception as e:
|
| 78 |
st.error(f"Error reading PDF: {str(e)}")
|
| 79 |
-
return
|
| 80 |
|
| 81 |
def extract_text_from_excel(self, excel_file):
|
| 82 |
"""Extract text from Excel file"""
|
| 83 |
try:
|
| 84 |
-
# Reset file pointer to beginning
|
| 85 |
excel_file.seek(0)
|
| 86 |
|
| 87 |
-
# Try different engines
|
| 88 |
try:
|
| 89 |
xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
|
| 90 |
except:
|
|
@@ -101,13 +95,11 @@ class ProductionChunkVisualizer:
|
|
| 101 |
text += f"\n=== Sheet: {sheet_name} ===\n"
|
| 102 |
|
| 103 |
if not df.empty:
|
| 104 |
-
# Add column headers
|
| 105 |
headers = " | ".join(str(col) for col in df.columns)
|
| 106 |
text += f"Headers: {headers}\n"
|
| 107 |
text += "-" * 50 + "\n"
|
| 108 |
|
| 109 |
-
|
| 110 |
-
max_rows = min(100, len(df)) # Limit to 100 rows per sheet
|
| 111 |
for idx, row in df.head(max_rows).iterrows():
|
| 112 |
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
| 113 |
text += row_text + "\n"
|
|
@@ -116,21 +108,18 @@ class ProductionChunkVisualizer:
|
|
| 116 |
text += f"... ({len(df) - max_rows} more rows)\n"
|
| 117 |
else:
|
| 118 |
text += "Empty sheet\n"
|
| 119 |
-
|
| 120 |
text += "\n"
|
| 121 |
|
| 122 |
return text.strip()
|
| 123 |
except Exception as e:
|
| 124 |
st.error(f"Error reading Excel file: {str(e)}")
|
| 125 |
-
return
|
| 126 |
|
| 127 |
def extract_text_from_csv(self, csv_file):
|
| 128 |
"""Extract text from CSV file"""
|
| 129 |
try:
|
| 130 |
-
# Reset file pointer to beginning
|
| 131 |
csv_file.seek(0)
|
| 132 |
|
| 133 |
-
# Try different encodings
|
| 134 |
for encoding in ['utf-8', 'latin-1', 'cp1252']:
|
| 135 |
try:
|
| 136 |
csv_file.seek(0)
|
|
@@ -139,20 +128,18 @@ class ProductionChunkVisualizer:
|
|
| 139 |
except UnicodeDecodeError:
|
| 140 |
continue
|
| 141 |
else:
|
| 142 |
-
df = pd.read_csv(csv_file)
|
| 143 |
|
| 144 |
if df.empty:
|
| 145 |
return "Empty CSV file"
|
| 146 |
|
| 147 |
st.write(f"π Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
|
| 148 |
|
| 149 |
-
# Create readable text format
|
| 150 |
text = "=== CSV Data ===\n"
|
| 151 |
headers = " | ".join(str(col) for col in df.columns)
|
| 152 |
text += f"Headers: {headers}\n"
|
| 153 |
text += "-" * 50 + "\n"
|
| 154 |
|
| 155 |
-
# Limit rows to prevent massive output
|
| 156 |
max_rows = min(100, len(df))
|
| 157 |
for _, row in df.head(max_rows).iterrows():
|
| 158 |
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
|
@@ -164,11 +151,13 @@ class ProductionChunkVisualizer:
|
|
| 164 |
return text.strip()
|
| 165 |
except Exception as e:
|
| 166 |
st.error(f"Error reading CSV file: {str(e)}")
|
| 167 |
-
return
|
| 168 |
|
| 169 |
def extract_text_from_docx(self, docx_file):
|
| 170 |
"""Extract text from Word document"""
|
| 171 |
try:
|
|
|
|
|
|
|
| 172 |
doc = Document(docx_file)
|
| 173 |
text = ""
|
| 174 |
|
|
@@ -176,7 +165,6 @@ class ProductionChunkVisualizer:
|
|
| 176 |
if paragraph.text.strip():
|
| 177 |
text += paragraph.text + "\n"
|
| 178 |
|
| 179 |
-
# Also extract text from tables
|
| 180 |
for table in doc.tables:
|
| 181 |
text += "\n=== Table ===\n"
|
| 182 |
for row in table.rows:
|
|
@@ -215,7 +203,6 @@ class ProductionChunkVisualizer:
|
|
| 215 |
chunk = text[start:]
|
| 216 |
else:
|
| 217 |
chunk = text[start:end]
|
| 218 |
-
# Find last complete word
|
| 219 |
if not text[end].isspace():
|
| 220 |
last_space = chunk.rfind(' ')
|
| 221 |
if last_space > chunk_size * 0.7:
|
|
@@ -248,7 +235,6 @@ class ProductionChunkVisualizer:
|
|
| 248 |
chunk_sentences = sentences[i:i + sentences_per_chunk]
|
| 249 |
chunk_text = ' '.join(chunk_sentences)
|
| 250 |
|
| 251 |
-
# Find actual position in original text
|
| 252 |
start_pos = text.find(chunk_sentences[0], current_pos)
|
| 253 |
if start_pos == -1:
|
| 254 |
start_pos = current_pos
|
|
@@ -297,11 +283,6 @@ class ProductionChunkVisualizer:
|
|
| 297 |
|
| 298 |
return chunks
|
| 299 |
|
| 300 |
-
def semantic_chunking(self, text: str, similarity_threshold: float = 0.5) -> List[Dict]:
|
| 301 |
-
"""Disabled semantic chunking - fallback to sentence-based"""
|
| 302 |
-
st.warning("Semantic chunking unavailable in this environment. Using sentence-based fallback.")
|
| 303 |
-
return self.sentence_chunking(text, 3)
|
| 304 |
-
|
| 305 |
def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
|
| 306 |
"""Hierarchical text splitting with multiple separators"""
|
| 307 |
separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
|
|
@@ -368,7 +349,7 @@ class ProductionChunkVisualizer:
|
|
| 368 |
|
| 369 |
return chunks
|
| 370 |
|
| 371 |
-
def
|
| 372 |
"""Calculate comprehensive chunk metrics"""
|
| 373 |
if not chunks:
|
| 374 |
return {}
|
|
@@ -384,7 +365,6 @@ class ProductionChunkVisualizer:
|
|
| 384 |
overlap_ratio = max(0, (total_chars - text_length) / text_length)
|
| 385 |
|
| 386 |
char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
|
| 387 |
-
word_cv = np.std(word_counts) / np.mean(word_counts) if np.mean(word_counts) > 0 else 0
|
| 388 |
|
| 389 |
return {
|
| 390 |
'total_chunks': len(chunks),
|
|
@@ -395,25 +375,22 @@ class ProductionChunkVisualizer:
|
|
| 395 |
'avg_words': np.mean(word_counts),
|
| 396 |
'std_words': np.std(word_counts),
|
| 397 |
'char_cv': char_cv,
|
| 398 |
-
'word_cv': word_cv,
|
| 399 |
'overlap_ratio': overlap_ratio,
|
| 400 |
'size_consistency': 1 - char_cv,
|
| 401 |
'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
|
| 402 |
}
|
| 403 |
|
| 404 |
-
def
|
| 405 |
-
"""
|
| 406 |
if not chunks:
|
| 407 |
st.write("No chunks to display")
|
| 408 |
return
|
| 409 |
|
| 410 |
-
st.markdown("### π¨
|
| 411 |
|
| 412 |
for i, chunk in enumerate(chunks):
|
| 413 |
color = self.colors[i % len(self.colors)]
|
| 414 |
|
| 415 |
-
words_per_sentence = chunk['word_count'] / max(1, chunk.get('sentence_count', 1))
|
| 416 |
-
|
| 417 |
st.markdown(f"""
|
| 418 |
<div style='background: linear-gradient(135deg, {color}15, {color}25);
|
| 419 |
border-left: 5px solid {color};
|
|
@@ -432,13 +409,10 @@ class ProductionChunkVisualizer:
|
|
| 432 |
<div style='color: #333; line-height: 1.6; font-size: 14px;'>
|
| 433 |
{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
|
| 434 |
</div>
|
| 435 |
-
<div style='margin-top: 8px; color: #888; font-size: 11px;'>
|
| 436 |
-
Quality: {words_per_sentence:.1f} words/sentence
|
| 437 |
-
</div>
|
| 438 |
</div>
|
| 439 |
""", unsafe_allow_html=True)
|
| 440 |
|
| 441 |
-
def
|
| 442 |
"""Create detailed analysis charts"""
|
| 443 |
if not all_results:
|
| 444 |
return
|
|
@@ -447,7 +421,7 @@ class ProductionChunkVisualizer:
|
|
| 447 |
size_data = []
|
| 448 |
|
| 449 |
for method, chunks in all_results.items():
|
| 450 |
-
metrics = self.
|
| 451 |
metrics_data.append({
|
| 452 |
'Method': method,
|
| 453 |
'Chunks': metrics.get('total_chunks', 0),
|
|
@@ -518,16 +492,17 @@ class ProductionChunkVisualizer:
|
|
| 518 |
|
| 519 |
def main():
|
| 520 |
st.set_page_config(
|
| 521 |
-
page_title="
|
| 522 |
page_icon="π",
|
| 523 |
layout="wide",
|
| 524 |
initial_sidebar_state="expanded"
|
| 525 |
)
|
| 526 |
|
|
|
|
| 527 |
col1, col2 = st.columns([3, 1])
|
| 528 |
with col1:
|
| 529 |
-
st.title("π
|
| 530 |
-
st.markdown("**Professional chunking analysis
|
| 531 |
|
| 532 |
with col2:
|
| 533 |
if st.button("βΉοΈ About", help="Learn about chunking strategies"):
|
|
@@ -537,122 +512,146 @@ def main():
|
|
| 537 |
**Sentence-based**: Groups sentences together for semantic coherence
|
| 538 |
**Paragraph-based**: Respects document structure and topic boundaries
|
| 539 |
**Recursive**: Hierarchical splitting using multiple separators
|
| 540 |
-
|
| 541 |
-
*Note: Semantic chunking disabled in this environment*
|
| 542 |
""")
|
| 543 |
|
| 544 |
-
visualizer =
|
| 545 |
|
|
|
|
| 546 |
with st.sidebar:
|
| 547 |
st.header("βοΈ Configuration")
|
| 548 |
|
|
|
|
| 549 |
input_method = st.radio(
|
| 550 |
"Choose input method:",
|
| 551 |
-
["
|
| 552 |
help="Select how you want to provide text for analysis"
|
| 553 |
)
|
| 554 |
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
"Technical Documentation": """Installation Prerequisites: Before beginning the installation process, ensure your system meets the following requirements. Python 3.8 or higher must be installed with pip package manager available. Node.js version 16.x or later is required for frontend dependencies. Git version control system should be accessible from command line.\n\nStep 1: Repository Setup\nClone the project repository using the following command: git clone https://github.com/company/rag-system.git. Navigate to the project directory and create a virtual environment: python -m venv rag-env. Activate the virtual environment using the appropriate command for your operating system.\n\nStep 2: Dependency Installation\nInstall Python dependencies by running pip install -r requirements.txt. This will install all necessary packages including transformers, sentence-transformers, and streamlit. For development dependencies, additionally run pip install -r requirements-dev.txt.""",
|
| 559 |
-
|
| 560 |
-
"Business Report": """Executive Summary: Q4 2024 Performance Analysis\n\nOur organization achieved exceptional growth in the fourth quarter of 2024, with revenue increasing by 42% year-over-year to reach $3.8 million. This growth was primarily driven by our expanded product portfolio and successful market penetration strategies in the enterprise segment.\n\nKey Performance Indicators demonstrate strong momentum across all business units. Customer acquisition costs decreased by 18% while customer lifetime value increased by 35%, indicating improved operational efficiency and customer satisfaction. Our newly launched AI-powered features contributed significantly to user engagement, with daily active users increasing by 67%.\n\nStrategic Initiatives for 2025 focus on international expansion and technology innovation. We plan to establish operations in three new markets: Germany, Japan, and Australia. Additionally, our R&D investment will increase by 50% to accelerate development of next-generation AI capabilities."""
|
| 561 |
-
}
|
| 562 |
|
| 563 |
-
if input_method == "
|
| 564 |
-
|
| 565 |
-
text = sample_texts[selected_sample]
|
| 566 |
-
st.text_area("Preview:", value=text[:200] + "...", height=100, disabled=True)
|
| 567 |
|
| 568 |
-
elif input_method == "π Upload File":
|
| 569 |
uploaded_file = st.file_uploader(
|
| 570 |
-
"
|
| 571 |
type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
|
| 572 |
help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
|
| 573 |
)
|
| 574 |
|
| 575 |
-
if uploaded_file:
|
| 576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
|
| 578 |
with st.spinner(f"Processing {uploaded_file.name}..."):
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
|
|
|
| 594 |
if text and len(text.strip()) > 0:
|
| 595 |
-
st.success(f"β
Extracted {len(text)} characters
|
| 596 |
-
|
| 597 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
else:
|
| 599 |
-
st.error("No text could be extracted from the file")
|
| 600 |
-
text = sample_texts["Research Paper Abstract"]
|
| 601 |
else:
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
else:
|
| 605 |
text = st.text_area(
|
| 606 |
"Enter your text:",
|
| 607 |
height=200,
|
| 608 |
-
|
| 609 |
help="Paste or type the text you want to analyze"
|
| 610 |
)
|
| 611 |
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
params
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
|
|
|
|
|
|
| 654 |
|
| 655 |
-
|
|
|
|
|
|
|
| 656 |
with st.spinner("Processing chunks..."):
|
| 657 |
all_results = {}
|
| 658 |
|
|
@@ -674,14 +673,17 @@ def main():
|
|
| 674 |
|
| 675 |
all_results[method] = chunks
|
| 676 |
|
| 677 |
-
st.success(f"β
Processed {len(text)} characters with {len(selected_methods)} methods")
|
| 678 |
|
|
|
|
| 679 |
tabs = st.tabs([f"π {method}" for method in selected_methods] + ["π Comparison"])
|
| 680 |
|
|
|
|
| 681 |
for i, (method, chunks) in enumerate(all_results.items()):
|
| 682 |
with tabs[i]:
|
| 683 |
-
metrics = visualizer.
|
| 684 |
|
|
|
|
| 685 |
col1, col2, col3, col4, col5 = st.columns(5)
|
| 686 |
with col1:
|
| 687 |
st.metric("Total Chunks", metrics.get('total_chunks', 0))
|
|
@@ -695,8 +697,10 @@ def main():
|
|
| 695 |
overlap_pct = metrics.get('overlap_ratio', 0) * 100
|
| 696 |
st.metric("Overlap", f"{overlap_pct:.1f}%")
|
| 697 |
|
| 698 |
-
|
|
|
|
| 699 |
|
|
|
|
| 700 |
if len(chunks) > 1:
|
| 701 |
sizes = [chunk['char_count'] for chunk in chunks]
|
| 702 |
fig = px.histogram(
|
|
@@ -705,18 +709,21 @@ def main():
|
|
| 705 |
labels={'x': 'Characters', 'y': 'Count'}
|
| 706 |
)
|
| 707 |
fig.update_layout(height=300)
|
| 708 |
-
st.plotly_chart(fig,
|
| 709 |
|
|
|
|
| 710 |
with tabs[-1]:
|
| 711 |
st.header("π Comprehensive Analysis")
|
| 712 |
|
| 713 |
-
|
|
|
|
| 714 |
|
|
|
|
| 715 |
st.subheader("π Detailed Metrics Comparison")
|
| 716 |
|
| 717 |
comparison_data = []
|
| 718 |
for method, chunks in all_results.items():
|
| 719 |
-
metrics = visualizer.
|
| 720 |
comparison_data.append({
|
| 721 |
'Method': method,
|
| 722 |
'Chunks': metrics.get('total_chunks', 0),
|
|
@@ -727,28 +734,30 @@ def main():
|
|
| 727 |
})
|
| 728 |
|
| 729 |
df_comparison = pd.DataFrame(comparison_data)
|
| 730 |
-
st.dataframe(df_comparison,
|
| 731 |
|
| 732 |
-
|
|
|
|
| 733 |
|
| 734 |
best_consistency = max(all_results.keys(),
|
| 735 |
-
key=lambda m: visualizer.
|
| 736 |
|
| 737 |
optimal_size_method = min(all_results.keys(),
|
| 738 |
-
key=lambda m: abs(visualizer.
|
| 739 |
|
| 740 |
col1, col2 = st.columns(2)
|
| 741 |
|
| 742 |
with col1:
|
| 743 |
st.success(f"π― **Most Consistent**: {best_consistency}")
|
| 744 |
-
consistency_score = visualizer.
|
| 745 |
st.write(f"Consistency score: {consistency_score:.3f}")
|
| 746 |
|
| 747 |
with col2:
|
| 748 |
st.info(f"βοΈ **Optimal Size**: {optimal_size_method}")
|
| 749 |
-
avg_size = visualizer.
|
| 750 |
st.write(f"Average size: {avg_size:.0f} characters")
|
| 751 |
|
|
|
|
| 752 |
st.markdown("### π‘ Use Case Recommendations")
|
| 753 |
|
| 754 |
recommendations = {
|
|
@@ -761,133 +770,84 @@ def main():
|
|
| 761 |
|
| 762 |
for use_case, recommendation in recommendations.items():
|
| 763 |
st.markdown(f"- {use_case}: {recommendation}")
|
| 764 |
-
|
| 765 |
-
if export_results:
|
| 766 |
-
st.subheader("π€ Export Results")
|
| 767 |
-
|
| 768 |
-
report_data = {
|
| 769 |
-
'text_length': len(text),
|
| 770 |
-
'methods_used': list(all_results.keys()),
|
| 771 |
-
'parameters': params,
|
| 772 |
-
'results': {}
|
| 773 |
-
}
|
| 774 |
-
|
| 775 |
-
for method, chunks in all_results.items():
|
| 776 |
-
metrics = visualizer.calculate_advanced_metrics(chunks)
|
| 777 |
-
report_data['results'][method] = {
|
| 778 |
-
'chunks': len(chunks),
|
| 779 |
-
'metrics': metrics,
|
| 780 |
-
'chunk_details': chunks
|
| 781 |
-
}
|
| 782 |
-
|
| 783 |
-
import json
|
| 784 |
-
report_json = json.dumps(report_data, indent=2, default=str)
|
| 785 |
-
|
| 786 |
-
col1, col2 = st.columns(2)
|
| 787 |
-
|
| 788 |
-
with col1:
|
| 789 |
-
st.download_button(
|
| 790 |
-
"π Download Analysis Report (JSON)",
|
| 791 |
-
data=report_json,
|
| 792 |
-
file_name=f"chunk_analysis_{len(text)}_chars.json",
|
| 793 |
-
mime="application/json"
|
| 794 |
-
)
|
| 795 |
-
|
| 796 |
-
with col2:
|
| 797 |
-
markdown_report = f"""# Multi-Format Chunk Analysis Report
|
| 798 |
-
|
| 799 |
-
## Text Analysis
|
| 800 |
-
- **Length**: {len(text):,} characters
|
| 801 |
-
- **Methods**: {', '.join(all_results.keys())}
|
| 802 |
-
- **Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
|
| 803 |
-
|
| 804 |
-
## Results Summary
|
| 805 |
-
"""
|
| 806 |
-
|
| 807 |
-
for method, chunks in all_results.items():
|
| 808 |
-
metrics = visualizer.calculate_advanced_metrics(chunks)
|
| 809 |
-
markdown_report += f"""
|
| 810 |
-
### {method} Method
|
| 811 |
-
- **Chunks**: {metrics.get('total_chunks', 0)}
|
| 812 |
-
- **Average Size**: {metrics.get('avg_chars', 0):.0f} characters
|
| 813 |
-
- **Consistency**: {metrics.get('size_consistency', 0):.3f}
|
| 814 |
-
- **Overlap**: {metrics.get('overlap_ratio', 0)*100:.1f}%
|
| 815 |
-
"""
|
| 816 |
-
|
| 817 |
-
st.download_button(
|
| 818 |
-
"π Download Summary (Markdown)",
|
| 819 |
-
data=markdown_report,
|
| 820 |
-
file_name=f"chunk_summary_{len(text)}_chars.md",
|
| 821 |
-
mime="text/markdown"
|
| 822 |
-
)
|
| 823 |
|
| 824 |
else:
|
|
|
|
| 825 |
st.markdown("""
|
| 826 |
-
## π Welcome to the
|
| 827 |
|
| 828 |
This tool analyzes how different chunking strategies split your documents for RAG systems.
|
| 829 |
|
| 830 |
-
### π
|
| 831 |
-
- **π PDF**: Research papers, reports, documentation
|
| 832 |
-
- **π Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
|
| 833 |
-
- **π CSV**: Data exports, logs, structured datasets
|
| 834 |
-
- **π Word (DOCX)**: Business documents, proposals, manuscripts
|
| 835 |
-
- **π Text (TXT)**: Plain text files, code, notes
|
| 836 |
|
| 837 |
-
|
| 838 |
-
- **
|
| 839 |
-
- **
|
| 840 |
-
- **Interactive visualizations** with detailed chunk inspection
|
| 841 |
-
- **Export capabilities** for team collaboration
|
| 842 |
-
- **Professional recommendations** for different use cases
|
| 843 |
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
| 849 |
|
| 850 |
-
### π§ Chunking Methods Available
|
| 851 |
- **Fixed Size**: Consistent character-based chunks with word boundaries
|
| 852 |
-
- **Sentence-based**: Natural language flow with sentence grouping
|
| 853 |
- **Paragraph-based**: Document structure preservation
|
| 854 |
- **Recursive**: Hierarchical splitting with multiple separators
|
| 855 |
|
| 856 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
|
| 858 |
-
Select your
|
| 859 |
""")
|
| 860 |
|
| 861 |
-
#
|
| 862 |
-
st.subheader("
|
| 863 |
|
| 864 |
col1, col2, col3 = st.columns(3)
|
| 865 |
|
| 866 |
with col1:
|
| 867 |
st.markdown("""
|
| 868 |
-
**
|
| 869 |
-
-
|
| 870 |
-
-
|
| 871 |
-
-
|
| 872 |
-
-
|
| 873 |
""")
|
| 874 |
|
| 875 |
with col2:
|
| 876 |
st.markdown("""
|
| 877 |
-
**
|
| 878 |
-
-
|
| 879 |
-
-
|
| 880 |
-
-
|
| 881 |
-
-
|
| 882 |
""")
|
| 883 |
|
| 884 |
with col3:
|
| 885 |
st.markdown("""
|
| 886 |
-
**
|
| 887 |
-
-
|
| 888 |
-
-
|
| 889 |
-
-
|
| 890 |
-
-
|
| 891 |
""")
|
| 892 |
|
| 893 |
if __name__ == "__main__":
|
|
|
|
| 8 |
import io
|
| 9 |
import time
|
| 10 |
from typing import List, Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Safe model loading without cache permission issues
|
| 13 |
@st.cache_resource
|
| 14 |
def load_sentence_transformer():
|
| 15 |
+
st.info("β οΈ Semantic chunking disabled in this environment")
|
| 16 |
return None
|
| 17 |
|
| 18 |
@st.cache_resource
|
|
|
|
| 25 |
try:
|
| 26 |
nltk.download('punkt', quiet=True)
|
| 27 |
except:
|
| 28 |
+
pass
|
| 29 |
return nltk
|
| 30 |
except ImportError:
|
| 31 |
return None
|
| 32 |
|
| 33 |
+
class ChunkVisualizer:
|
| 34 |
def __init__(self):
|
| 35 |
self.colors = [
|
| 36 |
'#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
|
|
|
|
| 50 |
def extract_text_from_pdf(self, pdf_file):
|
| 51 |
"""Extract text from PDF file"""
|
| 52 |
try:
|
| 53 |
+
import PyPDF2
|
| 54 |
pdf_file.seek(0)
|
| 55 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 56 |
text = ""
|
|
|
|
| 66 |
st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")
|
| 67 |
|
| 68 |
if not text.strip():
|
| 69 |
+
st.warning("PDF appears to be image-based or empty.")
|
| 70 |
return "No extractable text found in PDF document."
|
| 71 |
|
| 72 |
return text.strip()
|
| 73 |
except Exception as e:
|
| 74 |
st.error(f"Error reading PDF: {str(e)}")
|
| 75 |
+
return ""
|
| 76 |
|
| 77 |
def extract_text_from_excel(self, excel_file):
|
| 78 |
"""Extract text from Excel file"""
|
| 79 |
try:
|
|
|
|
| 80 |
excel_file.seek(0)
|
| 81 |
|
|
|
|
| 82 |
try:
|
| 83 |
xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
|
| 84 |
except:
|
|
|
|
| 95 |
text += f"\n=== Sheet: {sheet_name} ===\n"
|
| 96 |
|
| 97 |
if not df.empty:
|
|
|
|
| 98 |
headers = " | ".join(str(col) for col in df.columns)
|
| 99 |
text += f"Headers: {headers}\n"
|
| 100 |
text += "-" * 50 + "\n"
|
| 101 |
|
| 102 |
+
max_rows = min(100, len(df))
|
|
|
|
| 103 |
for idx, row in df.head(max_rows).iterrows():
|
| 104 |
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
| 105 |
text += row_text + "\n"
|
|
|
|
| 108 |
text += f"... ({len(df) - max_rows} more rows)\n"
|
| 109 |
else:
|
| 110 |
text += "Empty sheet\n"
|
|
|
|
| 111 |
text += "\n"
|
| 112 |
|
| 113 |
return text.strip()
|
| 114 |
except Exception as e:
|
| 115 |
st.error(f"Error reading Excel file: {str(e)}")
|
| 116 |
+
return ""
|
| 117 |
|
| 118 |
def extract_text_from_csv(self, csv_file):
|
| 119 |
"""Extract text from CSV file"""
|
| 120 |
try:
|
|
|
|
| 121 |
csv_file.seek(0)
|
| 122 |
|
|
|
|
| 123 |
for encoding in ['utf-8', 'latin-1', 'cp1252']:
|
| 124 |
try:
|
| 125 |
csv_file.seek(0)
|
|
|
|
| 128 |
except UnicodeDecodeError:
|
| 129 |
continue
|
| 130 |
else:
|
| 131 |
+
df = pd.read_csv(csv_file)
|
| 132 |
|
| 133 |
if df.empty:
|
| 134 |
return "Empty CSV file"
|
| 135 |
|
| 136 |
st.write(f"π Processing CSV with {len(df)} rows and {len(df.columns)} columns...")
|
| 137 |
|
|
|
|
| 138 |
text = "=== CSV Data ===\n"
|
| 139 |
headers = " | ".join(str(col) for col in df.columns)
|
| 140 |
text += f"Headers: {headers}\n"
|
| 141 |
text += "-" * 50 + "\n"
|
| 142 |
|
|
|
|
| 143 |
max_rows = min(100, len(df))
|
| 144 |
for _, row in df.head(max_rows).iterrows():
|
| 145 |
row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
|
|
|
| 151 |
return text.strip()
|
| 152 |
except Exception as e:
|
| 153 |
st.error(f"Error reading CSV file: {str(e)}")
|
| 154 |
+
return ""
|
| 155 |
|
| 156 |
def extract_text_from_docx(self, docx_file):
|
| 157 |
"""Extract text from Word document"""
|
| 158 |
try:
|
| 159 |
+
from docx import Document
|
| 160 |
+
docx_file.seek(0)
|
| 161 |
doc = Document(docx_file)
|
| 162 |
text = ""
|
| 163 |
|
|
|
|
| 165 |
if paragraph.text.strip():
|
| 166 |
text += paragraph.text + "\n"
|
| 167 |
|
|
|
|
| 168 |
for table in doc.tables:
|
| 169 |
text += "\n=== Table ===\n"
|
| 170 |
for row in table.rows:
|
|
|
|
| 203 |
chunk = text[start:]
|
| 204 |
else:
|
| 205 |
chunk = text[start:end]
|
|
|
|
| 206 |
if not text[end].isspace():
|
| 207 |
last_space = chunk.rfind(' ')
|
| 208 |
if last_space > chunk_size * 0.7:
|
|
|
|
| 235 |
chunk_sentences = sentences[i:i + sentences_per_chunk]
|
| 236 |
chunk_text = ' '.join(chunk_sentences)
|
| 237 |
|
|
|
|
| 238 |
start_pos = text.find(chunk_sentences[0], current_pos)
|
| 239 |
if start_pos == -1:
|
| 240 |
start_pos = current_pos
|
|
|
|
| 283 |
|
| 284 |
return chunks
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
|
| 287 |
"""Hierarchical text splitting with multiple separators"""
|
| 288 |
separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]
|
|
|
|
| 349 |
|
| 350 |
return chunks
|
| 351 |
|
| 352 |
+
def calculate_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
|
| 353 |
"""Calculate comprehensive chunk metrics"""
|
| 354 |
if not chunks:
|
| 355 |
return {}
|
|
|
|
| 365 |
overlap_ratio = max(0, (total_chars - text_length) / text_length)
|
| 366 |
|
| 367 |
char_cv = np.std(char_counts) / np.mean(char_counts) if np.mean(char_counts) > 0 else 0
|
|
|
|
| 368 |
|
| 369 |
return {
|
| 370 |
'total_chunks': len(chunks),
|
|
|
|
| 375 |
'avg_words': np.mean(word_counts),
|
| 376 |
'std_words': np.std(word_counts),
|
| 377 |
'char_cv': char_cv,
|
|
|
|
| 378 |
'overlap_ratio': overlap_ratio,
|
| 379 |
'size_consistency': 1 - char_cv,
|
| 380 |
'total_coverage': sum(chunk['end'] - chunk['start'] for chunk in chunks)
|
| 381 |
}
|
| 382 |
|
| 383 |
+
def visualize_chunks(self, chunks: List[Dict]):
|
| 384 |
+
"""Display chunks with color coding"""
|
| 385 |
if not chunks:
|
| 386 |
st.write("No chunks to display")
|
| 387 |
return
|
| 388 |
|
| 389 |
+
st.markdown("### π¨ Chunk Visualization")
|
| 390 |
|
| 391 |
for i, chunk in enumerate(chunks):
|
| 392 |
color = self.colors[i % len(self.colors)]
|
| 393 |
|
|
|
|
|
|
|
| 394 |
st.markdown(f"""
|
| 395 |
<div style='background: linear-gradient(135deg, {color}15, {color}25);
|
| 396 |
border-left: 5px solid {color};
|
|
|
|
| 409 |
<div style='color: #333; line-height: 1.6; font-size: 14px;'>
|
| 410 |
{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}
|
| 411 |
</div>
|
|
|
|
|
|
|
|
|
|
| 412 |
</div>
|
| 413 |
""", unsafe_allow_html=True)
|
| 414 |
|
| 415 |
+
def create_comparison_charts(self, all_results: Dict[str, List[Dict]]):
|
| 416 |
"""Create detailed analysis charts"""
|
| 417 |
if not all_results:
|
| 418 |
return
|
|
|
|
| 421 |
size_data = []
|
| 422 |
|
| 423 |
for method, chunks in all_results.items():
|
| 424 |
+
metrics = self.calculate_metrics(chunks)
|
| 425 |
metrics_data.append({
|
| 426 |
'Method': method,
|
| 427 |
'Chunks': metrics.get('total_chunks', 0),
|
|
|
|
| 492 |
|
| 493 |
def main():
|
| 494 |
st.set_page_config(
|
| 495 |
+
page_title="RAG Chunk Visualizer",
|
| 496 |
page_icon="π",
|
| 497 |
layout="wide",
|
| 498 |
initial_sidebar_state="expanded"
|
| 499 |
)
|
| 500 |
|
| 501 |
+
# Header
|
| 502 |
col1, col2 = st.columns([3, 1])
|
| 503 |
with col1:
|
| 504 |
+
st.title("π RAG Chunk Visualizer")
|
| 505 |
+
st.markdown("**Professional chunking analysis for RAG systems**")
|
| 506 |
|
| 507 |
with col2:
|
| 508 |
if st.button("ℹ️ About", help="Learn about chunking strategies"):
|
|
|
|
| 512 |
**Sentence-based**: Groups sentences together for semantic coherence
|
| 513 |
**Paragraph-based**: Respects document structure and topic boundaries
|
| 514 |
**Recursive**: Hierarchical splitting using multiple separators
|
|
|
|
|
|
|
| 515 |
""")
|
| 516 |
|
| 517 |
+
visualizer = ChunkVisualizer()
|
| 518 |
|
| 519 |
+
# Sidebar for configuration
|
| 520 |
with st.sidebar:
|
| 521 |
st.header("⚙️ Configuration")
|
| 522 |
|
| 523 |
+
# Input method selection
|
| 524 |
input_method = st.radio(
|
| 525 |
"Choose input method:",
|
| 526 |
+
["π Upload File", "βοΈ Custom Input"],
|
| 527 |
help="Select how you want to provide text for analysis"
|
| 528 |
)
|
| 529 |
|
| 530 |
+
# File upload or text input
|
| 531 |
+
text = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
| 533 |
+
if input_method == "π Upload File":
|
| 534 |
+
st.markdown("**File Upload**")
|
|
|
|
|
|
|
| 535 |
|
|
|
|
| 536 |
uploaded_file = st.file_uploader(
|
| 537 |
+
"Choose a file",
|
| 538 |
type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
|
| 539 |
help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
|
| 540 |
)
|
| 541 |
|
| 542 |
+
if uploaded_file is not None:
|
| 543 |
+
st.success(f"π File loaded: **{uploaded_file.name}**")
|
| 544 |
+
|
| 545 |
+
# Show file info
|
| 546 |
+
with st.expander("File Details", expanded=False):
|
| 547 |
+
st.write(f"**Name:** {uploaded_file.name}")
|
| 548 |
+
st.write(f"**Size:** {len(uploaded_file.getvalue()):,} bytes")
|
| 549 |
+
st.write(f"**Type:** {uploaded_file.type}")
|
| 550 |
+
|
| 551 |
+
# Process the file
|
| 552 |
+
file_name = uploaded_file.name.lower()
|
| 553 |
|
| 554 |
with st.spinner(f"Processing {uploaded_file.name}..."):
|
| 555 |
+
try:
|
| 556 |
+
if file_name.endswith('.txt'):
|
| 557 |
+
uploaded_file.seek(0)
|
| 558 |
+
text = str(uploaded_file.read(), "utf-8")
|
| 559 |
+
|
| 560 |
+
elif file_name.endswith('.pdf'):
|
| 561 |
+
text = visualizer.extract_text_from_pdf(uploaded_file)
|
| 562 |
+
|
| 563 |
+
elif file_name.endswith('.csv'):
|
| 564 |
+
text = visualizer.extract_text_from_csv(uploaded_file)
|
| 565 |
+
|
| 566 |
+
elif file_name.endswith(('.xlsx', '.xls')):
|
| 567 |
+
text = visualizer.extract_text_from_excel(uploaded_file)
|
| 568 |
+
|
| 569 |
+
elif file_name.endswith('.docx'):
|
| 570 |
+
text = visualizer.extract_text_from_docx(uploaded_file)
|
| 571 |
+
|
| 572 |
+
else:
|
| 573 |
+
st.warning("Unsupported file type - trying as text...")
|
| 574 |
+
uploaded_file.seek(0)
|
| 575 |
+
text = str(uploaded_file.read(), "utf-8")
|
| 576 |
+
|
| 577 |
+
except Exception as e:
|
| 578 |
+
st.error(f"Error processing file: {str(e)}")
|
| 579 |
+
text = ""
|
| 580 |
|
| 581 |
+
# Show processing results
|
| 582 |
if text and len(text.strip()) > 0:
|
| 583 |
+
st.success(f"✅ Extracted {len(text):,} characters")
|
| 584 |
+
|
| 585 |
+
# Show preview
|
| 586 |
+
preview_text = text[:300] + "..." if len(text) > 300 else text
|
| 587 |
+
st.text_area(
|
| 588 |
+
"Content Preview:",
|
| 589 |
+
value=preview_text,
|
| 590 |
+
height=100,
|
| 591 |
+
disabled=True,
|
| 592 |
+
help="First 300 characters of extracted text"
|
| 593 |
+
)
|
| 594 |
else:
|
| 595 |
+
st.error("β No text could be extracted from the file")
|
|
|
|
| 596 |
else:
|
| 597 |
+
st.info("π Choose a file to upload")
|
| 598 |
+
|
| 599 |
+
else: # Custom Input
|
| 600 |
text = st.text_area(
|
| 601 |
"Enter your text:",
|
| 602 |
height=200,
|
| 603 |
+
placeholder="Paste or type your text here to analyze different chunking strategies...",
|
| 604 |
help="Paste or type the text you want to analyze"
|
| 605 |
)
|
| 606 |
|
| 607 |
+
# Only show chunking options if we have text
|
| 608 |
+
if text and len(text.strip()) > 0:
|
| 609 |
+
st.divider()
|
| 610 |
+
|
| 611 |
+
# Method selection
|
| 612 |
+
st.subheader("π§ Chunking Methods")
|
| 613 |
+
|
| 614 |
+
method_options = {
|
| 615 |
+
'Fixed Size': 'Character-based splitting with word boundaries',
|
| 616 |
+
'Sentence-based': 'Group by sentences for readability',
|
| 617 |
+
'Paragraph-based': 'Respect document structure',
|
| 618 |
+
'Recursive': 'Hierarchical splitting with multiple separators'
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
selected_methods = []
|
| 622 |
+
for method, description in method_options.items():
|
| 623 |
+
if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
|
| 624 |
+
selected_methods.append(method)
|
| 625 |
+
|
| 626 |
+
if not selected_methods:
|
| 627 |
+
st.warning("⚠️ Select at least one chunking method")
|
| 628 |
+
|
| 629 |
+
st.divider()
|
| 630 |
+
|
| 631 |
+
# Parameters
|
| 632 |
+
st.subheader("⚙️ Parameters")
|
| 633 |
+
|
| 634 |
+
params = {}
|
| 635 |
+
|
| 636 |
+
if 'Fixed Size' in selected_methods:
|
| 637 |
+
st.markdown("**Fixed Size Settings**")
|
| 638 |
+
params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
|
| 639 |
+
params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
|
| 640 |
+
|
| 641 |
+
if 'Sentence-based' in selected_methods:
|
| 642 |
+
st.markdown("**Sentence-based Settings**")
|
| 643 |
+
params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
|
| 644 |
+
|
| 645 |
+
if 'Recursive' in selected_methods:
|
| 646 |
+
st.markdown("**Recursive Settings**")
|
| 647 |
+
params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
|
| 648 |
+
else:
|
| 649 |
+
selected_methods = []
|
| 650 |
+
params = {}
|
| 651 |
|
| 652 |
+
# Main content area
|
| 653 |
+
if text and len(text.strip()) > 0 and selected_methods:
|
| 654 |
+
# Process text with selected methods
|
| 655 |
with st.spinner("Processing chunks..."):
|
| 656 |
all_results = {}
|
| 657 |
|
|
|
|
| 673 |
|
| 674 |
all_results[method] = chunks
|
| 675 |
|
| 676 |
+
st.success(f"✅ Processed {len(text):,} characters with {len(selected_methods)} methods")
|
| 677 |
|
| 678 |
+
# Display results in tabs
|
| 679 |
tabs = st.tabs([f"π {method}" for method in selected_methods] + ["π Comparison"])
|
| 680 |
|
| 681 |
+
# Individual method tabs
|
| 682 |
for i, (method, chunks) in enumerate(all_results.items()):
|
| 683 |
with tabs[i]:
|
| 684 |
+
metrics = visualizer.calculate_metrics(chunks)
|
| 685 |
|
| 686 |
+
# Metrics display
|
| 687 |
col1, col2, col3, col4, col5 = st.columns(5)
|
| 688 |
with col1:
|
| 689 |
st.metric("Total Chunks", metrics.get('total_chunks', 0))
|
|
|
|
| 697 |
overlap_pct = metrics.get('overlap_ratio', 0) * 100
|
| 698 |
st.metric("Overlap", f"{overlap_pct:.1f}%")
|
| 699 |
|
| 700 |
+
# Visualize chunks
|
| 701 |
+
visualizer.visualize_chunks(chunks)
|
| 702 |
|
| 703 |
+
# Size distribution chart
|
| 704 |
if len(chunks) > 1:
|
| 705 |
sizes = [chunk['char_count'] for chunk in chunks]
|
| 706 |
fig = px.histogram(
|
|
|
|
| 709 |
labels={'x': 'Characters', 'y': 'Count'}
|
| 710 |
)
|
| 711 |
fig.update_layout(height=300)
|
| 712 |
+
st.plotly_chart(fig, width='stretch')
|
| 713 |
|
| 714 |
+
# Comparison tab
|
| 715 |
with tabs[-1]:
|
| 716 |
st.header("π Comprehensive Analysis")
|
| 717 |
|
| 718 |
+
# Comparison charts
|
| 719 |
+
visualizer.create_comparison_charts(all_results)
|
| 720 |
|
| 721 |
+
# Metrics table
|
| 722 |
st.subheader("π Detailed Metrics Comparison")
|
| 723 |
|
| 724 |
comparison_data = []
|
| 725 |
for method, chunks in all_results.items():
|
| 726 |
+
metrics = visualizer.calculate_metrics(chunks)
|
| 727 |
comparison_data.append({
|
| 728 |
'Method': method,
|
| 729 |
'Chunks': metrics.get('total_chunks', 0),
|
|
|
|
| 734 |
})
|
| 735 |
|
| 736 |
df_comparison = pd.DataFrame(comparison_data)
|
| 737 |
+
st.dataframe(df_comparison, width='stretch')
|
| 738 |
|
| 739 |
+
# Recommendations
|
| 740 |
+
st.subheader("π‘ Recommendations")
|
| 741 |
|
| 742 |
best_consistency = max(all_results.keys(),
|
| 743 |
+
key=lambda m: visualizer.calculate_metrics(all_results[m]).get('size_consistency', 0))
|
| 744 |
|
| 745 |
optimal_size_method = min(all_results.keys(),
|
| 746 |
+
key=lambda m: abs(visualizer.calculate_metrics(all_results[m]).get('avg_chars', 1000) - 600))
|
| 747 |
|
| 748 |
col1, col2 = st.columns(2)
|
| 749 |
|
| 750 |
with col1:
|
| 751 |
st.success(f"π― **Most Consistent**: {best_consistency}")
|
| 752 |
+
consistency_score = visualizer.calculate_metrics(all_results[best_consistency]).get('size_consistency', 0)
|
| 753 |
st.write(f"Consistency score: {consistency_score:.3f}")
|
| 754 |
|
| 755 |
with col2:
|
| 756 |
st.info(f"βοΈ **Optimal Size**: {optimal_size_method}")
|
| 757 |
+
avg_size = visualizer.calculate_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
|
| 758 |
st.write(f"Average size: {avg_size:.0f} characters")
|
| 759 |
|
| 760 |
+
# Use case recommendations
|
| 761 |
st.markdown("### π‘ Use Case Recommendations")
|
| 762 |
|
| 763 |
recommendations = {
|
|
|
|
| 770 |
|
| 771 |
for use_case, recommendation in recommendations.items():
|
| 772 |
st.markdown(f"- {use_case}: {recommendation}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
else:
|
| 775 |
+
# Welcome screen when no text is provided
|
| 776 |
st.markdown("""
|
| 777 |
+
## π Welcome to the RAG Chunk Visualizer
|
| 778 |
|
| 779 |
This tool analyzes how different chunking strategies split your documents for RAG systems.
|
| 780 |
|
| 781 |
+
### π Getting Started
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
+
**Step 1:** Choose your input method in the sidebar:
|
| 784 |
+
- **π Upload File**: Support for PDF, Excel, CSV, Word, and text files
|
| 785 |
+
- **βοΈ Custom Input**: Paste or type your own text
|
|
|
|
|
|
|
|
|
|
| 786 |
|
| 787 |
+
**Step 2:** Select chunking methods to compare (2-3 recommended)
|
| 788 |
+
|
| 789 |
+
**Step 3:** Adjust parameters for each method
|
| 790 |
+
|
| 791 |
+
**Step 4:** Analyze results with comprehensive metrics and visualizations
|
| 792 |
+
|
| 793 |
+
### π§ Available Chunking Methods
|
| 794 |
|
|
|
|
| 795 |
- **Fixed Size**: Consistent character-based chunks with word boundaries
|
| 796 |
+
- **Sentence-based**: Natural language flow with sentence grouping
|
| 797 |
- **Paragraph-based**: Document structure preservation
|
| 798 |
- **Recursive**: Hierarchical splitting with multiple separators
|
| 799 |
|
| 800 |
+
### π― Key Features
|
| 801 |
+
|
| 802 |
+
- **Real-time comparison** of different chunking strategies
|
| 803 |
+
- **Advanced metrics** including consistency scores and overlap analysis
|
| 804 |
+
- **Interactive visualizations** with detailed chunk inspection
|
| 805 |
+
- **Professional recommendations** for different use cases
|
| 806 |
+
- **Multi-format support** for various document types
|
| 807 |
+
|
| 808 |
+
### π Supported File Formats
|
| 809 |
+
|
| 810 |
+
- **π PDF**: Research papers, reports, documentation
|
| 811 |
+
- **π Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
|
| 812 |
+
- **π CSV**: Data exports, logs, structured datasets
|
| 813 |
+
- **π Word (DOCX)**: Business documents, proposals, manuscripts
|
| 814 |
+
- **π Text (TXT)**: Plain text files, code, notes
|
| 815 |
+
|
| 816 |
+
---
|
| 817 |
|
| 818 |
+
**Ready to begin?** Select your input method in the sidebar! π
|
| 819 |
""")
|
| 820 |
|
| 821 |
+
# Show example use cases
|
| 822 |
+
st.subheader("π‘ Example Use Cases")
|
| 823 |
|
| 824 |
col1, col2, col3 = st.columns(3)
|
| 825 |
|
| 826 |
with col1:
|
| 827 |
st.markdown("""
|
| 828 |
+
**π RAG Optimization**
|
| 829 |
+
- Find optimal chunk sizes
|
| 830 |
+
- Minimize overlap issues
|
| 831 |
+
- Improve retrieval accuracy
|
| 832 |
+
- Balance context vs precision
|
| 833 |
""")
|
| 834 |
|
| 835 |
with col2:
|
| 836 |
st.markdown("""
|
| 837 |
+
**π Document Processing**
|
| 838 |
+
- Preserve document structure
|
| 839 |
+
- Handle different file formats
|
| 840 |
+
- Maintain readability
|
| 841 |
+
- Process large documents
|
| 842 |
""")
|
| 843 |
|
| 844 |
with col3:
|
| 845 |
st.markdown("""
|
| 846 |
+
**π€ LLM Integration**
|
| 847 |
+
- Manage token limits
|
| 848 |
+
- Optimize context windows
|
| 849 |
+
- Improve response quality
|
| 850 |
+
- Reduce processing costs
|
| 851 |
""")
|
| 852 |
|
| 853 |
if __name__ == "__main__":
|