LvMAC committed on
Commit
45ff068
Β·
verified Β·
1 Parent(s): 9bbe86e

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +885 -31
src/streamlit_app.py CHANGED
@@ -1,40 +1,894 @@
1
- import altair as alt
 
2
  import numpy as np
 
 
 
3
  import pandas as pd
4
- import streamlit as st
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
 
 
 
8
 
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import streamlit as st
2
+ import re
3
  import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
  import pandas as pd
8
+ import io
9
+ import time
10
+ from typing import List, Dict, Any
11
+ import PyPDF2
12
+ import openpyxl
13
+ from docx import Document
14
+ import csv
15
 
16
# Safe model loading without cache permission issues
@st.cache_resource
def load_sentence_transformer():
    """Placeholder loader: semantic embeddings are unavailable here.

    Informs the user once (result is cached by Streamlit) and returns
    None; callers must treat a None model as "semantic chunking disabled".
    """
    st.info("⚠️ Semantic chunking disabled in HuggingFace environment")
    return None
21
 
22
@st.cache_resource
def load_nltk():
    """Import NLTK and ensure the 'punkt' tokenizer data is available.

    Returns:
        The imported ``nltk`` module, or None when NLTK is not installed.

    A failed 'punkt' download is tolerated (offline / read-only cache
    environments): the module is still returned and callers fall back to
    regex sentence splitting when tokenization fails.
    """
    try:
        import nltk
    except ImportError:
        return None
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # BUG FIX: the original used a bare `except:` here, which would
        # also swallow KeyboardInterrupt/SystemExit.
        try:
            nltk.download('punkt', quiet=True)
        except Exception:
            pass  # Skip if download fails
    return nltk
36
 
37
class ProductionChunkVisualizer:
    """Document chunking toolkit: text extraction, chunking strategies,
    quality metrics and Streamlit/Plotly visualization."""

    def __init__(self):
        # Palette cycled through when rendering chunk cards.
        self.colors = [
            '#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
            '#FD79A8', '#A29BFE', '#6C5CE7', '#74B9FF', '#00B894',
        ]
        # Both stay None until initialize_models() is called.
        self.model = None
        self.nltk = None

    def initialize_models(self):
        """Lazy load models only when needed"""
        if self.model is None:
            self.model = load_sentence_transformer()
        if self.nltk is None:
            self.nltk = load_nltk()
53
+
54
def extract_text_from_pdf(self, pdf_file):
    """Extract text from a PDF upload, one labelled section per page.

    Returns the stripped text, a placeholder message for image-only PDFs,
    or an error string on failure (never raises).
    """
    try:
        # Rewind: the uploader may already have consumed the stream.
        pdf_file.seek(0)
        reader = PyPDF2.PdfReader(pdf_file)
        st.write(f"πŸ“„ Processing PDF with {len(reader.pages)} pages...")

        pieces = []
        for page_num, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text.strip():
                    pieces.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
            except Exception as e:
                st.warning(f"Could not extract text from page {page_num + 1}: {str(e)}")

        text = "".join(pieces)
        if not text.strip():
            # Common case: scanned/image-based PDF with no text layer.
            st.warning("PDF appears to be image-based or empty. No text extracted.")
            return "No extractable text found in PDF document."
        return text.strip()
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return f"PDF processing error: {str(e)}"
80
+
81
def extract_text_from_excel(self, excel_file):
    """Extract a readable text rendering of every sheet in an Excel workbook.

    Renders each sheet as pipe-delimited text, capped at 100 data rows per
    sheet. Tries the openpyxl engine first, then xlrd (legacy .xls), then
    pandas' default. Returns an error string (never raises) on failure.
    """
    try:
        # Reset file pointer to beginning
        excel_file.seek(0)

        # BUG FIX: rewind before each fallback attempt — a failed read
        # leaves the stream mid-file, which made the original fallbacks
        # fail spuriously. Also narrowed the original bare `except:`s.
        try:
            xl_data = pd.read_excel(excel_file, sheet_name=None, engine='openpyxl')
        except Exception:
            try:
                excel_file.seek(0)
                xl_data = pd.read_excel(excel_file, sheet_name=None, engine='xlrd')
            except Exception:
                excel_file.seek(0)
                xl_data = pd.read_excel(excel_file, sheet_name=None)

        text = ""
        sheet_count = len(xl_data)
        st.write(f"πŸ“Š Processing Excel file with {sheet_count} sheet(s)...")

        for sheet_name, df in xl_data.items():
            text += f"\n=== Sheet: {sheet_name} ===\n"

            if not df.empty:
                # Add column headers
                headers = " | ".join(str(col) for col in df.columns)
                text += f"Headers: {headers}\n"
                text += "-" * 50 + "\n"

                # Add data rows (limit to prevent massive output)
                max_rows = min(100, len(df))  # Limit to 100 rows per sheet
                for idx, row in df.head(max_rows).iterrows():
                    row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
                    text += row_text + "\n"

                if len(df) > max_rows:
                    text += f"... ({len(df) - max_rows} more rows)\n"
            else:
                text += "Empty sheet\n"

            text += "\n"

        return text.strip()
    except Exception as e:
        st.error(f"Error reading Excel file: {str(e)}")
        return f"Excel processing error: {str(e)}"
126
+
127
def extract_text_from_csv(self, csv_file):
    """Extract a readable text rendering of a CSV upload.

    Tries utf-8, latin-1 and cp1252 encodings in turn, then pandas'
    default. Output is capped at 100 rows. Returns an error string
    (never raises) on failure.
    """
    try:
        # Reset file pointer to beginning
        csv_file.seek(0)

        # Try different encodings
        for encoding in ['utf-8', 'latin-1', 'cp1252']:
            try:
                csv_file.seek(0)
                df = pd.read_csv(csv_file, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue
        else:
            # BUG FIX: rewind before the default-encoding fallback — the
            # failed attempts above left the stream mid-file.
            csv_file.seek(0)
            df = pd.read_csv(csv_file)  # Default encoding

        if df.empty:
            return "Empty CSV file"

        st.write(f"πŸ“‹ Processing CSV with {len(df)} rows and {len(df.columns)} columns...")

        # Create readable text format
        text = "=== CSV Data ===\n"
        headers = " | ".join(str(col) for col in df.columns)
        text += f"Headers: {headers}\n"
        text += "-" * 50 + "\n"

        # Limit rows to prevent massive output
        max_rows = min(100, len(df))
        for _, row in df.head(max_rows).iterrows():
            row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
            text += row_text + "\n"

        if len(df) > max_rows:
            text += f"... ({len(df) - max_rows} more rows)\n"

        return text.strip()
    except Exception as e:
        st.error(f"Error reading CSV file: {str(e)}")
        return f"CSV processing error: {str(e)}"
168
+
169
def extract_text_from_docx(self, docx_file):
    """Extract text from a Word (.docx) document.

    Collects non-empty paragraphs, then appends every table as
    pipe-delimited rows. Returns "" on failure (callers treat empty
    output as "extraction failed" and fall back to sample text).
    """
    try:
        doc = Document(docx_file)
        parts = []

        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                parts.append(paragraph.text + "\n")

        # Tables are not part of doc.paragraphs; extract them separately.
        for table in doc.tables:
            parts.append("\n=== Table ===\n")
            for row in table.rows:
                parts.append(" | ".join(cell.text.strip() for cell in row.cells) + "\n")
            parts.append("\n")

        return "".join(parts).strip()
    except Exception as e:
        st.error(f"Error reading Word document: {str(e)}")
        return ""
191
+
192
def simple_sentence_split(self, text: str) -> List[str]:
    """Fallback sentence splitter used when NLTK is unavailable.

    Splits after terminal punctuation (. ! ?) only when the next
    non-space character is uppercase, which avoids breaking on
    abbreviations followed by lowercase text.
    """
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [piece.strip() for piece in pieces if piece.strip()]
196
+
197
def robust_sentence_split(self, text: str) -> List[str]:
    """Tokenize sentences with NLTK if available, else the regex fallback.

    Any NLTK failure (e.g. missing 'punkt' data at runtime) falls through
    to ``simple_sentence_split`` so chunking never hard-fails on
    tokenization.
    """
    if self.nltk:
        try:
            return self.nltk.sent_tokenize(text)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            pass
    return self.simple_sentence_split(text)
205
+
206
def fixed_size_chunking(self, text: str, chunk_size: int, overlap_size: int = 0) -> List[Dict]:
    """Split text into fixed-size chunks, respecting word boundaries.

    Args:
        text: Source text to split.
        chunk_size: Target chunk length in characters.
        overlap_size: Characters of overlap between consecutive chunks.

    Returns:
        List of chunk dicts (text/start/end/method/word_count/char_count).
    """
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end >= len(text):
            chunk = text[start:]
        else:
            chunk = text[start:end]
            # Avoid cutting a word in half: back off to the last space,
            # but only when that keeps at least 70% of the target size.
            if not text[end].isspace():
                last_space = chunk.rfind(' ')
                if last_space > chunk_size * 0.7:
                    chunk = chunk[:last_space]
                    end = start + last_space

        if chunk.strip():
            chunks.append({
                'text': chunk.strip(),
                'start': start,
                'end': end if end < len(text) else len(text),
                'method': 'Fixed Size',
                'word_count': len(chunk.split()),
                'char_count': len(chunk.strip())
            })

        # BUG FIX: the original `start = end - overlap_size` loops forever
        # when overlap_size >= the effective chunk step (the UI permits
        # chunk_size=200 with overlap=300); force forward progress.
        start = max(end - overlap_size, start + 1)

    return chunks
240
+
241
def sentence_chunking(self, text: str, sentences_per_chunk: int = 3) -> List[Dict]:
    """Group consecutive sentences into chunks of ``sentences_per_chunk``.

    Positions are located by searching for each group's first sentence in
    the original text; when that fails (joining normalised whitespace) the
    running cursor is used as a best-effort start.
    """
    sentences = self.robust_sentence_split(text)
    chunks = []
    cursor = 0

    for offset in range(0, len(sentences), sentences_per_chunk):
        group = sentences[offset:offset + sentences_per_chunk]
        joined = ' '.join(group)

        begin = text.find(group[0], cursor)
        if begin == -1:
            begin = cursor
        finish = begin + len(joined)
        cursor = finish

        chunks.append({
            'text': joined,
            'start': begin,
            'end': min(finish, len(text)),
            'method': 'Sentence-based',
            'sentence_count': len(group),
            'word_count': len(joined.split()),
            'char_count': len(joined)
        })

    return chunks
270
+
271
def paragraph_chunking(self, text: str) -> List[Dict]:
    """Split text at blank-line boundaries, one chunk per paragraph.

    Empty paragraphs are dropped. Positions are located by searching for
    each paragraph in the original text from a running cursor.
    """
    chunks = []
    cursor = 0

    for raw in re.split(r'\n\s*\n', text):
        para = raw.strip()
        if not para:
            continue

        begin = text.find(para, cursor)
        if begin == -1:
            begin = cursor
        finish = begin + len(para)

        chunks.append({
            'text': para,
            'start': begin,
            'end': finish,
            'method': 'Paragraph-based',
            'paragraph_length': len(para),
            'word_count': len(para.split()),
            'char_count': len(para)
        })
        cursor = finish

    return chunks
299
+
300
def semantic_chunking(self, text: str, similarity_threshold: float = 0.5) -> List[Dict]:
    """Semantic chunking stub: delegates to sentence-based chunking.

    ``similarity_threshold`` is accepted for interface compatibility but
    unused while embedding models are unavailable in this environment.
    """
    st.warning("Semantic chunking unavailable in this environment. Using sentence-based fallback.")
    return self.sentence_chunking(text, 3)
304
+
305
def recursive_chunking(self, text: str, max_chunk_size: int = 1000) -> List[Dict]:
    """Hierarchically split text using progressively finer separators.

    Tries paragraph breaks first, then newlines, sentence punctuation,
    clause punctuation and finally spaces; segments that still exceed
    the limit with no separators left are hard-cut at ``max_chunk_size``.
    """
    separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " "]

    def _split(segment: str, seps: List[str], limit: int, depth: int = 0) -> List[str]:
        # Base case: small enough, or recursion has run out of levels.
        if len(segment) <= limit or depth > len(seps):
            return [segment]

        sep = seps[0] if seps else " "

        if sep not in segment:
            if len(seps) > 1:
                # Current separator absent: try the next finer one.
                return _split(segment, seps[1:], limit, depth + 1)
            # Nothing left to split on: hard-cut at the size limit.
            return [segment[i:i + limit] for i in range(0, len(segment), limit)]

        pieces = segment.split(sep)
        out = []
        buffer = ""

        for piece in pieces:
            candidate = buffer + piece + sep
            if len(candidate) <= limit:
                buffer = candidate
            else:
                if buffer:
                    out.append(buffer.rstrip(sep))
                if len(piece) > limit:
                    # Single piece too big: recurse with finer separators.
                    out.extend(_split(piece, seps[1:], limit, depth + 1))
                    buffer = ""
                else:
                    buffer = piece + sep

        if buffer:
            out.append(buffer.rstrip(sep))
        return out

    chunks = []
    cursor = 0

    for fragment in _split(text, separators, max_chunk_size):
        if not fragment.strip():
            continue

        begin = text.find(fragment, cursor)
        if begin == -1:
            begin = cursor
        finish = begin + len(fragment)

        chunks.append({
            'text': fragment,
            'start': begin,
            'end': finish,
            'method': 'Recursive',
            'max_size': max_chunk_size,
            'word_count': len(fragment.split()),
            'char_count': len(fragment)
        })
        cursor = finish

    return chunks
370
+
371
def calculate_advanced_metrics(self, chunks: List[Dict]) -> Dict[str, Any]:
    """Compute summary statistics for a list of chunk dicts.

    Returns {} for an empty list. 'size_consistency' is 1 minus the
    coefficient of variation of chunk sizes (1.0 = perfectly uniform);
    'overlap_ratio' estimates how much total chunk text exceeds the
    span of text actually covered.
    """
    if not chunks:
        return {}

    sizes = [c['char_count'] for c in chunks]
    words = [c['word_count'] for c in chunks]

    overlap_ratio = 0
    if len(chunks) > 1:
        span = max(c['end'] for c in chunks)
        if span > 0:
            overlap_ratio = max(0, (sum(sizes) - span) / span)

    mean_size = np.mean(sizes)
    mean_words = np.mean(words)
    char_cv = np.std(sizes) / mean_size if mean_size > 0 else 0
    word_cv = np.std(words) / mean_words if mean_words > 0 else 0

    return {
        'total_chunks': len(chunks),
        'avg_chars': mean_size,
        'std_chars': np.std(sizes),
        'min_chars': min(sizes),
        'max_chars': max(sizes),
        'avg_words': mean_words,
        'std_words': np.std(words),
        'char_cv': char_cv,
        'word_cv': word_cv,
        'overlap_ratio': overlap_ratio,
        'size_consistency': 1 - char_cv,
        'total_coverage': sum(c['end'] - c['start'] for c in chunks)
    }
403
+
404
def visualize_chunks_advanced(self, text: str, chunks: List[Dict]):
    """Render each chunk as a colored HTML card in the Streamlit page."""
    if not chunks:
        st.write("No chunks to display")
        return

    st.markdown("### 🎨 Interactive Chunk Visualization")

    for index, chunk in enumerate(chunks):
        # Cycle through the instance palette.
        color = self.colors[index % len(self.colors)]
        body = chunk['text']
        preview = body[:400] + ('...' if len(body) > 400 else '')
        # Rough readability indicator; 'sentence_count' only exists for
        # sentence-based chunks, so a missing key counts as one sentence.
        words_per_sentence = chunk['word_count'] / max(1, chunk.get('sentence_count', 1))

        st.markdown(f"""
<div style='background: linear-gradient(135deg, {color}15, {color}25);
            border-left: 5px solid {color};
            padding: 15px;
            margin: 10px 0;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
    <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;'>
        <div style='color: {color}; font-weight: bold; font-size: 14px;'>
            CHUNK {index + 1} β€’ Position {chunk['start']}-{chunk['end']}
        </div>
        <div style='color: #666; font-size: 12px;'>
            {chunk['char_count']} chars β€’ {chunk['word_count']} words
        </div>
    </div>
    <div style='color: #333; line-height: 1.6; font-size: 14px;'>
        {preview}
    </div>
    <div style='margin-top: 8px; color: #888; font-size: 11px;'>
        Quality: {words_per_sentence:.1f} words/sentence
    </div>
</div>
""", unsafe_allow_html=True)
440
+
441
def create_comprehensive_charts(self, all_results: Dict[str, List[Dict]]):
    """Create detailed analysis charts across all chunking methods.

    Builds a 2x2 Plotly dashboard: chunk counts, size-consistency scores,
    per-method size box plots, and a words-vs-characters scatter.
    No-op when ``all_results`` is empty.
    """
    if not all_results:
        return

    metrics_data = []
    size_data = []

    for method, chunks in all_results.items():
        metrics = self.calculate_advanced_metrics(chunks)
        metrics_data.append({
            'Method': method,
            'Chunks': metrics.get('total_chunks', 0),
            'Avg Size': metrics.get('avg_chars', 0),
            'Consistency': metrics.get('size_consistency', 0),
            'Overlap': metrics.get('overlap_ratio', 0)
        })

        for chunk in chunks:
            size_data.append({
                'Method': method,
                'Size': chunk['char_count'],
                'Words': chunk['word_count']
            })

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Chunk Count Comparison',
            'Size Consistency',
            'Size Distribution by Method',
            'Words vs Characters'
        ),
        specs=[
            [{"type": "bar"}, {"type": "bar"}],
            [{"type": "box"}, {"type": "scatter"}]
        ]
    )

    df_metrics = pd.DataFrame(metrics_data)
    df_sizes = pd.DataFrame(size_data)

    # Chart 1: Chunk counts
    fig.add_trace(
        go.Bar(x=df_metrics['Method'], y=df_metrics['Chunks'],
               name='Chunk Count', marker_color='lightblue'),
        row=1, col=1
    )

    # Chart 2: Consistency scores
    fig.add_trace(
        go.Bar(x=df_metrics['Method'], y=df_metrics['Consistency'],
               name='Consistency', marker_color='lightgreen'),
        row=1, col=2
    )

    # Chart 3: Size distribution box plots
    for method in df_sizes['Method'].unique():
        method_data = df_sizes[df_sizes['Method'] == method]
        fig.add_trace(
            go.Box(y=method_data['Size'], name=method, boxpoints='outliers'),
            row=2, col=1
        )

    # Chart 4: Words vs Characters scatter
    for method in df_sizes['Method'].unique():
        method_data = df_sizes[df_sizes['Method'] == method]
        fig.add_trace(
            go.Scatter(x=method_data['Words'], y=method_data['Size'],
                       mode='markers', name=method, opacity=0.7),
            row=2, col=2
        )

    fig.update_layout(height=800, showlegend=True)
    fig.update_xaxes(tickangle=45)

    # CONSISTENCY FIX: the other chart calls in this file use
    # use_container_width=True; the original width='stretch' keyword
    # requires a newer Streamlit API and fails on older versions.
    st.plotly_chart(fig, use_container_width=True)
518
 
519
+ def main():
520
+ st.set_page_config(
521
+ page_title="Multi-Format RAG Chunk Visualizer",
522
+ page_icon="πŸ”",
523
+ layout="wide",
524
+ initial_sidebar_state="expanded"
525
+ )
526
+
527
+ col1, col2 = st.columns([3, 1])
528
+ with col1:
529
+ st.title("πŸ” Multi-Format RAG Chunk Visualizer")
530
+ st.markdown("**Professional chunking analysis with support for PDF, Excel, CSV, Word & Text files**")
531
+
532
+ with col2:
533
+ if st.button("ℹ️ About", help="Learn about chunking strategies"):
534
+ with st.expander("Chunking Methods Explained", expanded=True):
535
+ st.markdown("""
536
+ **Fixed Size**: Splits text at character boundaries with word respect
537
+ **Sentence-based**: Groups sentences together for semantic coherence
538
+ **Paragraph-based**: Respects document structure and topic boundaries
539
+ **Recursive**: Hierarchical splitting using multiple separators
540
+
541
+ *Note: Semantic chunking disabled in this environment*
542
+ """)
543
+
544
+ visualizer = ProductionChunkVisualizer()
545
+
546
+ with st.sidebar:
547
+ st.header("βš™οΈ Configuration")
548
+
549
+ input_method = st.radio(
550
+ "Choose input method:",
551
+ ["πŸ“ Sample Text", "πŸ“ Upload File", "✏️ Custom Input"],
552
+ help="Select how you want to provide text for analysis"
553
+ )
554
+
555
+ sample_texts = {
556
+ "Research Paper Abstract": """Machine learning has fundamentally transformed the landscape of artificial intelligence research. Recent advances in deep learning architectures, particularly transformer-based models, have demonstrated unprecedented capabilities in natural language understanding and generation. These models leverage attention mechanisms to capture long-range dependencies in sequential data, enabling more sophisticated reasoning and contextual understanding. The implications extend beyond traditional NLP tasks to multimodal applications, including vision-language models and cross-modal reasoning systems. However, significant challenges remain in terms of computational efficiency, interpretability, and robustness to adversarial inputs.""",
557
+
558
+ "Technical Documentation": """Installation Prerequisites: Before beginning the installation process, ensure your system meets the following requirements. Python 3.8 or higher must be installed with pip package manager available. Node.js version 16.x or later is required for frontend dependencies. Git version control system should be accessible from command line.\n\nStep 1: Repository Setup\nClone the project repository using the following command: git clone https://github.com/company/rag-system.git. Navigate to the project directory and create a virtual environment: python -m venv rag-env. Activate the virtual environment using the appropriate command for your operating system.\n\nStep 2: Dependency Installation\nInstall Python dependencies by running pip install -r requirements.txt. This will install all necessary packages including transformers, sentence-transformers, and streamlit. For development dependencies, additionally run pip install -r requirements-dev.txt.""",
559
+
560
+ "Business Report": """Executive Summary: Q4 2024 Performance Analysis\n\nOur organization achieved exceptional growth in the fourth quarter of 2024, with revenue increasing by 42% year-over-year to reach $3.8 million. This growth was primarily driven by our expanded product portfolio and successful market penetration strategies in the enterprise segment.\n\nKey Performance Indicators demonstrate strong momentum across all business units. Customer acquisition costs decreased by 18% while customer lifetime value increased by 35%, indicating improved operational efficiency and customer satisfaction. Our newly launched AI-powered features contributed significantly to user engagement, with daily active users increasing by 67%.\n\nStrategic Initiatives for 2025 focus on international expansion and technology innovation. We plan to establish operations in three new markets: Germany, Japan, and Australia. Additionally, our R&D investment will increase by 50% to accelerate development of next-generation AI capabilities."""
561
+ }
562
+
563
+ if input_method == "πŸ“ Sample Text":
564
+ selected_sample = st.selectbox("Select sample text:", list(sample_texts.keys()))
565
+ text = sample_texts[selected_sample]
566
+ st.text_area("Preview:", value=text[:200] + "...", height=100, disabled=True)
567
+
568
+ elif input_method == "πŸ“ Upload File":
569
+ uploaded_file = st.file_uploader(
570
+ "Upload document",
571
+ type=['txt', 'pdf', 'csv', 'xlsx', 'xls', 'docx'],
572
+ help="Supports: TXT, PDF, CSV, Excel (XLSX/XLS), Word (DOCX)"
573
+ )
574
+
575
+ if uploaded_file:
576
+ file_type = uploaded_file.type
577
+
578
+ with st.spinner(f"Processing {uploaded_file.name}..."):
579
+ if file_type == "text/plain":
580
+ text = str(uploaded_file.read(), "utf-8")
581
+ elif file_type == "application/pdf":
582
+ text = visualizer.extract_text_from_pdf(uploaded_file)
583
+ elif file_type == "text/csv":
584
+ text = visualizer.extract_text_from_csv(uploaded_file)
585
+ elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
586
+ "application/vnd.ms-excel"]:
587
+ text = visualizer.extract_text_from_excel(uploaded_file)
588
+ elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
589
+ text = visualizer.extract_text_from_docx(uploaded_file)
590
+ else:
591
+ st.error(f"Unsupported file type: {file_type}")
592
+ text = sample_texts["Research Paper Abstract"]
593
+
594
+ if text and len(text.strip()) > 0:
595
+ st.success(f"βœ… Extracted {len(text)} characters from {uploaded_file.name}")
596
+ if len(text) > 1000:
597
+ st.text_area("Preview:", value=text[:500] + "...", height=100, disabled=True)
598
+ else:
599
+ st.error("No text could be extracted from the file")
600
+ text = sample_texts["Research Paper Abstract"]
601
+ else:
602
+ text = sample_texts["Research Paper Abstract"]
603
+ st.info("Using sample text until file is uploaded")
604
+ else:
605
+ text = st.text_area(
606
+ "Enter your text:",
607
+ height=200,
608
+ value=sample_texts["Business Document"],
609
+ help="Paste or type the text you want to analyze"
610
+ )
611
+
612
+ st.divider()
613
+
614
+ st.subheader("πŸ”§ Chunking Methods")
615
+
616
+ method_options = {
617
+ 'Fixed Size': 'Character-based splitting with word boundaries',
618
+ 'Sentence-based': 'Group by sentences for readability',
619
+ 'Paragraph-based': 'Respect document structure',
620
+ 'Recursive': 'Hierarchical splitting with multiple separators'
621
+ }
622
+
623
+ selected_methods = []
624
+ for method, description in method_options.items():
625
+ if st.checkbox(method, value=method in ['Fixed Size', 'Sentence-based'], help=description):
626
+ selected_methods.append(method)
627
+
628
+ if not selected_methods:
629
+ st.warning("⚠️ Select at least one chunking method")
630
+
631
+ st.divider()
632
+
633
+ st.subheader("βš™οΈ Parameters")
634
+
635
+ params = {}
636
+
637
+ if 'Fixed Size' in selected_methods:
638
+ st.markdown("**Fixed Size Settings**")
639
+ params['chunk_size'] = st.slider("Chunk size (characters)", 200, 2000, 800, step=50)
640
+ params['overlap'] = st.slider("Overlap (characters)", 0, 300, 100, step=25)
641
+
642
+ if 'Sentence-based' in selected_methods:
643
+ st.markdown("**Sentence-based Settings**")
644
+ params['sentences_per_chunk'] = st.slider("Sentences per chunk", 1, 10, 4)
645
+
646
+ if 'Recursive' in selected_methods:
647
+ st.markdown("**Recursive Settings**")
648
+ params['max_recursive_size'] = st.slider("Max chunk size", 500, 2000, 1200, step=100)
649
+
650
+ with st.expander("πŸ”¬ Advanced Options"):
651
+ show_overlap_analysis = st.checkbox("Show overlap analysis", value=True)
652
+ show_quality_metrics = st.checkbox("Show quality metrics", value=True)
653
+ export_results = st.checkbox("Enable result export", value=False)
654
+
655
+ if text and selected_methods:
656
+ with st.spinner("Processing chunks..."):
657
+ all_results = {}
658
+
659
+ for method in selected_methods:
660
+ if method == 'Fixed Size':
661
+ chunks = visualizer.fixed_size_chunking(
662
+ text, params.get('chunk_size', 800), params.get('overlap', 100)
663
+ )
664
+ elif method == 'Sentence-based':
665
+ chunks = visualizer.sentence_chunking(
666
+ text, params.get('sentences_per_chunk', 4)
667
+ )
668
+ elif method == 'Paragraph-based':
669
+ chunks = visualizer.paragraph_chunking(text)
670
+ elif method == 'Recursive':
671
+ chunks = visualizer.recursive_chunking(
672
+ text, params.get('max_recursive_size', 1200)
673
+ )
674
+
675
+ all_results[method] = chunks
676
+
677
+ st.success(f"βœ… Processed {len(text)} characters with {len(selected_methods)} methods")
678
+
679
+ tabs = st.tabs([f"πŸ“Š {method}" for method in selected_methods] + ["πŸ“ˆ Comparison"])
680
+
681
+ for i, (method, chunks) in enumerate(all_results.items()):
682
+ with tabs[i]:
683
+ metrics = visualizer.calculate_advanced_metrics(chunks)
684
+
685
+ col1, col2, col3, col4, col5 = st.columns(5)
686
+ with col1:
687
+ st.metric("Total Chunks", metrics.get('total_chunks', 0))
688
+ with col2:
689
+ st.metric("Avg Characters", f"{metrics.get('avg_chars', 0):.0f}")
690
+ with col3:
691
+ st.metric("Avg Words", f"{metrics.get('avg_words', 0):.0f}")
692
+ with col4:
693
+ st.metric("Consistency", f"{metrics.get('size_consistency', 0):.2f}")
694
+ with col5:
695
+ overlap_pct = metrics.get('overlap_ratio', 0) * 100
696
+ st.metric("Overlap", f"{overlap_pct:.1f}%")
697
+
698
+ visualizer.visualize_chunks_advanced(text, chunks)
699
+
700
+ if len(chunks) > 1:
701
+ sizes = [chunk['char_count'] for chunk in chunks]
702
+ fig = px.histogram(
703
+ x=sizes, nbins=min(20, len(chunks)),
704
+ title=f"{method} - Chunk Size Distribution",
705
+ labels={'x': 'Characters', 'y': 'Count'}
706
+ )
707
+ fig.update_layout(height=300)
708
+ st.plotly_chart(fig, use_container_width=True)
709
+
710
+ with tabs[-1]:
711
+ st.header("πŸ“ˆ Comprehensive Analysis")
712
+
713
+ visualizer.create_comprehensive_charts(all_results)
714
+
715
+ st.subheader("πŸ“Š Detailed Metrics Comparison")
716
+
717
+ comparison_data = []
718
+ for method, chunks in all_results.items():
719
+ metrics = visualizer.calculate_advanced_metrics(chunks)
720
+ comparison_data.append({
721
+ 'Method': method,
722
+ 'Chunks': metrics.get('total_chunks', 0),
723
+ 'Avg Size': f"{metrics.get('avg_chars', 0):.0f}",
724
+ 'Size StdDev': f"{metrics.get('std_chars', 0):.0f}",
725
+ 'Consistency': f"{metrics.get('size_consistency', 0):.3f}",
726
+ 'Overlap %': f"{metrics.get('overlap_ratio', 0)*100:.1f}%"
727
+ })
728
+
729
+ df_comparison = pd.DataFrame(comparison_data)
730
+ st.dataframe(df_comparison, use_container_width=True)
731
+
732
+ st.subheader("πŸ€– Intelligent Recommendations")
733
+
734
+ best_consistency = max(all_results.keys(),
735
+ key=lambda m: visualizer.calculate_advanced_metrics(all_results[m]).get('size_consistency', 0))
736
+
737
+ optimal_size_method = min(all_results.keys(),
738
+ key=lambda m: abs(visualizer.calculate_advanced_metrics(all_results[m]).get('avg_chars', 1000) - 600))
739
+
740
+ col1, col2 = st.columns(2)
741
+
742
+ with col1:
743
+ st.success(f"🎯 **Most Consistent**: {best_consistency}")
744
+ consistency_score = visualizer.calculate_advanced_metrics(all_results[best_consistency]).get('size_consistency', 0)
745
+ st.write(f"Consistency score: {consistency_score:.3f}")
746
+
747
+ with col2:
748
+ st.info(f"βš–οΈ **Optimal Size**: {optimal_size_method}")
749
+ avg_size = visualizer.calculate_advanced_metrics(all_results[optimal_size_method]).get('avg_chars', 0)
750
+ st.write(f"Average size: {avg_size:.0f} characters")
751
+
752
+ st.markdown("### πŸ’‘ Use Case Recommendations")
753
+
754
+ recommendations = {
755
+ "πŸ” **Search & Retrieval**": "Use Fixed Size (600-800 chars) for consistent embedding",
756
+ "πŸ“š **Document Processing**": "Use Paragraph-based to preserve structure",
757
+ "πŸ€– **LLM Input**": "Use Fixed Size (800-1200 chars) for token management",
758
+ "πŸ“– **Reading Comprehension**": "Use Sentence-based for natural flow",
759
+ "πŸ”„ **Data Pipeline**": "Use Recursive for robust processing"
760
+ }
761
+
762
+ for use_case, recommendation in recommendations.items():
763
+ st.markdown(f"- {use_case}: {recommendation}")
764
+
765
+ if export_results:
766
+ st.subheader("πŸ“€ Export Results")
767
+
768
+ report_data = {
769
+ 'text_length': len(text),
770
+ 'methods_used': list(all_results.keys()),
771
+ 'parameters': params,
772
+ 'results': {}
773
+ }
774
+
775
+ for method, chunks in all_results.items():
776
+ metrics = visualizer.calculate_advanced_metrics(chunks)
777
+ report_data['results'][method] = {
778
+ 'chunks': len(chunks),
779
+ 'metrics': metrics,
780
+ 'chunk_details': chunks
781
+ }
782
+
783
+ import json
784
+ report_json = json.dumps(report_data, indent=2, default=str)
785
+
786
+ col1, col2 = st.columns(2)
787
+
788
+ with col1:
789
+ st.download_button(
790
+ "πŸ“‹ Download Analysis Report (JSON)",
791
+ data=report_json,
792
+ file_name=f"chunk_analysis_{len(text)}_chars.json",
793
+ mime="application/json"
794
+ )
795
+
796
+ with col2:
797
+ markdown_report = f"""# Multi-Format Chunk Analysis Report
798
+
799
+ ## Text Analysis
800
+ - **Length**: {len(text):,} characters
801
+ - **Methods**: {', '.join(all_results.keys())}
802
+ - **Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
803
 
804
+ ## Results Summary
805
+ """
806
+
807
+ for method, chunks in all_results.items():
808
+ metrics = visualizer.calculate_advanced_metrics(chunks)
809
+ markdown_report += f"""
810
+ ### {method} Method
811
+ - **Chunks**: {metrics.get('total_chunks', 0)}
812
+ - **Average Size**: {metrics.get('avg_chars', 0):.0f} characters
813
+ - **Consistency**: {metrics.get('size_consistency', 0):.3f}
814
+ - **Overlap**: {metrics.get('overlap_ratio', 0)*100:.1f}%
815
+ """
816
+
817
+ st.download_button(
818
+ "πŸ“„ Download Summary (Markdown)",
819
+ data=markdown_report,
820
+ file_name=f"chunk_summary_{len(text)}_chars.md",
821
+ mime="text/markdown"
822
+ )
823
 
824
+ else:
825
+ st.markdown("""
826
+ ## πŸ‘‹ Welcome to the Multi-Format RAG Chunk Visualizer
827
+
828
+ This tool analyzes how different chunking strategies split your documents for RAG systems.
829
+
830
+ ### πŸš€ Supported File Formats
831
+ - **πŸ“„ PDF**: Research papers, reports, documentation
832
+ - **πŸ“Š Excel (XLSX/XLS)**: Spreadsheets, data tables, financial reports
833
+ - **πŸ“‹ CSV**: Data exports, logs, structured datasets
834
+ - **πŸ“ Word (DOCX)**: Business documents, proposals, manuscripts
835
+ - **πŸ“œ Text (TXT)**: Plain text files, code, notes
836
+
837
+ ### 🎯 Key Features
838
+ - **4 chunking strategies** with real-time comparison
839
+ - **Advanced metrics** including consistency and overlap analysis
840
+ - **Interactive visualizations** with detailed chunk inspection
841
+ - **Export capabilities** for team collaboration
842
+ - **Professional recommendations** for different use cases
843
+
844
+ ### πŸ’‘ Quick Start
845
+ 1. **Upload your file** or use sample text
846
+ 2. **Select chunking methods** to compare (2-3 recommended)
847
+ 3. **Adjust parameters** for each method
848
+ 4. **Analyze results** with comprehensive metrics
849
+
850
+ ### πŸ”§ Chunking Methods Available
851
+ - **Fixed Size**: Consistent character-based chunks with word boundaries
852
+ - **Sentence-based**: Natural language flow with sentence grouping
853
+ - **Paragraph-based**: Document structure preservation
854
+ - **Recursive**: Hierarchical splitting with multiple separators
855
+
856
+ **Note**: Semantic chunking temporarily disabled in this environment
857
+
858
+ Select your settings in the sidebar to begin analysis! πŸ‘ˆ
859
+ """)
860
+
861
+ # Sample file format examples
862
+ st.subheader("πŸ“ Example Use Cases")
863
+
864
+ col1, col2, col3 = st.columns(3)
865
+
866
+ with col1:
867
+ st.markdown("""
868
+ **πŸ“„ PDF Files**
869
+ - Research papers
870
+ - Technical manuals
871
+ - Legal documents
872
+ - Reports and presentations
873
+ """)
874
+
875
+ with col2:
876
+ st.markdown("""
877
+ **πŸ“Š Excel/CSV Files**
878
+ - Data tables
879
+ - Survey results
880
+ - Financial reports
881
+ - Product catalogs
882
+ """)
883
+
884
+ with col3:
885
+ st.markdown("""
886
+ **πŸ“ Text/Word Files**
887
+ - Articles and blogs
888
+ - Meeting notes
889
+ - Technical documentation
890
+ - Business proposals
891
+ """)
892
 
893
# Script entry point: launch the Streamlit app only when this file is
# executed directly (not when imported as a module).
if __name__ == "__main__":
    main()