Ara Yeroyan committed on
Commit
763a8b9
·
1 Parent(s): 5262a14

add Retrieval Distribution stats

Browse files
Files changed (1) hide show
  1. app.py +126 -34
app.py CHANGED
@@ -213,6 +213,34 @@ st.markdown("""
213
  margin: 10px 0;
214
  border-left: 4px solid #007bff;
215
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  </style>
217
  """, unsafe_allow_html=True)
218
 
@@ -284,6 +312,7 @@ def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
284
  sources_list = []
285
  years = []
286
  filenames = []
 
287
 
288
  for doc in sources:
289
  metadata = getattr(doc, 'metadata', {})
@@ -310,45 +339,67 @@ def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
310
  # Extract filename
311
  filename = metadata.get('filename', 'Unknown')
312
  filenames.append(filename)
 
 
 
 
 
 
 
313
 
314
  # Count occurrences
315
  source_counts = Counter(sources_list)
316
  year_counts = Counter(years)
317
  filename_counts = Counter(filenames)
 
318
 
319
  return {
320
  'total_chunks': len(sources),
321
  'unique_sources': len(source_counts),
322
  'unique_years': len([y for y in year_counts.keys() if y != 'Unknown']),
323
  'unique_filenames': len(filename_counts),
 
324
  'source_distribution': dict(source_counts),
325
  'year_distribution': dict(year_counts),
326
  'filename_distribution': dict(filename_counts),
 
327
  'sources': sources_list,
328
  'years': years,
329
- 'filenames': filenames
 
330
  }
331
 
332
- def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieved Chunks Statistics"):
333
  """Display statistics as interactive charts for 10+ results."""
334
  if not stats or stats.get('total_chunks', 0) == 0:
335
  return
336
 
337
- st.subheader(f"📊 {title}")
338
-
339
- # Summary metrics
340
- col1, col2, col3, col4 = st.columns(4)
341
- with col1:
342
- st.metric("Total Chunks", stats['total_chunks'])
343
- with col2:
344
- st.metric("Unique Sources", stats['unique_sources'])
345
- with col3:
346
- st.metric("Unique Years", stats['unique_years'])
347
- with col4:
348
- st.metric("Unique Files", stats['unique_filenames'])
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- # Charts side by side
351
- col1, col2 = st.columns(2)
352
 
353
  with col1:
354
  # Source distribution chart
@@ -397,12 +448,43 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
397
  st.plotly_chart(fig_year, use_container_width=True)
398
  else:
399
  st.info("No valid years found in the results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
- def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieved Chunks Statistics"):
402
  """Display statistics as tables for smaller results with fixed alignment."""
403
  if not stats or stats.get('total_chunks', 0) == 0:
404
  return
405
 
 
 
 
406
  st.subheader(f"📊 {title}")
407
 
408
  # Create a container with fixed height for alignment
@@ -413,18 +495,20 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieve
413
  col1, col2, col3, col4 = st.columns(4)
414
 
415
  with col1:
416
- st.markdown("**📈 Summary**")
417
- summary_data = {
418
- "Metric": ["Total", "Sources", "Years", "Files"],
419
- "Count": [
420
- stats['total_chunks'],
421
- stats['unique_sources'],
422
- stats['unique_years'],
423
- stats['unique_filenames']
424
- ]
425
- }
426
- summary_df = pd.DataFrame(summary_data)
427
- st.dataframe(summary_df, hide_index=True, use_container_width=True)
 
 
428
 
429
  with col2:
430
  st.markdown("**📂 Sources**")
@@ -472,6 +556,9 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieve
472
  st.dataframe(file_df, hide_index=True, use_container_width=True)
473
  else:
474
  st.write("No file data")
 
 
 
475
 
476
  @st.cache_data
477
  def load_filter_options():
@@ -545,11 +632,13 @@ def main():
545
  help="Choose specific reports to search. When enabled, all other filters are ignored."
546
  )
547
  st.markdown('</div>', unsafe_allow_html=True)
 
 
548
 
549
  # Determine if filename filter is active
550
  filename_mode = len(selected_filenames) > 0
551
  # Sources filter
552
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
553
  st.markdown('<div class="filter-title">📊 Sources</div>', unsafe_allow_html=True)
554
  selected_sources = st.multiselect(
555
  "Select sources:",
@@ -562,7 +651,7 @@ def main():
562
  st.markdown('</div>', unsafe_allow_html=True)
563
 
564
  # Years filter
565
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
566
  st.markdown('<div class="filter-title">📅 Years</div>', unsafe_allow_html=True)
567
  selected_years = st.multiselect(
568
  "Select years:",
@@ -575,7 +664,7 @@ def main():
575
  st.markdown('</div>', unsafe_allow_html=True)
576
 
577
  # Districts filter
578
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
579
  st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
580
  selected_districts = st.multiselect(
581
  "Select districts:",
@@ -820,9 +909,12 @@ def main():
820
 
821
  # Show charts for 10+ results, tables for fewer
822
  if len(sources) >= 10:
823
- display_chunk_statistics_charts(stats, "Retrieved Documents Statistics")
 
 
 
824
  else:
825
- display_chunk_statistics_table(stats, "Retrieved Documents Statistics")
826
 
827
  st.markdown("---")
828
  st.markdown("### 📄 Document Details")
 
213
  margin: 10px 0;
214
  border-left: 4px solid #007bff;
215
  }
216
+
217
+ .retrieval-distribution-container {
218
+ background-color: #ffffff;
219
+ padding: 25px;
220
+ border-radius: 10px;
221
+ margin: 20px 0;
222
+ border: 2px solid #e0e0e0;
223
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1), 0 2px 4px rgba(0, 0, 0, 0.06);
224
+ }
225
+
226
+ .metric-label {
227
+ font-size: 0.9rem;
228
+ color: #555;
229
+ margin-bottom: 5px;
230
+ text-align: center;
231
+ }
232
+
233
+ .metric-value {
234
+ font-size: 1.8rem;
235
+ font-weight: bold;
236
+ color: #000000;
237
+ text-align: center;
238
+ }
239
+
240
+ .metric-container {
241
+ text-align: center;
242
+ padding: 10px;
243
+ }
244
  </style>
245
  """, unsafe_allow_html=True)
246
 
 
312
  sources_list = []
313
  years = []
314
  filenames = []
315
+ districts = []
316
 
317
  for doc in sources:
318
  metadata = getattr(doc, 'metadata', {})
 
339
  # Extract filename
340
  filename = metadata.get('filename', 'Unknown')
341
  filenames.append(filename)
342
+
343
+ # Extract district
344
+ district = metadata.get('district', 'Unknown')
345
+ if district and district != 'Unknown':
346
+ districts.append(district)
347
+ else:
348
+ districts.append('Unknown')
349
 
350
  # Count occurrences
351
  source_counts = Counter(sources_list)
352
  year_counts = Counter(years)
353
  filename_counts = Counter(filenames)
354
+ district_counts = Counter(districts)
355
 
356
  return {
357
  'total_chunks': len(sources),
358
  'unique_sources': len(source_counts),
359
  'unique_years': len([y for y in year_counts.keys() if y != 'Unknown']),
360
  'unique_filenames': len(filename_counts),
361
+ 'unique_districts': len([d for d in district_counts.keys() if d != 'Unknown']),
362
  'source_distribution': dict(source_counts),
363
  'year_distribution': dict(year_counts),
364
  'filename_distribution': dict(filename_counts),
365
+ 'district_distribution': dict(district_counts),
366
  'sources': sources_list,
367
  'years': years,
368
+ 'filenames': filenames,
369
+ 'districts': districts
370
  }
371
 
372
+ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieval Statistics"):
373
  """Display statistics as interactive charts for 10+ results."""
374
  if not stats or stats.get('total_chunks', 0) == 0:
375
  return
376
 
377
+ # Wrap everything in one styled container - open it
378
+ st.markdown(f"""
379
+ <div class="retrieval-distribution-container">
380
+ <h3 style="margin-top: 0;">📊 {title}</h3>
381
+ <div style="display: flex; justify-content: space-around; align-items: center; padding: 15px 0; border-bottom: 1px solid #e0e0e0; margin-bottom: 20px;">
382
+ <div class="metric-container">
383
+ <div class="metric-label">Total Chunks</div>
384
+ <div class="metric-value">{stats['total_chunks']}</div>
385
+ </div>
386
+ <div class="metric-container">
387
+ <div class="metric-label">Unique Sources</div>
388
+ <div class="metric-value">{stats['unique_sources']}</div>
389
+ </div>
390
+ <div class="metric-container">
391
+ <div class="metric-label">Unique Years</div>
392
+ <div class="metric-value">{stats['unique_years']}</div>
393
+ </div>
394
+ <div class="metric-container">
395
+ <div class="metric-label">Unique Files</div>
396
+ <div class="metric-value">{stats['unique_filenames']}</div>
397
+ </div>
398
+ </div>
399
+ """, unsafe_allow_html=True)
400
 
401
+ # Charts - three columns to include Districts
402
+ col1, col2, col3 = st.columns(3)
403
 
404
  with col1:
405
  # Source distribution chart
 
448
  st.plotly_chart(fig_year, use_container_width=True)
449
  else:
450
  st.info("No valid years found in the results")
451
+
452
+ with col3:
453
+ # District distribution chart
454
+ if stats.get('district_distribution'):
455
+ district_dist_filtered = {k: v for k, v in stats['district_distribution'].items() if k != 'Unknown'}
456
+ if district_dist_filtered:
457
+ district_df = pd.DataFrame(
458
+ list(district_dist_filtered.items()),
459
+ columns=['District', 'Count']
460
+ )
461
+ district_df = district_df.sort_values('Count', ascending=False)
462
+
463
+ fig_district = px.bar(
464
+ district_df,
465
+ x='Count',
466
+ y='District',
467
+ orientation='h',
468
+ title='Distribution by District',
469
+ color='Count',
470
+ color_continuous_scale='blues'
471
+ )
472
+ fig_district.update_layout(height=400, showlegend=False)
473
+ st.plotly_chart(fig_district, use_container_width=True)
474
+ else:
475
+ st.info("No valid districts found in the results")
476
+
477
+ # Close the container
478
+ st.markdown('</div>', unsafe_allow_html=True)
479
 
480
+ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieval Distribution"):
481
  """Display statistics as tables for smaller results with fixed alignment."""
482
  if not stats or stats.get('total_chunks', 0) == 0:
483
  return
484
 
485
+ # Wrap in styled container
486
+ st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
487
+
488
  st.subheader(f"📊 {title}")
489
 
490
  # Create a container with fixed height for alignment
 
495
  col1, col2, col3, col4 = st.columns(4)
496
 
497
  with col1:
498
+ st.markdown("**🏘️ Districts**")
499
+ if stats.get('district_distribution'):
500
+ district_dist_filtered = {k: v for k, v in stats['district_distribution'].items() if k != 'Unknown'}
501
+ if district_dist_filtered:
502
+ district_data = {
503
+ "District": list(district_dist_filtered.keys()),
504
+ "Count": list(district_dist_filtered.values())
505
+ }
506
+ district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
507
+ st.dataframe(district_df, hide_index=True, use_container_width=True)
508
+ else:
509
+ st.write("No district data")
510
+ else:
511
+ st.write("No district data")
512
 
513
  with col2:
514
  st.markdown("**📂 Sources**")
 
556
  st.dataframe(file_df, hide_index=True, use_container_width=True)
557
  else:
558
  st.write("No file data")
559
+
560
+ # Close container
561
+ st.markdown('</div>', unsafe_allow_html=True)
562
 
563
  @st.cache_data
564
  def load_filter_options():
 
632
  help="Choose specific reports to search. When enabled, all other filters are ignored."
633
  )
634
  st.markdown('</div>', unsafe_allow_html=True)
635
+
636
+ st.markdown('---')
637
 
638
  # Determine if filename filter is active
639
  filename_mode = len(selected_filenames) > 0
640
  # Sources filter
641
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
642
  st.markdown('<div class="filter-title">📊 Sources</div>', unsafe_allow_html=True)
643
  selected_sources = st.multiselect(
644
  "Select sources:",
 
651
  st.markdown('</div>', unsafe_allow_html=True)
652
 
653
  # Years filter
654
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
655
  st.markdown('<div class="filter-title">📅 Years</div>', unsafe_allow_html=True)
656
  selected_years = st.multiselect(
657
  "Select years:",
 
664
  st.markdown('</div>', unsafe_allow_html=True)
665
 
666
  # Districts filter
667
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
668
  st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
669
  selected_districts = st.multiselect(
670
  "Select districts:",
 
909
 
910
  # Show charts for 10+ results, tables for fewer
911
  if len(sources) >= 10:
912
+ display_chunk_statistics_charts(stats, "Retrieval Statistics")
913
+ # Also show tables below charts for detailed view
914
+ st.markdown("---")
915
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
916
  else:
917
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
918
 
919
  st.markdown("---")
920
  st.markdown("### 📄 Document Details")