"""UI utility functions for data processing and statistics."""

from typing import Dict, Any, List
from collections import Counter

# Placeholder recorded whenever a metadata field is missing or unusable.
_UNKNOWN = 'Unknown'


def _normalize_year(raw: Any, min_year: int, max_year: int) -> str:
    """Return *raw* as a 4-digit-style year string, or 'Unknown' if unusable.

    Accepts ints, floats and numeric strings (e.g. ``2020``, ``2020.0``,
    ``"2020.0"``); the value is truncated to an int and must fall within
    ``[min_year, max_year]``.
    """
    if not raw or raw == _UNKNOWN:
        return _UNKNOWN
    try:
        # Go through float so that strings such as "2020.0" are accepted.
        year_int = int(float(raw))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float('inf')) -- must not abort the whole run.
        return _UNKNOWN
    if min_year <= year_int <= max_year:
        return str(year_int)
    return _UNKNOWN


def extract_chunk_statistics(
    sources: List[Any],
    *,
    min_year: int = 1900,
    max_year: int = 2030,
) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Each item in *sources* is expected to expose a ``metadata`` mapping;
    the keys ``source``, ``year``, ``filename`` and ``district`` are read
    from it. Items without a ``metadata`` attribute (or with ``None``
    metadata) are counted with every field set to ``'Unknown'``.

    Args:
        sources: Retrieved chunk/document objects.
        min_year: Smallest year (inclusive) accepted as valid.
        max_year: Largest year (inclusive) accepted as valid.

    Returns:
        An empty dict when *sources* is empty. Otherwise a dict with:

        * ``total_chunks`` -- number of items in *sources*.
        * ``unique_sources`` / ``unique_filenames`` -- number of distinct
          values (``'Unknown'`` counts as a value here).
        * ``unique_years`` / ``unique_districts`` -- number of distinct
          values, excluding ``'Unknown'``.
        * ``source_distribution`` / ``year_distribution`` /
          ``filename_distribution`` / ``district_distribution`` --
          value -> occurrence count.
        * ``sources`` / ``years`` / ``filenames`` / ``districts`` --
          per-chunk values, in input order.
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    districts = []

    for doc in sources:
        # `or {}` also covers objects whose metadata attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        sources_list.append(metadata.get('source', _UNKNOWN))
        years.append(
            _normalize_year(metadata.get('year', _UNKNOWN), min_year, max_year)
        )
        filenames.append(metadata.get('filename', _UNKNOWN))
        # Empty / None districts are folded into the 'Unknown' bucket.
        districts.append(metadata.get('district') or _UNKNOWN)

    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)
    district_counts = Counter(districts)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for y in year_counts if y != _UNKNOWN),
        'unique_filenames': len(filename_counts),
        'unique_districts': sum(1 for d in district_counts if d != _UNKNOWN),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'district_distribution': dict(district_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
        'districts': districts,
    }