Spaces:
Sleeping
Sleeping
| """ | |
| UI utility functions for data processing and statistics | |
| """ | |
| from typing import Dict, Any, List | |
| from collections import Counter | |
# Inclusive bounds for a metadata value to be accepted as a plausible year.
_MIN_YEAR = 1900
_MAX_YEAR = 2030


def _normalize_year(raw_year: Any) -> str:
    """Return ``raw_year`` as a canonical year string, or ``'Unknown'`` if unusable."""
    if not raw_year or raw_year == 'Unknown':
        return 'Unknown'
    try:
        # Go through float first so "2020.0" and 2020.0 are accepted as 2020.
        year_int = int(float(raw_year))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite values ('inf', '1e400'), which
        # int() refuses to convert; treat them like any other bad year.
        return 'Unknown'
    if _MIN_YEAR <= year_int <= _MAX_YEAR:
        return str(year_int)
    return 'Unknown'


def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Each item in ``sources`` is read through its ``metadata`` attribute,
    which is expected to be a mapping. A missing or ``None`` ``metadata``
    is treated as empty, so every field of that chunk becomes ``'Unknown'``.

    Args:
        sources: Retrieved chunk objects; only ``metadata`` is read, using
            the keys ``'source'``, ``'year'``, ``'filename'``, ``'district'``.

    Returns:
        An empty dict when ``sources`` is empty or ``None``. Otherwise a
        dict with:

        * ``total_chunks``: number of items in ``sources``.
        * ``unique_sources`` / ``unique_filenames``: number of distinct
          values, counting ``'Unknown'`` as a value.
        * ``unique_years`` / ``unique_districts``: number of distinct
          values, excluding ``'Unknown'``.
        * ``*_distribution``: value -> occurrence count for each field.
        * ``sources`` / ``years`` / ``filenames`` / ``districts``: the
          per-chunk values, in the same order as the input.

        Years are normalized to integer strings (``'2020.0'`` -> ``'2020'``);
        years that cannot be parsed or fall outside 1900-2030 are reported
        as ``'Unknown'``. Empty districts are reported as ``'Unknown'``.
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    districts = []

    for doc in sources:
        # `or {}` also covers objects whose metadata attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        sources_list.append(metadata.get('source', 'Unknown'))
        years.append(_normalize_year(metadata.get('year', 'Unknown')))
        filenames.append(metadata.get('filename', 'Unknown'))
        # Empty or None districts collapse to 'Unknown'.
        districts.append(metadata.get('district') or 'Unknown')

    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)
    district_counts = Counter(districts)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for y in year_counts if y != 'Unknown'),
        'unique_filenames': len(filename_counts),
        'unique_districts': sum(1 for d in district_counts if d != 'Unknown'),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'district_distribution': dict(district_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
        'districts': districts,
    }