# akryldigital's picture
# Gemini FSA (#6)
# 8d898c4 verified
"""
UI utility functions for data processing and statistics
"""
from typing import Dict, Any, List
from collections import Counter
def _normalize_year(raw_year: Any) -> str:
    """Return *raw_year* as a year string, or 'Unknown' if it is not usable."""
    if not raw_year or raw_year == 'Unknown':
        return 'Unknown'
    try:
        # Go through float so values such as 2020.0 or "2020.0" are accepted.
        year_int = int(float(raw_year))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: unsupported type;
        # OverflowError: infinite values such as "inf".
        return 'Unknown'
    # Only accept values inside a plausible year range.
    if 1900 <= year_int <= 2030:
        return str(year_int)
    return 'Unknown'


def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Each item in *sources* is read through its ``metadata`` attribute, using
    the keys ``source``, ``year``, ``filename`` and ``district``. Items with
    no ``metadata`` attribute (or with ``metadata`` set to ``None``) are
    counted with every field set to ``'Unknown'``.

    Args:
        sources: Retrieved chunk objects.

    Returns:
        An empty dict when *sources* is empty. Otherwise a dict with:
        ``total_chunks``; ``unique_sources`` / ``unique_filenames`` (distinct
        values, ``'Unknown'`` included); ``unique_years`` /
        ``unique_districts`` (distinct values, ``'Unknown'`` excluded); the
        ``*_distribution`` value -> count mappings; and the per-chunk lists
        ``sources``, ``years``, ``filenames`` and ``districts`` in input order.
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    districts = []

    for doc in sources:
        # `or {}` also covers objects whose metadata attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        sources_list.append(metadata.get('source', 'Unknown'))
        years.append(_normalize_year(metadata.get('year', 'Unknown')))
        filenames.append(metadata.get('filename', 'Unknown'))
        # Missing or empty districts are reported as 'Unknown'.
        districts.append(metadata.get('district', 'Unknown') or 'Unknown')

    # Count occurrences
    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)
    district_counts = Counter(districts)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for y in year_counts if y != 'Unknown'),
        'unique_filenames': len(filename_counts),
        'unique_districts': sum(1 for d in district_counts if d != 'Unknown'),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'district_distribution': dict(district_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
        'districts': districts,
    }