Spaces:

akryldigital
/

audit_assistant

Running

App Files Files Community

Ara Yeroyan committed on Oct 31, 2025

Commit

5262a14

1 Parent(s): 02d7f4f

add retrieval visualisations

Browse files

Files changed (1) hide show

app.py +216 -3

app.py CHANGED Viewed

@@ -10,10 +10,13 @@ import uuid
 import logging
 import traceback
 from pathlib import Path
 import streamlit as st
 from langchain_core.messages import HumanMessage, AIMessage
 from multi_agent_chatbot import get_multi_agent_chatbot
 from smart_chatbot import get_chatbot as get_smart_chatbot
@@ -273,6 +276,203 @@ def serialize_documents(sources):
     return serialized
 @st.cache_data
 def load_filter_options():
     try:
@@ -607,14 +807,27 @@ def main():
                 # Count unique filenames
                 unique_filenames = set()
                 for doc in sources:
-                    filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
                     unique_filenames.add(filename)
                 st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
                 if len(unique_filenames) < len(sources):
                     st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
-                for i, doc in enumerate(sources):  # Show top 10
                     # Get relevance score and ID if available
                     metadata = getattr(doc, 'metadata', {})
                     score = metadata.get('reranked_score', metadata.get('original_score', None))

 import logging
 import traceback
 from pathlib import Path
+from typing import List, Dict, Any
+from collections import Counter
 import streamlit as st
 from langchain_core.messages import HumanMessage, AIMessage
+import pandas as pd
+import plotly.express as px
 from multi_agent_chatbot import get_multi_agent_chatbot
 from smart_chatbot import get_chatbot as get_smart_chatbot
     return serialized
def _normalize_year(raw_year: Any, min_year: int, max_year: int) -> str:
    """Return ``raw_year`` as a whole-number string, or ``'Unknown'`` if unusable."""
    if not raw_year or raw_year == 'Unknown':
        return 'Unknown'
    try:
        # Go through float first so values such as "2021.0" or 2021.0 are accepted.
        year_int = int(float(raw_year))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite values ("inf", float("inf")), which
        # float() accepts but int() refuses to convert.
        return 'Unknown'
    if min_year <= year_int <= max_year:
        return str(year_int)
    return 'Unknown'


def extract_chunk_statistics(
    sources: List[Any],
    *,
    min_year: int = 1900,
    max_year: int = 2030,
) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Each item in ``sources`` is read through its ``metadata`` attribute; the
    ``source``, ``year`` and ``filename`` keys are collected, with ``'Unknown'``
    substituted for anything missing. Items whose ``metadata`` attribute is
    absent or ``None`` are counted with all three fields set to ``'Unknown'``.

    Args:
        sources: Retrieved chunk objects.
        min_year: Smallest year (inclusive) accepted as valid.
        max_year: Largest year (inclusive) accepted as valid.

    Returns:
        An empty dict when ``sources`` is empty. Otherwise a dict with:

        * ``total_chunks`` - number of items in ``sources``
        * ``unique_sources`` / ``unique_filenames`` - distinct value counts
          (``'Unknown'`` is counted as a value)
        * ``unique_years`` - distinct valid years, excluding ``'Unknown'``
        * ``source_distribution`` / ``year_distribution`` /
          ``filename_distribution`` - value -> occurrence count
        * ``sources`` / ``years`` / ``filenames`` - per-chunk values, in the
          same order as ``sources``
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    for doc in sources:
        # ``or {}`` also covers objects whose metadata attribute is set to None.
        metadata = getattr(doc, 'metadata', None) or {}
        sources_list.append(metadata.get('source', 'Unknown'))
        years.append(_normalize_year(metadata.get('year', 'Unknown'), min_year, max_year))
        filenames.append(metadata.get('filename', 'Unknown'))

    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for year in year_counts if year != 'Unknown'),
        'unique_filenames': len(filename_counts),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
    }
def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieved Chunks Statistics"):
    """Render chunk statistics as headline metrics plus two Plotly bar charts.

    Draws a subheader, a row of four metrics, then a horizontal bar chart of
    chunks per source next to a bar chart of chunks per year. Nothing is
    rendered when ``stats`` is empty or reports zero chunks.

    Args:
        stats: Statistics dict as produced by ``extract_chunk_statistics``.
        title: Text shown in the subheader.
    """
    has_chunks = bool(stats) and stats.get('total_chunks', 0) != 0
    if not has_chunks:
        return

    st.subheader(f"📊 {title}")

    # Headline numbers, one per column.
    headline_metrics = (
        ("Total Chunks", 'total_chunks'),
        ("Unique Sources", 'unique_sources'),
        ("Unique Years", 'unique_years'),
        ("Unique Files", 'unique_filenames'),
    )
    for column, (label, key) in zip(st.columns(4), headline_metrics):
        with column:
            st.metric(label, stats[key])

    # Two charts next to each other: sources on the left, years on the right.
    source_col, year_col = st.columns(2)

    with source_col:
        by_source = stats['source_distribution']
        if by_source:
            source_frame = pd.DataFrame(list(by_source.items()), columns=['Source', 'Count'])
            source_chart = px.bar(
                source_frame,
                x='Count',
                y='Source',
                orientation='h',
                title='Distribution by Source',
                color='Count',
                color_continuous_scale='viridis',
            )
            source_chart.update_layout(height=400, showlegend=False)
            st.plotly_chart(source_chart, use_container_width=True)

    with year_col:
        by_year = stats['year_distribution']
        if by_year:
            # 'Unknown' is a placeholder bucket, not a year, so keep it off the axis.
            known_years = {year: count for year, count in by_year.items() if year != 'Unknown'}
            if not known_years:
                st.info("No valid years found in the results")
            else:
                year_frame = pd.DataFrame(list(known_years.items()), columns=['Year', 'Count'])
                # Order numerically while leaving the labels as strings.
                year_frame = year_frame.sort_values('Year', key=lambda col: col.astype(int))
                year_chart = px.bar(
                    year_frame,
                    x='Year',
                    y='Count',
                    title='Distribution by Year',
                    color='Count',
                    color_continuous_scale='plasma',
                )
                # Force a discrete axis so years are not interpolated as numbers.
                year_chart.update_xaxes(type='category')
                year_chart.update_layout(height=400, showlegend=False)
                st.plotly_chart(year_chart, use_container_width=True)
def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieved Chunks Statistics"):
    """Render chunk statistics as four side-by-side tables.

    The columns show, in order: a summary of the headline counts, chunks per
    source (largest first), chunks per year (ascending, 'Unknown' omitted) and
    the five most frequent filenames (names longer than 30 characters are
    shortened). Nothing is rendered when ``stats`` is empty or reports zero
    chunks.

    Args:
        stats: Statistics dict as produced by ``extract_chunk_statistics``.
        title: Text shown in the subheader.
    """
    if not (stats and stats.get('total_chunks', 0) != 0):
        return

    st.subheader(f"📊 {title}")

    with st.container():
        # Four equal-width columns keep the tables aligned with each other.
        summary_col, sources_col, years_col, files_col = st.columns(4)

        with summary_col:
            st.markdown("**📈 Summary**")
            summary_keys = ('total_chunks', 'unique_sources', 'unique_years', 'unique_filenames')
            summary_frame = pd.DataFrame({
                "Metric": ["Total", "Sources", "Years", "Files"],
                "Count": [stats[key] for key in summary_keys],
            })
            st.dataframe(summary_frame, hide_index=True, use_container_width=True)

        with sources_col:
            st.markdown("**📂 Sources**")
            by_source = stats['source_distribution']
            if not by_source:
                st.write("No source data")
            else:
                source_frame = pd.DataFrame({
                    "Source": list(by_source.keys()),
                    "Count": list(by_source.values()),
                })
                source_frame = source_frame.sort_values('Count', ascending=False)
                st.dataframe(source_frame, hide_index=True, use_container_width=True)

        with years_col:
            st.markdown("**📅 Years**")
            by_year = stats['year_distribution']
            # A missing distribution and one holding only 'Unknown' get the same message.
            known_years = (
                {year: count for year, count in by_year.items() if year != 'Unknown'}
                if by_year
                else {}
            )
            if not known_years:
                st.write("No year data")
            else:
                year_frame = pd.DataFrame({
                    "Year": list(known_years.keys()),
                    "Count": list(known_years.values()),
                })
                # Order numerically while leaving the labels as strings.
                year_frame = year_frame.sort_values('Year', key=lambda col: col.astype(int))
                st.dataframe(year_frame, hide_index=True, use_container_width=True)

        with files_col:
            st.markdown("**📄 Files**")
            by_file = stats['filename_distribution']
            if not by_file:
                st.write("No file data")
            else:
                # Most frequent first; only the top five are listed.
                top_files = sorted(by_file.items(), key=lambda item: item[1], reverse=True)[:5]
                labels = []
                counts = []
                for name, count in top_files:
                    labels.append(name[:30] + "..." if len(name) > 30 else name)
                    counts.append(count)
                file_frame = pd.DataFrame({"File": labels, "Count": counts})
                st.dataframe(file_frame, hide_index=True, use_container_width=True)
 @st.cache_data
 def load_filter_options():
     try:
                 # Count unique filenames
                 unique_filenames = set()
                 for doc in sources:
+                    metadata = getattr(doc, 'metadata', {})
+                    filename = metadata.get('filename', 'Unknown')
                     unique_filenames.add(filename)
                 st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
                 if len(unique_filenames) < len(sources):
                     st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
+                # Extract and display statistics
+                stats = extract_chunk_statistics(sources)
+                # Show charts for 10+ results, tables for fewer
+                if len(sources) >= 10:
+                    display_chunk_statistics_charts(stats, "Retrieved Documents Statistics")
+                else:
+                    display_chunk_statistics_table(stats, "Retrieved Documents Statistics")
+                st.markdown("---")
+                st.markdown("### 📄 Document Details")
+                for i, doc in enumerate(sources):  # Show all documents
                     # Get relevance score and ID if available
                     metadata = getattr(doc, 'metadata', {})
                     score = metadata.get('reranked_score', metadata.get('original_score', None))