UAP-Data-Analysis-Tool / utils /enhanced_example.py
Ashoka74's picture
Deploy current work to HF Space (slim)
a1aef88
Raw
History Blame Contribute Delete
14.8 kB
"""
Enhanced filtering and visualization example for UAP Data Analysis Tool
Demonstrates the improved dynamic filtering and visualization pipeline
"""
import streamlit as st
import pandas as pd
import numpy as np
from data_processing import DataProcessor
from visualization import UAP_Visualizer
from session_manager import SessionStateManager
import plotly.graph_objects as go
def main():
    """Entry point of the demo app.

    Renders the page title, initializes session state, loads the dataset
    (falling back to synthetic data), collects the sidebar options and then
    hands over to the view function matching the selected analysis mode.
    """
    st.title("πŸš€ Enhanced UAP Data Analysis Pipeline")
    st.markdown("### Dynamic Filtering & Interactive Visualization System")

    SessionStateManager.initialize()

    @st.cache_data
    def load_sample_data():
        """Return the UAP dataset, or a seeded random sample if loading fails."""
        try:
            df = DataProcessor.load_data('final_ufoseti_dataset.h5', key='df')
            st.success(f"βœ… Loaded UAP dataset with {len(df):,} records")
            return df
        except Exception as e:
            st.warning(f"Could not load UAP dataset: {e}")

        # Only reached when the real dataset could not be loaded.
        # The fixed seed keeps the synthetic data identical between runs;
        # the column order below fixes the order of the random draws.
        np.random.seed(42)
        n_samples = 10000
        shape_choices = ['circle', 'triangle', 'disk', 'light', 'other']
        color_choices = ['white', 'red', 'orange', 'blue', 'green', 'unknown']
        df = pd.DataFrame({
            'date': pd.date_range('2020-01-01', periods=n_samples, freq='H'),
            'latitude': np.random.uniform(-90, 90, n_samples),
            'longitude': np.random.uniform(-180, 180, n_samples),
            'duration_minutes': np.random.exponential(10, n_samples),
            'shape': np.random.choice(shape_choices, n_samples),
            'color': np.random.choice(color_choices, n_samples),
            'altitude': np.random.uniform(100, 50000, n_samples),
            'witnesses': np.random.poisson(2, n_samples) + 1,
            'credibility_score': np.random.beta(2, 5, n_samples),
            'description_length': np.random.lognormal(3, 1, n_samples).astype(int),
        })
        st.info(f"πŸ“Š Using sample dataset with {len(df):,} records for demonstration")
        return df

    df = load_sample_data()

    # Sidebar: analysis mode plus the options forwarded to the views.
    mode_options = [
        "Enhanced Filtering",
        "Interactive Visualizations",
        "Dashboard View",
        "Performance Demo",
    ]
    with st.sidebar:
        st.header("πŸ”§ Analysis Options")
        analysis_mode = st.radio("Select Analysis Mode", mode_options)
        enable_quick_filters = st.checkbox("Enable Quick Filters", value=False)
        enable_advanced_filters = st.checkbox("Enable Advanced Filters", value=True)
        max_viz_points = st.slider("Max Visualization Points", 1000, 50000, 10000, step=1000)

    # Map each mode label to a zero-argument callable rendering that view.
    views = {
        "Enhanced Filtering": lambda: show_enhanced_filtering(
            df, enable_quick_filters, enable_advanced_filters
        ),
        "Interactive Visualizations": lambda: show_interactive_visualizations(df, max_viz_points),
        "Dashboard View": lambda: show_dashboard_view(df, max_viz_points),
        "Performance Demo": lambda: show_performance_demo(df),
    }
    render_view = views.get(analysis_mode)
    if render_view is not None:
        render_view()
def show_enhanced_filtering(df: pd.DataFrame, enable_quick_filters: bool, enable_advanced_filters: bool):
    """Render the filtering demo page.

    Shows a profile summary of ``df`` (column counts per kind, optionally the
    full profile as JSON), runs ``DataProcessor.filter_dataframe_enhanced``
    with the two flags, and, when rows remain, previews the first 100 rows
    and offers saving to session state or downloading as CSV.

    Args:
        df: Dataset to profile and filter.
        enable_quick_filters: Forwarded to the filter call.
        enable_advanced_filters: Forwarded to the filter call.
    """
    st.header("πŸ” Enhanced Dynamic Filtering")

    with st.expander("πŸ“Š Data Profile Analysis", expanded=True):
        profile = DataProcessor.profile_data(df)

        # (metric label, key in the profile dict) for the four summary tiles.
        tiles = (
            ("Categorical Columns", 'categorical_columns'),
            ("Numeric Columns", 'numeric_columns'),
            ("DateTime Columns", 'datetime_columns'),
            ("Text Columns", 'text_columns'),
        )
        for slot, (label, profile_key) in zip(st.columns(4), tiles):
            with slot:
                st.metric(label, len(profile[profile_key]))

        if st.checkbox("Show detailed column analysis"):
            st.json(profile)

    st.subheader("Apply Filters")
    filtered_df = DataProcessor.filter_dataframe_enhanced(
        df,
        enable_quick_filters=enable_quick_filters,
        enable_advanced_filters=enable_advanced_filters,
    )

    # Nothing to preview or export when the filters removed every row.
    if len(filtered_df) == 0:
        return

    st.subheader("πŸ“‹ Filtered Data Preview")
    st.dataframe(filtered_df.head(100), use_container_width=True)

    save_slot, download_slot = st.columns(2)
    with save_slot:
        if st.button("πŸ’Ύ Save Filtered Data"):
            # Kept in session state only; nothing is written to disk here.
            SessionStateManager.set('last_filtered_data', filtered_df)
            st.success("Filtered data saved to session!")
    with download_slot:
        st.download_button(
            label="πŸ“₯ Download CSV",
            data=filtered_df.to_csv(index=False),
            file_name="filtered_uap_data.csv",
            mime="text/csv",
        )
def _render_scatter(df: pd.DataFrame, numeric_cols: list, categorical_cols: list, max_points: int):
    """Render the scatter-plot controls and, on button press, the chart."""
    col1, col2, col3 = st.columns(3)
    with col1:
        x_col = st.selectbox("X-axis", numeric_cols, key="scatter_x")
    with col2:
        # The column already used for X is not offered again for Y.
        y_col = st.selectbox("Y-axis", [col for col in numeric_cols if col != x_col], key="scatter_y")
    with col3:
        color_choice = st.selectbox("Color by", ["None"] + categorical_cols, key="scatter_color")
    color_col = None if color_choice == "None" else color_choice

    if st.button("Generate Scatter Plot"):
        fig = UAP_Visualizer.plot_interactive_scatter(
            df, x_col, y_col, color_col=color_col, max_points=max_points
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_histogram(df: pd.DataFrame, numeric_cols: list, categorical_cols: list):
    """Render the histogram controls; categorical columns are drawn as a treemap."""
    col1, col2 = st.columns(2)
    with col1:
        hist_col = st.selectbox("Column to analyze", numeric_cols + categorical_cols, key="hist_col")
    with col2:
        bins = st.slider("Number of bins", 10, 100, 50, key="hist_bins")

    if st.button("Generate Histogram"):
        if hist_col in numeric_cols:
            fig = UAP_Visualizer.plot_interactive_histogram(df, hist_col, bins=bins)
        else:
            # Binning does not apply to categories; show their counts as a treemap.
            fig = UAP_Visualizer.plot_interactive_treemap(df, hist_col, top_n=20)
        st.plotly_chart(fig, use_container_width=True)


def _render_treemap(df: pd.DataFrame, categorical_cols: list):
    """Render the treemap controls and, on button press, the chart."""
    col1, col2 = st.columns(2)
    with col1:
        tree_col = st.selectbox("Categorical column", categorical_cols, key="tree_col")
    with col2:
        top_n = st.slider("Top N categories", 5, 50, 20, key="tree_n")

    if st.button("Generate Treemap"):
        fig = UAP_Visualizer.plot_interactive_treemap(df, tree_col, top_n=top_n)
        st.plotly_chart(fig, use_container_width=True)


def _render_correlation(df: pd.DataFrame, numeric_cols: list):
    """Render the correlation-matrix controls and, on button press, the chart."""
    col1, col2 = st.columns(2)
    with col1:
        corr_method = st.selectbox("Correlation method", ["pearson", "spearman", "kendall"], key="corr_method")
    with col2:
        # Pre-select at most the first ten numeric columns.
        selected_cols = st.multiselect("Select columns", numeric_cols, default=numeric_cols[:10], key="corr_cols")

    if selected_cols and st.button("Generate Correlation Matrix"):
        fig = UAP_Visualizer.plot_correlation_matrix(df[selected_cols], method=corr_method)
        st.plotly_chart(fig, use_container_width=True)


def _render_time_series(df: pd.DataFrame, datetime_cols: list, numeric_cols: list):
    """Render the time-series controls and, on button press, the chart."""
    col1, col2, col3 = st.columns(3)
    with col1:
        date_col = st.selectbox("Date column", datetime_cols, key="ts_date")
    with col2:
        value_cols = st.multiselect("Value columns", numeric_cols, default=numeric_cols[:3], key="ts_values")
    with col3:
        freq_choice = st.selectbox("Resample frequency", ["None", "D", "W", "M"], key="ts_freq")
    resample_freq = None if freq_choice == "None" else freq_choice

    if value_cols and st.button("Generate Time Series"):
        fig = UAP_Visualizer.plot_time_series(df, date_col, value_cols, resample_freq=resample_freq)
        st.plotly_chart(fig, use_container_width=True)


def show_interactive_visualizations(df: pd.DataFrame, max_points: int):
    """Render the interactive visualization page.

    Classifies the columns of ``df`` as numeric, categorical (object or
    category dtype) or datetime, lets the user pick a visualization type and
    shows the matching controls. When the dataset does not have the columns
    the chosen type needs, a warning is shown instead of an empty page.

    Args:
        df: Dataset to visualize.
        max_points: Point limit forwarded to the scatter plot.
    """
    st.header("πŸ“Š Interactive Visualizations")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col])]

    viz_type = st.selectbox(
        "Select Visualization Type",
        ["Scatter Plot", "Histogram", "Treemap", "Correlation Matrix", "Time Series"]
    )

    if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
        _render_scatter(df, numeric_cols, categorical_cols, max_points)
    elif viz_type == "Histogram" and (numeric_cols or categorical_cols):
        # The histogram view handles both kinds of column, so either suffices.
        _render_histogram(df, numeric_cols, categorical_cols)
    elif viz_type == "Treemap" and categorical_cols:
        _render_treemap(df, categorical_cols)
    elif viz_type == "Correlation Matrix" and len(numeric_cols) >= 2:
        _render_correlation(df, numeric_cols)
    elif viz_type == "Time Series" and datetime_cols and numeric_cols:
        _render_time_series(df, datetime_cols, numeric_cols)
    else:
        # The selected type's column requirements are not met by this dataset.
        st.warning(f"{viz_type} is not available: the dataset does not have the required column types.")
def show_dashboard_view(df: pd.DataFrame, max_points: int):
    """Render the dashboard page.

    Requires at least two numeric columns and one categorical (object or
    category dtype) column; otherwise only a warning is shown. Builds a
    scatter plot, a histogram and a treemap, plus a correlation matrix when
    there are three or more numeric columns, shows them in two columns, and
    offers a button that combines them into one 2x2 figure.

    Args:
        df: Dataset to chart.
        max_points: Overall point budget; the scatter plot gets a quarter of it.
    """
    st.header("πŸ“ˆ Interactive Dashboard")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if len(numeric_cols) < 2 or len(categorical_cols) < 1:
        st.warning("Not enough numeric or categorical columns for dashboard generation")
        return

    # Past the guard, the first two numeric and first categorical column exist.
    primary_numeric = numeric_cols[0]
    secondary_numeric = numeric_cols[1]
    primary_categorical = categorical_cols[0]

    with st.spinner("Generating dashboard charts..."):
        charts = [
            UAP_Visualizer.plot_interactive_scatter(
                df, primary_numeric, secondary_numeric,
                color_col=primary_categorical,
                max_points=max_points // 4,
            ),
            UAP_Visualizer.plot_interactive_histogram(df, primary_numeric),
            UAP_Visualizer.plot_interactive_treemap(df, primary_categorical, top_n=15),
        ]
        # The correlation matrix is added only with three or more numeric
        # columns, and uses at most the first five.
        if len(numeric_cols) >= 3:
            charts.append(UAP_Visualizer.plot_correlation_matrix(df[numeric_cols[:5]]))

    # Left column: scatter and treemap. Right column: histogram and,
    # when present, the correlation matrix.
    left, right = st.columns(2)
    with left:
        st.plotly_chart(charts[0], use_container_width=True)
        st.plotly_chart(charts[2], use_container_width=True)
    with right:
        st.plotly_chart(charts[1], use_container_width=True)
        for extra_chart in charts[3:]:
            st.plotly_chart(extra_chart, use_container_width=True)

    if st.button("Generate Combined Dashboard"):
        dashboard_fig = UAP_Visualizer.create_dashboard_layout(charts[:4], layout="2x2")
        st.plotly_chart(dashboard_fig, use_container_width=True)
def show_performance_demo(df: pd.DataFrame):
    """Render the performance demonstration page.

    Shows the row count, deep memory usage and the number of entries under
    the ``cached_visualizations`` session-state key. When the test button is
    pressed, times one data-profiling call and one histogram build, and, for
    datasets with more than 10,000 rows, times ``_smart_sampling`` at several
    sample sizes and plots the timings.

    Args:
        df: Dataset used for the measurements.
    """
    # Bound at function scope so every timing section below can rely on it,
    # regardless of which branch runs.
    import time

    st.header("⚑ Performance Demonstration")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Dataset Size", f"{len(df):,} rows")
    with col2:
        st.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    with col3:
        cache_info = st.session_state.get('cached_visualizations', {})
        st.metric("Cached Visualizations", len(cache_info))

    st.subheader("πŸƒβ€β™‚οΈ Speed Comparison")
    if not st.button("Run Performance Test"):
        return

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    DataProcessor.profile_data(df)
    profile_time = time.perf_counter() - start_time

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    viz_time = 0
    if numeric_cols:
        start_time = time.perf_counter()
        UAP_Visualizer.plot_interactive_histogram(df, numeric_cols[0])
        viz_time = time.perf_counter() - start_time

    perf_col1, perf_col2 = st.columns(2)
    with perf_col1:
        st.metric("Data Profiling", f"{profile_time:.3f} seconds")
    with perf_col2:
        st.metric("Visualization Generation", f"{viz_time:.3f} seconds")

    st.info("πŸš€ Subsequent calls to the same functions will be much faster due to caching!")

    # Sampling demo, only for datasets larger than the biggest fixed size.
    if len(df) > 10000:
        st.subheader("πŸ“Š Smart Sampling Demo")
        # The guard above ensures every size fits the data and that len(df)
        # does not duplicate one of the fixed sizes.
        sample_sizes = [1000, 5000, 10000, len(df)]
        sample_times = []
        for size in sample_sizes:
            start_time = time.perf_counter()
            UAP_Visualizer._smart_sampling(df, max_points=size)
            sample_times.append(time.perf_counter() - start_time)

        # Numeric index keeps the x-axis in size order; formatted strings
        # would sort lexicographically ("10,000" before "5,000").
        perf_df = pd.DataFrame({
            'Sample Size': sample_sizes,
            'Processing Time': sample_times,
        })
        st.line_chart(perf_df.set_index('Sample Size'))
# Script entry point: call main() only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()