Spaces:

matis35
/

FFGEN-Demo

Sleeping

Matis Codjia

Fix: analytics

c6390d0 4 months ago

9.64 kB

	"""
	Statistics Dashboard
	Displays metrics for the cache system
	"""

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from datetime import datetime, timedelta
	import sys
	from pathlib import Path

	# Add parent directory to path for imports
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from stats_logger import StatsLogger

	# ==========================================
	# PAGE CONFIG
	# ==========================================
	# Wide layout: the page body spans the full browser width.
	st.set_page_config(page_title="Cache Statistics", layout="wide")

	# Main heading of the dashboard.
	st.title("Cache Performance Statistics")

	# ==========================================
	# LOAD DATA
	# ==========================================

	logger = StatsLogger()

	# Fetch everything the page needs from the logger up front:
	# per-query records, aggregated summary, and the cache-miss log.
	stats = logger.read_stats()
	summary = logger.get_summary_stats()
	cache_misses = logger.read_cache_misses()

	# Nothing recorded yet: show a hint and halt this script run.
	if not stats:
	    st.warning("No data yet. Submit some queries first!")
	    st.stop()

	# Tabular view of the per-query records.
	df = pd.DataFrame(stats)

	# When a 'timestamp' column exists, parse it to datetimes and order the
	# rows by it.
	if 'timestamp' in df.columns:
	    parsed_times = pd.to_datetime(df['timestamp'])
	    df = df.assign(timestamp=parsed_times).sort_values('timestamp')

	# ==========================================
	# COLOR MAPPING (Granularity)
	# ==========================================
	# Maps each value of the 'status' column to the hex color used for it in
	# the charts: shades of green for the hit types, red for misses.
	COLOR_MAP = {
	    "perfect_match": "#064e3b",  # very dark green (exact)
	    "code_hit": "#10b981",  # standard green (close code)
	    "feedback_hit": "#34d399",  # light green (semantic)
	    "miss": "#ef4444",  # red
	    # Backward compatibility: older status labels share the light green.
	    "hit": "#34d399",
	    "semantic hit": "#34d399",
	}

	# ==========================================
	# KPI METRICS
	# ==========================================

	st.header("Key Performance Indicators")

	# Row 1: overall KPIs, one metric card per column.
	total_col, hit_rate_col, confidence_col, tokens_col = st.columns(4)

	total_col.metric(
	    label="Total Queries",
	    value=f"{summary['total_queries']:,}",
	    help="Total number of queries submitted",
	)

	hit_rate_col.metric(
	    label="Global Hit Rate",
	    value=f"{summary['hit_rate']:.1f}%",
	    delta=f"{summary['total_hits']} hits",
	    help="Percentage of queries resolved via cache (All types)",
	)

	confidence_col.metric(
	    label="Avg Confidence",
	    value=f"{summary['avg_confidence']:.2f}",
	    help="Average confidence score for cache hits",
	)

	# delta_color="inverse" flips the delta coloring for this card.
	tokens_col.metric(
	    label="DeepSeek Tokens",
	    value=f"{summary['total_deepseek_tokens']:,}",
	    delta=f"{summary['total_misses']} calls",
	    delta_color="inverse",
	    help="Total tokens consumed via DeepSeek API",
	)

	# Row 2: hit breakdown by type, shown only when the summary carries a
	# 'breakdown' entry and at least one hit was recorded.
	if 'breakdown' in summary and summary['total_hits'] > 0:
	    st.markdown("### 🎯 Hit Breakdown")
	    perfect_col, code_col, feedback_col, pie_col = st.columns(4)

	    breakdown = summary['breakdown']
	    # Missing hit types count as zero.
	    perfect_count = breakdown.get('perfect_match', 0)
	    code_count = breakdown.get('code_hit', 0)
	    feedback_count = breakdown.get('feedback_hit', 0)

	    perfect_col.metric("✨ Perfect Matches", perfect_count, help="Exact string matches")
	    code_col.metric("💻 Code Hits", code_count, help="High code similarity matches")
	    feedback_col.metric("🧠 Feedback Hits", feedback_count, help="Semantic vector matches")

	    # Small donut chart showing how the hits split across the three types.
	    donut_trace = go.Pie(
	        labels=['Perfect', 'Code', 'Feedback'],
	        values=[perfect_count, code_count, feedback_count],
	        hole=.4,
	        marker={
	            'colors': [
	                COLOR_MAP['perfect_match'],
	                COLOR_MAP['code_hit'],
	                COLOR_MAP['feedback_hit'],
	            ]
	        },
	    )
	    fig_pie = go.Figure(data=[donut_trace])
	    fig_pie.update_layout(margin={'t': 0, 'b': 0, 'l': 0, 'r': 0}, height=100, showlegend=False)
	    pie_col.plotly_chart(fig_pie, use_container_width=True)

	st.divider()

	# ==========================================
	# TIME SERIES
	# ==========================================

	st.header("Query Timeline")

	timeline_col, latency_col = st.columns(2)

	# Left: one marker per query -- confidence against time, colored by
	# status and sized by response time.
	timeline_fig = px.scatter(
	    df,
	    x='timestamp',
	    y='confidence',
	    color='status',
	    size='response_time_ms',
	    color_discrete_map=COLOR_MAP,
	    title="Cache Performance Over Time",
	    labels={
	        'timestamp': 'Time',
	        'confidence': 'Confidence Score',
	        'status': 'Result Type',
	        'response_time_ms': 'Response Time (ms)',
	    },
	)
	timeline_fig.update_layout(height=400)
	timeline_col.plotly_chart(timeline_fig, use_container_width=True)

	# Right: box plot of response times, one box per status.
	latency_fig = px.box(
	    df,
	    x='status',
	    y='response_time_ms',
	    color='status',
	    color_discrete_map=COLOR_MAP,
	    title="Response Time Distribution by Type",
	    labels={'response_time_ms': 'Response Time (ms)', 'status': 'Result Type'},
	)
	latency_fig.update_layout(height=400)
	latency_col.plotly_chart(latency_fig, use_container_width=True)

	# ==========================================
	# SIMILARITY SCORES
	# ==========================================

	st.header("Similarity Analysis")

	distribution_col, correlation_col = st.columns(2)

	# Left: histogram of similarity scores, stacked by status. Skipped when
	# the data has no 'similarity_score' column.
	if 'similarity_score' in df.columns:
	    histogram = px.histogram(
	        df,
	        x='similarity_score',
	        color='status',
	        nbins=30,
	        title="Similarity Score Distribution",
	        labels={'similarity_score': 'Similarity Score (lower = more similar)'},
	        color_discrete_map=COLOR_MAP,
	    )
	    # Indicative dashed line at 0.3, annotated as the typical threshold
	    # (if applicable).
	    histogram.add_vline(x=0.3, line_dash="dash", line_color="orange", annotation_text="Typ. Threshold")
	    histogram.update_layout(height=400)
	    distribution_col.plotly_chart(histogram, use_container_width=True)

	# Right: confidence against similarity, restricted to rows whose status
	# is anything other than 'miss'.
	hits_df = df[df['status'] != 'miss']

	if 'similarity_score' in hits_df.columns and not hits_df.empty:
	    hit_scatter = px.scatter(
	        hits_df,
	        x='similarity_score',
	        y='confidence',
	        color='status',  # color by hit type so the types can be told apart
	        size='response_time_ms',
	        color_discrete_map=COLOR_MAP,
	        title="Confidence vs Similarity (Hits Only)",
	        labels={
	            'similarity_score': 'Similarity Score',
	            'confidence': 'Confidence',
	            'response_time_ms': 'Response Time (ms)',
	        },
	    )
	    hit_scatter.update_layout(height=400)
	    correlation_col.plotly_chart(hit_scatter, use_container_width=True)

	# ==========================================
	# ERROR CATEGORIES
	# ==========================================

	st.header("Error Categories Analysis")

	categories_col, difficulty_col = st.columns(2)

	# Left: horizontal bar chart of the ten most frequent error categories.
	# Skipped when the data has no 'error_category' column.
	if 'error_category' in df.columns:
	    top_categories = df['error_category'].value_counts().head(10)
	    category_fig = px.bar(
	        x=top_categories.values,
	        y=top_categories.index,
	        orientation='h',
	        title="Top 10 Error Categories",
	        labels={'x': 'Count', 'y': 'Error Category'},
	        color=top_categories.values,
	        color_continuous_scale='blues',
	    )
	    category_fig.update_layout(height=400, showlegend=False)
	    categories_col.plotly_chart(category_fig, use_container_width=True)

	# Right: pie chart of how often each difficulty value occurs.
	# Skipped when the data has no 'difficulty' column.
	if 'difficulty' in df.columns:
	    difficulty_counts = df['difficulty'].value_counts()
	    difficulty_fig = px.pie(
	        values=difficulty_counts.values,
	        names=difficulty_counts.index,
	        title="Difficulty Distribution",
	        color_discrete_sequence=px.colors.sequential.RdBu,
	    )
	    difficulty_fig.update_layout(height=400)
	    difficulty_col.plotly_chart(difficulty_fig, use_container_width=True)

	# ==========================================
	# CACHE MISSES LOG
	# ==========================================

	st.header("Recent Cache Misses")

	if not cache_misses:
	    st.success("No cache misses yet - all queries resolved from cache!")
	else:
	    miss_total = len(cache_misses)
	    st.info(f"{miss_total} cache misses logged (ready for retraining)")

	    # Walk the last five log entries in reverse order. Each expander is
	    # numbered with the entry's 1-based position in the full log.
	    for offset, entry in enumerate(reversed(cache_misses[-5:])):
	        theme = entry.get('theme', 'N/A')
	        category = entry.get('error_category', 'N/A')

	        with st.expander(f"Miss #{miss_total - offset} - {theme} ({category})"):
	            code_col, feedback_col = st.columns([1, 1])

	            # Submitted code, rendered with C syntax highlighting.
	            code_col.markdown("Code:")
	            code_col.code(entry.get('code', 'N/A'), language='c')

	            # Feedback stored alongside the miss.
	            feedback_col.markdown("Generated Feedback:")
	            feedback_col.write(entry.get('feedback', 'N/A'))

	            st.markdown(f"Tokens Used: {entry.get('tokens_used', 0)}")
	            st.markdown(f"Timestamp: {entry.get('timestamp', 'N/A')}")

	# ==========================================
	# EXPORT DATA
	# ==========================================

	st.divider()

	st.header("Export Data")

	col1, col2 = st.columns(2)

	# The download buttons are rendered directly rather than behind an
	# `if st.button(...)` gate. st.button() is True only during the single
	# rerun triggered by its own click, so a download button nested under it
	# needs a second click and disappears again on the next rerun.

	with col1:
	    # Full statistics table as CSV, without the DataFrame index.
	    csv = df.to_csv(index=False)
	    st.download_button(
	        label="Download stats.csv",
	        data=csv,
	        file_name=f"cache_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
	        mime="text/csv"
	    )

	with col2:
	    # Offer the JSONL export only when at least one miss is logged.
	    if cache_misses:
	        import json
	        # One JSON document per line, in log order.
	        jsonl_content = "\n".join(json.dumps(miss) for miss in cache_misses)
	        st.download_button(
	            label="Download cache_miss.jsonl",
	            data=jsonl_content,
	            file_name=f"cache_miss_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
	            mime="application/jsonl"
	        )