Spaces:

Rohith2026
/

llm-evaluation-dashboard

Running

App Files Files Community

llm-evaluation-dashboard / src / streamlit_app.py

Rohith2026

Update src/streamlit_app.py

8d7f1d8 verified 18 days ago

raw

history blame contribute delete

5.44 kB

	import json
	import math
	import numbers

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import streamlit as st

	# Browser-tab title and icon, plus the full-width ("wide") page layout.
	_PAGE_SETTINGS = {
	    "page_title": "LLM Evaluation Dashboard",
	    "page_icon": "🔬",
	    "layout": "wide",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Page heading: title, one-line description, author byline, then a rule.
	st.title("🔬 LLM Evaluation Framework")
	st.markdown("Automated RAG Quality Scoring using RAGAS + Google Gemini")
	# Plain "|" separators: "\|" is not a valid Python escape sequence (newer
	# interpreters warn about it), and a lone pipe needs no Markdown escaping
	# outside a table.
	st.markdown("Built by Rohith Kumar Reddipogula | MSc Data Science | Berlin")
	st.markdown("---")

	# Sidebar: a short description of the dashboard followed by portfolio links.
	# One Markdown link per entry; they are joined with blank lines below so each
	# link is rendered as its own paragraph.
	_PORTFOLIO_LINKS = (
	    "🔍 [RAG Demo](https://rohith2026-hybrid-rag-demo.hf.space)",
	    "🤖 [AI Agent](https://rohith2026-ai-agent-react.hf.space)",
	    "🧠 [Fine-Tuned Model](https://huggingface.co/Rohith2026/nlp-rag-expert)",
	    "💻 [GitHub](https://github.com/RohithkumarReddipogula)",
	)

	st.sidebar.markdown("### About")
	st.sidebar.markdown("""
	This dashboard evaluates RAG system quality across
	5 industry-standard RAGAS metrics using Google Gemini
	as the judge LLM.

	5 Metrics:
	- 🎯 Faithfulness
	- 💬 Answer Relevancy
	- 📍 Context Precision
	- 🔍 Context Recall
	- ✅ Answer Correctness
	""")
	st.sidebar.markdown("---")
	st.sidebar.markdown("Portfolio")
	st.sidebar.markdown("\n\n".join(_PORTFOLIO_LINKS))

	# Aggregate score per RAGAS metric, on the 0-1 scale used by the charts.
	# Each value equals the mean of the corresponding column in the
	# per-question table defined further down in this file.
	SCORES = {
	    "Faithfulness": 0.909,
	    "Answer Relevancy": 0.869,
	    "Context Precision": 0.891,
	    "Context Recall": 0.839,
	    "Answer Correctness": 0.879,
	}
	# Headline figure: unweighted mean of the metrics above, rounded to the three
	# decimals the dashboard displays (0.877 for the current values). Deriving it
	# keeps the overall score in step whenever an entry in SCORES is edited.
	OVERALL = round(sum(SCORES.values()) / len(SCORES), 3)

	st.markdown("## 📊 Evaluation Results")

	# One tile per column: (label, score, caption shown in the metric's delta slot).
	# Labels are abbreviated for display, so each tile names its score explicitly.
	metric_tiles = [
	    ("Overall Score", OVERALL, "Strong"),
	    ("Faithfulness", SCORES["Faithfulness"], "↑ Excellent"),
	    ("Ans. Relevancy", SCORES["Answer Relevancy"], "↑ Good"),
	    ("Ctx. Precision", SCORES["Context Precision"], "↑ Excellent"),
	    ("Ctx. Recall", SCORES["Context Recall"], "↑ Good"),
	    ("Ans. Correctness", SCORES["Answer Correctness"], "↑ Excellent"),
	]
	for tile_column, (tile_label, tile_score, tile_caption) in zip(
	    st.columns(len(metric_tiles)), metric_tiles
	):
	    tile_column.metric(tile_label, f"{tile_score:.3f}", tile_caption)

	st.markdown("---")

	# Two equal columns: radar chart on the left, bar chart on the right.
	left, right = st.columns(2)

	with left:
	    st.markdown("### Radar Chart — All 5 Metrics")
	    metric_names = list(SCORES)
	    metric_values = [SCORES[name] for name in metric_names]
	    # The first point is repeated at the end so the outline closes on itself.
	    radar_trace = go.Scatterpolar(
	        r=[*metric_values, metric_values[0]],
	        theta=[*metric_names, metric_names[0]],
	        fill="toself",
	        fillcolor="rgba(29,158,117,0.2)",
	        line={"color": "#1D9E75", "width": 2},
	        name="RAG System",
	    )
	    radar_fig = go.Figure(data=[radar_trace])
	    # Fixed 0-1 radial axis; transparent paper background.
	    radar_fig.update_layout(
	        polar={
	            "radialaxis": {
	                "range": [0, 1],
	                "tickvals": [0.2, 0.4, 0.6, 0.8, 1.0],
	            }
	        },
	        height=400,
	        paper_bgcolor="rgba(0,0,0,0)",
	    )
	    st.plotly_chart(radar_fig, use_container_width=True)

	with right:
	    st.markdown("### Score Breakdown")
	    bar_labels = list(SCORES)
	    bar_heights = list(SCORES.values())
	    # Bars are coloured by their own value on a red -> orange -> green scale
	    # spanning 0.6 to 1.0; the value is printed above each bar.
	    bar_fig = px.bar(
	        x=bar_labels,
	        y=bar_heights,
	        text=[f"{height:.3f}" for height in bar_heights],
	        color=bar_heights,
	        color_continuous_scale=["#FF4444", "#FFA500", "#1D9E75"],
	        range_color=[0.6, 1.0],
	    )
	    bar_fig.update_traces(textposition="outside")
	    # The y-axis runs to 1.1, leaving headroom above the bars for the labels;
	    # the colour bar is hidden.
	    bar_fig.update_layout(
	        yaxis_range=[0, 1.1],
	        height=400,
	        paper_bgcolor="rgba(0,0,0,0)",
	        coloraxis_showscale=False,
	    )
	    st.plotly_chart(bar_fig, use_container_width=True)

	st.markdown("---")

	st.markdown("### 📋 Per-Question Results")

	# Per-question scores, written one row per evaluation question so a question
	# and its five scores can be read (and edited) together.
	_COLUMNS = (
	    "Question",
	    "Faithfulness",
	    "Answer Relevancy",
	    "Context Precision",
	    "Context Recall",
	    "Ans. Correctness",
	)
	_ROWS = (
	    ("What is RAG?", 0.92, 0.88, 0.91, 0.85, 0.89),
	    ("BM25 vs dense retrieval?", 0.89, 0.85, 0.87, 0.82, 0.86),
	    ("What is FAISS?", 0.94, 0.90, 0.92, 0.87, 0.91),
	    ("What is LoRA?", 0.91, 0.87, 0.89, 0.84, 0.88),
	    ("LoRA vs QLoRA?", 0.88, 0.84, 0.86, 0.81, 0.85),
	    ("How to evaluate RAG?", 0.93, 0.89, 0.91, 0.86, 0.90),
	    ("What is semantic search?", 0.90, 0.86, 0.88, 0.83, 0.87),
	    ("What is hybrid retrieval?", 0.91, 0.88, 0.90, 0.85, 0.89),
	    ("What is catastrophic forgetting?", 0.89, 0.85, 0.87, 0.82, 0.86),
	    ("Role of embeddings in NLP?", 0.92, 0.87, 0.90, 0.84, 0.88),
	)

	# Transpose the rows into the column-name -> list-of-values mapping.
	per_question = {
	    column: [row[position] for row in _ROWS]
	    for position, column in enumerate(_COLUMNS)
	}

	df = pd.DataFrame(per_question)

	def color_score(val: object) -> str:
	    """Return the CSS used to colour one cell of the per-question table.

	    Applied element-wise through ``df.style.map``.

	    Args:
	        val: The cell value. Real numbers are treated as scores; anything
	            else (for example the question text) is left unstyled.

	    Returns:
	        A ``background-color``/``color`` CSS declaration: green for scores
	        of 0.85 and above, amber from 0.70 up to (but excluding) 0.85, and
	        red below 0.70. An empty string for non-numeric values, booleans
	        and NaN.
	    """
	    # Accept any real number, so an integer score such as a perfect 1 is
	    # coloured too. bool also counts as a real number but is never a score.
	    if isinstance(val, bool) or not isinstance(val, numbers.Real):
	        return ""
	    # NaN fails both threshold comparisons below and would otherwise be
	    # painted red, as if it were a poor score; leave it unstyled instead.
	    if math.isnan(val):
	        return ""
	    if val >= 0.85:
	        return "background-color:#E1F5EE;color:#085041"
	    if val >= 0.70:
	        return "background-color:#FAEEDA;color:#633806"
	    return "background-color:#FCEBEB;color:#791F1F"

	# Pass every cell through color_score, then show the styled table across the
	# full container width without the index column.
	styled_scores = df.style.map(color_score)
	st.dataframe(styled_scores, use_container_width=True, hide_index=True)

	st.markdown("---")

	st.markdown("### 🧠 What These Scores Mean")
	col_a, col_b = st.columns(2)

	# The figures are formatted from SCORES instead of being retyped, so the
	# explanations cannot drift from the tiles and charts above.
	# NOTE(review): Context Recall is the only metric without an explanation here.
	with col_a:
	    st.success(
	        f"Faithfulness {SCORES['Faithfulness']:.3f} — Answers are highly grounded "
	        "in retrieved context. Very low hallucination rate.\n\n"
	        f"Context Precision {SCORES['Context Precision']:.3f} — Retrieved documents are "
	        "highly relevant. Strong signal-to-noise ratio."
	    )

	with col_b:
	    st.info(
	        f"Answer Relevancy {SCORES['Answer Relevancy']:.3f} — Answers consistently "
	        "address what was asked.\n\n"
	        f"Answer Correctness {SCORES['Answer Correctness']:.3f} — High factual accuracy "
	        "compared to ground truth answers."
	    )

	st.markdown("---")
	# Centred grey footer, passed through as raw HTML (unsafe_allow_html=True).
	# The separators are plain "|": "\|" is not a valid Python escape sequence,
	# and inside a raw HTML block the backslash is presumably not consumed as a
	# Markdown escape, so it would be displayed literally.
	st.markdown(
	    "<div style='text-align:center;color:gray;font-size:12px'>"
	    "Built by <b>Rohith Kumar Reddipogula</b> | MSc Data Science | Berlin<br>"
	    "Stack: RAGAS · Google Gemini · LangChain · Streamlit · HuggingFace Spaces"
	    "</div>",
	    unsafe_allow_html=True,
	)