Spaces:

lambdaofgod
/

paperswithcode_nbow

Runtime error

App Files Files Community

paperswithcode_nbow / pages /2_Statistics.py

lambdaofgod

test metrics for best models

c5a2694 almost 3 years ago

raw

history blame

2.29 kB

	import pandas as pd
	import streamlit as st
	import config
	from pathlib import Path as P
	import json


	# Paths of the stored nbow metrics files under "assets".
	# Materialised as a sorted list rather than left as a generator: a generator
	# can be consumed only once, and glob order depends on the filesystem, so
	# sorting keeps the rendered sections in a stable order.
	nbow_results_path = sorted(P("assets").glob("nbow*"))

	def display_metrics_dict(metrics, display_only_accuracy):
	    """Render a single metrics record as Streamlit elements.

	    Args:
	        metrics: Mapping that contains at least ``"model_name"`` and
	            ``"columns"`` (a ``"_"``-separated string). When
	            ``display_only_accuracy`` is true it must also provide
	            ``metrics["accuracy@k"]["10"]``. The mapping is not modified.
	        display_only_accuracy: If true, show only the accuracy@10 value;
	            otherwise show every remaining entry of ``metrics``.

	    Raises:
	        KeyError: If one of the required keys is missing.
	    """
	    # Work on a shallow copy so the caller's dict keeps its "model_name" and
	    # "columns" entries; they are removed only so that they are not repeated
	    # in the JSON dump below.
	    remaining = dict(metrics)
	    model_name = remaining.pop("model_name")
	    columns = remaining.pop("columns").split("_")

	    st.markdown(f"### columns: {columns}")
	    st.markdown(f"best model {model_name}")

	    if display_only_accuracy:
	        st.json({"accuracy@10": remaining["accuracy@k"]["10"]})
	    else:
	        st.json(remaining)

	def display_metrics():
	    """Show the test metrics stored in every path of ``nbow_results_path``.

	    Adds a sidebar checkbox (checked by default) that limits the output to
	    accuracy@10, writes the section heading, then parses each path as JSON
	    and renders it with ``display_metrics_dict``.
	    """
	    display_only_accuracy = st.sidebar.checkbox("display only accuracy@10", value=True)
	    st.markdown("## Test metrics for best validation model on given columns")
	    for p in nbow_results_path:
	        # Use a context manager so each file is closed as soon as it is
	        # parsed instead of staying open until garbage collection.
	        with open(p, "r", encoding="utf-8") as metrics_file:
	            metrics = json.load(metrics_file)
	        display_metrics_dict(metrics, display_only_accuracy)

	# Render the per-model test metrics section.
	display_metrics()

	# Per-query result tables; their locations are taken from the config module.
	best_results_df = pd.read_csv(config.best_tasks_path)
	worst_results_df = pd.read_csv(config.worst_tasks_path)

	# Sidebar toggles, both unchecked by default, that gate the sections below.
	show_worst_best_statistics = st.sidebar.checkbox(
	    "show worst/best statistics grouped by area"
	)
	show_area_aggregated_results = st.sidebar.checkbox(
	    "show results aggregated by area"
	)
	# Optional section: raw best/worst query tables, sorted by a user-chosen column.
	if show_worst_best_statistics:
	    st.markdown(
	        """
	## Worst/best queries
	The following are top 10 worst/best queries per area by number of hits.
	There are at least 10 documents per query in the test set, so number of hits/10 is the accuracy.
	"""
	    )
	    # An explicit key keeps this widget distinct from the other "sort by"
	    # selectbox on the page; without it, two selectboxes with the same label
	    # and options would get the same auto-generated widget ID.
	    sort_key = st.selectbox(
	        "sort by", list(best_results_df.columns), key="worst_best_sort_key"
	    )
	    st.markdown("## Queries with best results")
	    st.table(best_results_df.sort_values(sort_key, ascending=False))
	    st.markdown("## Queries with worst results")
	    # NOTE(review): the sort column is chosen from the best-results columns
	    # and reused here; assumes both CSVs share the same columns - confirm.
	    st.table(worst_results_df.sort_values(sort_key, ascending=False))

	# Optional section: per-area means of the best/worst query tables.
	if show_area_aggregated_results:
	    st.markdown("## Area aggregated results")
	    # numeric_only=True averages only the numeric columns. Since pandas 2.0
	    # the default no longer drops non-numeric columns and the mean raises on
	    # them instead, so the restriction is requested explicitly.
	    best_results_agg = (
	        best_results_df.groupby("area").mean(numeric_only=True).reset_index()
	    )
	    worst_results_agg = (
	        worst_results_df.groupby("area").mean(numeric_only=True).reset_index()
	    )
	    # An explicit key keeps this widget distinct from the other "sort by"
	    # selectbox on the page.
	    sort_key = st.selectbox(
	        "sort by", list(best_results_agg.columns), key="area_agg_sort_key"
	    )
	    st.markdown("Best results")
	    st.table(best_results_agg.sort_values(sort_key, ascending=False))
	    st.markdown("Worst results")
	    st.table(worst_results_agg.sort_values(sort_key, ascending=False))