# OntoLearner Benchmark — Streamlit dashboard (app.py)
# Origin: hamedbabaeigiglou's Hugging Face Space, revision 727e73e (~5.66 kB).
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
import streamlit as st
import math
# =====================
# STREAMLIT CONFIG
# =====================
# Redirect Streamlit's config directory into the working directory — hosted
# platforms (e.g. HF Spaces) may not provide a writable $HOME/.streamlit.
os.environ["STREAMLIT_CONFIG_DIR"] = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(os.environ["STREAMLIT_CONFIG_DIR"], exist_ok=True)
# set_page_config must be the first Streamlit command executed on a page.
st.set_page_config(page_title="OntoLearner Benchmark", layout="wide")
# Inject CSS so the app fills the viewport with minimal default padding.
st.markdown(
"""
<style>
html, body, .main {
height: 100%;
width: 100%;
margin: 0;
padding: 0;
}
.block-container {
padding: 1rem;
margin: 0;
height: 100%;
width: 100%;
}
</style>
""",
unsafe_allow_html=True
)
st.title("OntoLearner Benchmark – Ontology Metrics Dashboard")
# =====================
# LOAD DATA
# =====================
# Ontology metrics are read from an Excel file shipped next to the app;
# requires an Excel engine (openpyxl) to be installed.
df = pd.read_excel("metrics.xlsx")
st.subheader("πŸ“˜ Ontology Metrics Table")
st.dataframe(df, use_container_width=True)
# =====================
# SUMMARY STATISTICS
# =====================
st.subheader("πŸ“Š Statistical Summary for All Metrics")
# Columns summarized in the overview table (must exist in metrics.xlsx).
metric_cols = [
    "total_nodes", "num_classes", "num_properties", "num_individuals",
    "avg_depth", "avg_breadth", "Processing Time (s)"
]
df_metrics = df[metric_cols]
# describe() yields count/mean/std/quartiles per column; transpose so each
# metric becomes a row of the summary table.
summary = df_metrics.describe().T
# Per-metric count of missing values, shown alongside distribution stats.
summary["missing"] = df_metrics.isnull().sum()
summary = summary.round(2)
# Fix: "missing" was computed above but dropped from the displayed view —
# include it so the table also reflects data completeness.
slight_summary = summary[['mean', 'std', '25%', '50%', '75%', 'max', 'missing']]
st.dataframe(slight_summary, use_container_width=True)
# =====================
# COMPLEXITY SCORE
# =====================
st.subheader("βš™οΈ Complexity Score Computation")
# All metrics that contribute to the complexity score.
metrics = [
    "total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes", "num_classes",
    "num_properties", "num_individuals", "max_depth", "min_depth", "avg_depth",
    "depth_variance", "max_breadth", "min_breadth", "avg_breadth", "breadth_variance",
    "num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations",
    "avg_terms",
]
# Metric groups, each sharing a single weight in the score.
graph_metrics = ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"]
coverage_metrics = ["num_classes", "num_properties", "num_individuals"]
hierarchy_metrics = ["max_depth", "min_depth", "avg_depth", "depth_variance"]
breadth_metrics = ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"]
llms4ol_metrics = ["num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations", "avg_terms"]
# Build metric -> weight by expanding each (group, weight) pair, then keep
# the original iteration order of `metrics` for the final mapping.
_group_weight = {}
for _group, _w in [
    (graph_metrics, 0.3),
    (coverage_metrics, 0.25),
    (hierarchy_metrics, 0.10),
    (breadth_metrics, 0.20),
    (llms4ol_metrics, 0.15),
]:
    for _m in _group:
        _group_weight[_m] = _w
weights = {m: _group_weight[m] for m in metrics if m in _group_weight}
def log_normalize(x):
    """Compress a raw metric value via log(1 + x).

    log1p keeps 0 mapped to 0 and stays accurate for small x, while
    damping the influence of very large counts on the score.
    """
    return np.log1p(x)
def complexity_score(onto_metric, a=0.4, b=6.0, eps=1e-12,
                     metric_names=None, metric_weights=None):
    """Compute a sigmoid-squashed, weighted log-score for one ontology.

    Parameters
    ----------
    onto_metric : mapping of metric name -> numeric value.
    a : sigmoid steepness.
    b : sigmoid midpoint (weighted log-score at which the result is ~0.5).
    eps : tiny offset kept inside the exponent for backward compatibility.
        NOTE(review): it has no protective effect — the sigmoid denominator
        1 + exp(.) can never be zero — so it is effectively a no-op.
    metric_names : metrics to include; defaults to the module-level
        ``metrics`` list (backward compatible with the original behavior).
    metric_weights : metric -> weight mapping; defaults to the module-level
        ``weights`` dict.

    Returns
    -------
    float in (0, 1): higher means a more complex ontology.
    """
    names = metrics if metric_names is None else metric_names
    wts = weights if metric_weights is None else metric_weights
    # log1p-normalize each raw metric value to damp large counts.
    norm_metric = {m: np.log1p(onto_metric[m]) for m in names}
    # Weighted average of the normalized values.
    weighted_norm = {m: norm_metric[m] * wts[m] for m in wts}
    c_score = sum(weighted_norm.values()) / sum(wts.values())
    # Squash into (0, 1) with a logistic function centered at b.
    return 1.0 / (1.0 + np.exp(-a * (c_score - b) + eps))
# Score every ontology row, then rank (rank 1 = most complex).
df_out = df.copy()
df_out["complexity_score"] = [
    complexity_score(row.to_dict()) for _, row in df.iterrows()
]
df_out["complexity_rank"] = (
    df_out["complexity_score"]
    .rank(method="min", ascending=False)
    .astype(int)
)
st.write("The following table represents the ontologies with complexity score and their ranking based on this score.")
st.dataframe(df_out, use_container_width=True)
# =====================
# VISUALIZATION
# =====================
st.subheader("πŸ“ˆ Complexity Score Visualizations")
top_n_val = 15
fig = plt.figure(figsize=(14, 11))
layout = fig.add_gridspec(3, 2)

# Panel 1 — horizontal bars for the most complex ontologies (full width).
ax_top = fig.add_subplot(layout[0, :])
ranked = df_out.sort_values("complexity_score", ascending=False).head(top_n_val)
ax_top.barh(ranked["Ontology ID"].astype(str), ranked["complexity_score"], color="#4C72B0")
ax_top.invert_yaxis()
ax_top.set_title(f"Top {top_n_val} Ontologies by Complexity Score")

# Panel 2 — processing time vs. complexity, with a degree-4 trend line.
ax_scatter = fig.add_subplot(layout[1, 0])
ax_scatter.scatter(df_out["complexity_score"], df_out["Processing Time (s)"],
                   alpha=0.4, s=60, edgecolor="black")
scores = df_out["complexity_score"].values
times = df_out["Processing Time (s)"].values
trend = np.poly1d(np.polyfit(scores, times, 4))
support = np.linspace(scores.min(), scores.max(), 300)
ax_scatter.plot(support, trend(support))
ax_scatter.set_title("Processing Time vs Complexity")

# Panel 3 — histogram of scores overlaid with a KDE curve.
ax_hist = fig.add_subplot(layout[1, 1])
ax_hist.hist(df_out["complexity_score"], bins=20, edgecolor="black", alpha=0.8, density=True)
score_vals = df_out["complexity_score"].dropna().values
density = gaussian_kde(score_vals)
grid_x = np.linspace(score_vals.min(), score_vals.max(), 1000)
ax_hist.plot(grid_x, density(grid_x), linewidth=1.5)
ax_hist.set_title("Distribution of Complexity Scores")

plt.tight_layout()
st.pyplot(fig)
# =====================
# CORRELATIONS
# =====================
st.subheader("πŸ“‘ Domain-Wise Correlations")
domains = sorted(df["Domain"].unique())
n_domains = len(domains)
n_rows = 2
n_cols = math.ceil(n_domains / n_rows)
# Fix: the subplot call previously recomputed int(np.ceil(n_domains / 2))
# inline instead of using n_cols, so grid and figsize could drift apart.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3.5, n_rows * 3.5))
# Robustness: plt.subplots returns a bare Axes (not an array) for a 1x1
# grid; atleast_1d + ravel makes flattening safe in every case.
axes = np.atleast_1d(axes).ravel()
# One correlation heatmap per domain over the complexity metrics.
for i, dom in enumerate(domains):
    sub = df[df["Domain"] == dom][metrics]
    corr = sub.corr()
    sns.heatmap(corr, cmap="coolwarm", square=True, cbar=False, linewidths=0.2,
                xticklabels=False, yticklabels=False, ax=axes[i])
    axes[i].set_title(dom, fontsize=13)
# Blank out unused panels. Slicing by n_domains (instead of looping from
# i + 1) also avoids a NameError when `domains` is empty.
for spare_ax in axes[n_domains:]:
    spare_ax.axis("off")
# Fix: tight_layout was called twice — the second call discarded the
# rect constraint of the first; a single call suffices.
plt.tight_layout()
st.pyplot(fig)