# OntoLearner Benchmark — Streamlit dashboard (app.py)
# Origin: hamedbabaeigiglou's Hugging Face Space, revision 727e73e (~5.66 kB).
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
import streamlit as st
import math
# =====================
# STREAMLIT CONFIG
# =====================
# Redirect Streamlit's config directory into the working directory — hosted
# platforms (e.g. HF Spaces) may not provide a writable $HOME/.streamlit.
os.environ["STREAMLIT_CONFIG_DIR"] = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(os.environ["STREAMLIT_CONFIG_DIR"], exist_ok=True)
# set_page_config must be the first Streamlit command executed on a page.
st.set_page_config(page_title="OntoLearner Benchmark", layout="wide")
# Inject CSS so the app fills the viewport with minimal default padding.
st.markdown(
"""
<style>
html, body, .main {
height: 100%;
width: 100%;
margin: 0;
padding: 0;
}
.block-container {
padding: 1rem;
margin: 0;
height: 100%;
width: 100%;
}
</style>
""",
unsafe_allow_html=True
)
st.title("OntoLearner Benchmark – Ontology Metrics Dashboard")
# =====================
# LOAD DATA
# =====================
# Ontology metrics are read from an Excel file shipped next to the app;
# requires an Excel engine (openpyxl) to be installed.
df = pd.read_excel("metrics.xlsx")
st.subheader("πŸ“˜ Ontology Metrics Table")
st.dataframe(df, use_container_width=True)
# =====================
# SUMMARY STATISTICS
# =====================
st.subheader("πŸ“Š Statistical Summary for All Metrics")
# Columns summarized in the overview table (must exist in metrics.xlsx).
metric_cols = [
    "total_nodes", "num_classes", "num_properties", "num_individuals",
    "avg_depth", "avg_breadth", "Processing Time (s)"
]
df_metrics = df[metric_cols]
# describe() yields count/mean/std/quartiles per column; transpose so each
# metric becomes a row of the summary table.
summary = df_metrics.describe().T
# Per-metric count of missing values, shown alongside distribution stats.
summary["missing"] = df_metrics.isnull().sum()
summary = summary.round(2)
# Fix: "missing" was computed above but dropped from the displayed view —
# include it so the table also reflects data completeness.
slight_summary = summary[['mean', 'std', '25%', '50%', '75%', 'max', 'missing']]
st.dataframe(slight_summary, use_container_width=True)
# =====================
# COMPLEXITY SCORE
# =====================
st.subheader("βš™οΈ Complexity Score Computation")
# All metrics that contribute to the complexity score.
metrics = [
    "total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes", "num_classes",
    "num_properties", "num_individuals", "max_depth", "min_depth", "avg_depth",
    "depth_variance", "max_breadth", "min_breadth", "avg_breadth", "breadth_variance",
    "num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations",
    "avg_terms",
]
# Metric groups, each sharing a single weight in the score.
graph_metrics = ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"]
coverage_metrics = ["num_classes", "num_properties", "num_individuals"]
hierarchy_metrics = ["max_depth", "min_depth", "avg_depth", "depth_variance"]
breadth_metrics = ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"]
llms4ol_metrics = ["num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations", "avg_terms"]
# Build metric -> weight by expanding each (group, weight) pair, then keep
# the original iteration order of `metrics` for the final mapping.
_group_weight = {}
for _group, _w in [
    (graph_metrics, 0.3),
    (coverage_metrics, 0.25),
    (hierarchy_metrics, 0.10),
    (breadth_metrics, 0.20),
    (llms4ol_metrics, 0.15),
]:
    for _m in _group:
        _group_weight[_m] = _w
weights = {m: _group_weight[m] for m in metrics if m in _group_weight}
def log_normalize(x):
    """Compress a raw metric value via log(1 + x).

    log1p keeps 0 mapped to 0 and stays accurate for small x, while
    damping the influence of very large counts on the score.
    """
    return np.log1p(x)
def complexity_score(onto_metric, a=0.4, b=6.0, eps=1e-12,
                     metric_names=None, metric_weights=None):
    """Compute a sigmoid-squashed, weighted log-score for one ontology.

    Parameters
    ----------
    onto_metric : mapping of metric name -> numeric value.
    a : sigmoid steepness.
    b : sigmoid midpoint (weighted log-score at which the result is ~0.5).
    eps : tiny offset kept inside the exponent for backward compatibility.
        NOTE(review): it has no protective effect — the sigmoid denominator
        1 + exp(.) can never be zero — so it is effectively a no-op.
    metric_names : metrics to include; defaults to the module-level
        ``metrics`` list (backward compatible with the original behavior).
    metric_weights : metric -> weight mapping; defaults to the module-level
        ``weights`` dict.

    Returns
    -------
    float in (0, 1): higher means a more complex ontology.
    """
    names = metrics if metric_names is None else metric_names
    wts = weights if metric_weights is None else metric_weights
    # log1p-normalize each raw metric value to damp large counts.
    norm_metric = {m: np.log1p(onto_metric[m]) for m in names}
    # Weighted average of the normalized values.
    weighted_norm = {m: norm_metric[m] * wts[m] for m in wts}
    c_score = sum(weighted_norm.values()) / sum(wts.values())
    # Squash into (0, 1) with a logistic function centered at b.
    return 1.0 / (1.0 + np.exp(-a * (c_score - b) + eps))
# Score every ontology row, then rank (rank 1 = most complex).
df_out = df.copy()
df_out["complexity_score"] = [
    complexity_score(row.to_dict()) for _, row in df.iterrows()
]
df_out["complexity_rank"] = (
    df_out["complexity_score"]
    .rank(method="min", ascending=False)
    .astype(int)
)
st.write("The following table represents the ontologies with complexity score and their ranking based on this score.")
st.dataframe(df_out, use_container_width=True)
# =====================
# VISUALIZATION
# =====================
st.subheader("πŸ“ˆ Complexity Score Visualizations")
top_n_val = 15
fig = plt.figure(figsize=(14, 11))
layout = fig.add_gridspec(3, 2)

# Panel 1 — horizontal bars for the most complex ontologies (full width).
ax_top = fig.add_subplot(layout[0, :])
ranked = df_out.sort_values("complexity_score", ascending=False).head(top_n_val)
ax_top.barh(ranked["Ontology ID"].astype(str), ranked["complexity_score"], color="#4C72B0")
ax_top.invert_yaxis()
ax_top.set_title(f"Top {top_n_val} Ontologies by Complexity Score")

# Panel 2 — processing time vs. complexity, with a degree-4 trend line.
ax_scatter = fig.add_subplot(layout[1, 0])
ax_scatter.scatter(df_out["complexity_score"], df_out["Processing Time (s)"],
                   alpha=0.4, s=60, edgecolor="black")
scores = df_out["complexity_score"].values
times = df_out["Processing Time (s)"].values
trend = np.poly1d(np.polyfit(scores, times, 4))
support = np.linspace(scores.min(), scores.max(), 300)
ax_scatter.plot(support, trend(support))
ax_scatter.set_title("Processing Time vs Complexity")

# Panel 3 — histogram of scores overlaid with a KDE curve.
ax_hist = fig.add_subplot(layout[1, 1])
ax_hist.hist(df_out["complexity_score"], bins=20, edgecolor="black", alpha=0.8, density=True)
score_vals = df_out["complexity_score"].dropna().values
density = gaussian_kde(score_vals)
grid_x = np.linspace(score_vals.min(), score_vals.max(), 1000)
ax_hist.plot(grid_x, density(grid_x), linewidth=1.5)
ax_hist.set_title("Distribution of Complexity Scores")

plt.tight_layout()
st.pyplot(fig)
# =====================
# CORRELATIONS
# =====================
st.subheader("πŸ“‘ Domain-Wise Correlations")
domains = sorted(df["Domain"].unique())
n_domains = len(domains)
n_rows = 2
n_cols = math.ceil(n_domains / n_rows)
# Fix: the subplot call previously recomputed int(np.ceil(n_domains / 2))
# inline instead of using n_cols, so grid and figsize could drift apart.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3.5, n_rows * 3.5))
# Robustness: plt.subplots returns a bare Axes (not an array) for a 1x1
# grid; atleast_1d + ravel makes flattening safe in every case.
axes = np.atleast_1d(axes).ravel()
# One correlation heatmap per domain over the complexity metrics.
for i, dom in enumerate(domains):
    sub = df[df["Domain"] == dom][metrics]
    corr = sub.corr()
    sns.heatmap(corr, cmap="coolwarm", square=True, cbar=False, linewidths=0.2,
                xticklabels=False, yticklabels=False, ax=axes[i])
    axes[i].set_title(dom, fontsize=13)
# Blank out unused panels. Slicing by n_domains (instead of looping from
# i + 1) also avoids a NameError when `domains` is empty.
for spare_ax in axes[n_domains:]:
    spare_ax.axis("off")
# Fix: tight_layout was called twice — the second call discarded the
# rect constraint of the first; a single call suffices.
plt.tight_layout()
st.pyplot(fig)