| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from scipy.stats import gaussian_kde |
| import streamlit as st |
| import math |
|
|
| |
| |
| |
# Keep Streamlit's configuration inside the working directory so the app
# stays self-contained regardless of the user's home setup.
_config_dir = os.path.join(os.getcwd(), ".streamlit")
os.environ["STREAMLIT_CONFIG_DIR"] = _config_dir
os.makedirs(_config_dir, exist_ok=True)


st.set_page_config(page_title="OntoLearner Benchmark", layout="wide")
|
|
# Full-bleed layout: strip Streamlit's default page margins/padding.
_LAYOUT_CSS = """
    <style>
    html, body, .main {
        height: 100%;
        width: 100%;
        margin: 0;
        padding: 0;
    }
    .block-container {
        padding: 1rem;
        margin: 0;
        height: 100%;
        width: 100%;
    }
    </style>
    """

st.markdown(_LAYOUT_CSS, unsafe_allow_html=True)


st.title("OntoLearner Benchmark β Ontology Metrics Dashboard")
|
|
| |
| |
| |
# Source workbook: one row of pre-computed metrics per ontology.
METRICS_FILE = "metrics.xlsx"
df = pd.read_excel(METRICS_FILE)

st.subheader("π Ontology Metrics Table")
st.dataframe(df, use_container_width=True)
|
|
| |
| |
| |
st.subheader("π Statistical Summary for All Metrics")


# Columns included in the descriptive-statistics table shown to the user.
_summary_columns = [
    "total_nodes", "num_classes", "num_properties", "num_individuals",
    "avg_depth", "avg_breadth", "Processing Time (s)",
]

_subset = df[_summary_columns]

# describe() gives count/mean/std/min/quartiles/max per column; transpose so
# each metric is a row, then track how many values are missing per metric.
stats_table = _subset.describe().T
stats_table["missing"] = _subset.isnull().sum()
stats_table = stats_table.round(2)

# Display only the most informative statistics (min/count/missing omitted).
_display_stats = stats_table[["mean", "std", "25%", "50%", "75%", "max"]]

st.dataframe(_display_stats, use_container_width=True)
|
|
|
|
| |
| |
| |
# --- Section: weighted complexity score (metric groups and weights below) ---
st.subheader("βοΈ Complexity Score Computation")
|
|
# All metrics that feed the weighted complexity score. The list is ordered
# group-by-group: graph structure, coverage, hierarchy depth, breadth, and
# the LLMs4OL task counts.
metrics = [
    "total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes", "num_classes",
    "num_properties", "num_individuals", "max_depth", "min_depth", "avg_depth",
    "depth_variance", "max_breadth", "min_breadth", "avg_breadth", "breadth_variance",
    "num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations",
    "avg_terms",
]


graph_metrics = ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"]
coverage_metrics = ["num_classes", "num_properties", "num_individuals"]
hierarchy_metrics = ["max_depth", "min_depth", "avg_depth", "depth_variance"]
breadth_metrics = ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"]
llms4ol_metrics = ["num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations", "avg_terms"]


# Relative importance of each metric group (graph structure dominates).
_group_weights = (
    (graph_metrics, 0.3),
    (coverage_metrics, 0.25),
    (hierarchy_metrics, 0.10),
    (breadth_metrics, 0.20),
    (llms4ol_metrics, 0.15),
)

# Per-metric weight table, keyed in the same order as `metrics`.
weights = {
    name: group_weight
    for name in metrics
    for group, group_weight in _group_weights
    if name in group
}
|
|
def log_normalize(x):
    """Map a non-negative metric value onto a compressed log scale.

    Uses log(1 + x) via ``numpy.log1p`` so that zero maps to zero and
    heavy-tailed raw counts are tamed without losing monotonicity.
    """
    return np.log1p(x)
|
|
def complexity_score(onto_metric, a=0.4, b=6.0, eps=1e-12, metric_weights=None):
    """Compute a sigmoid-squashed weighted complexity score in (0, 1).

    Each metric value is log-normalized with ``log1p``, multiplied by its
    group weight, and the weighted mean is pushed through a logistic curve
    so that scores are comparable across ontologies of very different scale.

    Args:
        onto_metric: mapping of metric name -> raw numeric value; must
            contain every key of ``metric_weights``.
        a: steepness of the logistic squashing.
        b: midpoint of the logistic squashing, in log-metric units.
        eps: tiny offset kept inside the exponent for backward
            compatibility. NOTE(review): placed there it is effectively a
            no-op — it does not guard any division.
        metric_weights: optional mapping of metric name -> weight. Defaults
            to the module-level ``weights`` table, preserving the original
            behavior while allowing callers to score custom metric sets.

    Returns:
        Complexity score strictly between 0 and 1 (numpy float).
    """
    if metric_weights is None:
        metric_weights = weights  # module-level weight table
    # log1p compresses heavy-tailed counts (same transform as log_normalize).
    normalized = {m: np.log1p(onto_metric[m]) for m in metric_weights}
    weighted_mean = (
        sum(normalized[m] * metric_weights[m] for m in metric_weights)
        / sum(metric_weights.values())
    )
    return 1.0 / (1.0 + np.exp(-a * (weighted_mean - b) + eps))
|
|
# Score every ontology row and rank them (rank 1 = most complex).
df_out = df.copy()
df_out["complexity_score"] = [
    complexity_score(row.to_dict()) for _, row in df.iterrows()
]
df_out["complexity_rank"] = (
    df_out["complexity_score"].rank(method="min", ascending=False).astype(int)
)


st.write("The following table represents the ontologies with complexity score and their ranking based on this score.")
st.dataframe(df_out, use_container_width=True)
|
|
|
|
| |
| |
| |
st.subheader("π Complexity Score Visualizations")


# How many of the highest-scoring ontologies to show in the ranking chart.
top_n_val = 15


# Shared figure for all three complexity panels (bar, scatter, histogram).
fig = plt.figure(figsize=(14, 11))


# --- Panel 1 (full width): horizontal bar chart of the top-N ontologies ---
ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
ranked = df_out.sort_values("complexity_score", ascending=False)
leaders = ranked.head(top_n_val)

ax1.barh(leaders["Ontology ID"].astype(str), leaders["complexity_score"], color="#4C72B0")
# Highest score at the top of the chart.
ax1.invert_yaxis()
ax1.set_title(f"Top {top_n_val} Ontologies by Complexity Score")
|
|
| |
# --- Panel 2: processing time vs. complexity, with a polynomial trend ---
ax2 = plt.subplot2grid((3, 2), (1, 0))
ax2.scatter(df_out["complexity_score"], df_out["Processing Time (s)"],
            alpha=0.4, s=60, edgecolor="black")


x = df_out["complexity_score"].values
y = df_out["Processing Time (s)"].values


# np.polyfit propagates any NaN (missing processing times exist in this data)
# into every coefficient, which silently hides the trend line. Fit only on
# rows where both values are finite, and only if there are enough points for
# a degree-4 polynomial.
finite = np.isfinite(x) & np.isfinite(y)
if finite.sum() > 4:
    coeffs = np.polyfit(x[finite], y[finite], 4)
    poly = np.poly1d(coeffs)
    xs = np.linspace(x[finite].min(), x[finite].max(), 300)
    ax2.plot(xs, poly(xs))


ax2.set_title("Processing Time vs Complexity")
|
|
| |
# --- Panel 3: histogram plus KDE of the complexity-score distribution ---
ax3 = plt.subplot2grid((3, 2), (1, 1))
ax3.hist(df_out["complexity_score"], bins=20, edgecolor="black", alpha=0.8, density=True)


# Overlay a kernel density estimate computed on the non-missing scores.
score_values = df_out["complexity_score"].dropna().values
density = gaussian_kde(score_values)
grid = np.linspace(score_values.min(), score_values.max(), 1000)
ax3.plot(grid, density(grid), linewidth=1.5)


ax3.set_title("Distribution of Complexity Scores")
plt.tight_layout()


st.pyplot(fig)
|
|
|
|
| |
| |
| |
st.subheader("π‘ Domain-Wise Correlations")


# One correlation heatmap per domain, in stable (sorted) order.
domains = sorted(df["Domain"].unique())
n_domains = len(domains)


# Fixed two-row grid; columns expand to fit every domain.
n_rows = 2
n_cols = math.ceil(n_domains / n_rows)


# Reuse n_cols here — the original recomputed it as int(np.ceil(n_domains / 2)),
# duplicating the layout logic with an identical value.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3.5, n_rows * 3.5))
axes = axes.flatten()
|
|
# Draw one metric-correlation heatmap per domain on the shared axes grid.
for i, dom in enumerate(domains):
    sub = df.loc[df["Domain"] == dom, metrics]
    corr = sub.corr()

    sns.heatmap(corr, cmap="coolwarm", square=True, cbar=False, linewidths=0.2,
                xticklabels=False, yticklabels=False, ax=axes[i])
    axes[i].set_title(dom, fontsize=13)


# Blank out unused grid cells. Slicing by len(domains) — instead of reusing
# the stale loop variable ``i`` after the loop — avoids a NameError when
# ``domains`` is empty.
for spare_ax in axes[len(domains):]:
    spare_ax.axis("off")
|
|
# Single layout pass — the original called tight_layout twice back-to-back
# (the rect=[0, 0, 1, 1] variant was immediately overridden by the second,
# default call, so only one plain call is needed).
plt.tight_layout()
st.pyplot(fig)