|
|
import os |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from scipy.stats import gaussian_kde |
|
|
import streamlit as st |
|
|
import math |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Point the STREAMLIT_CONFIG_DIR environment variable at a ".streamlit" folder
# inside the current working directory, and make sure that folder exists.
_config_dir = os.path.join(os.getcwd(), ".streamlit")
os.environ["STREAMLIT_CONFIG_DIR"] = _config_dir
os.makedirs(_config_dir, exist_ok=True)
|
|
|
|
|
# Page-level setup: set the page title, then inject a stylesheet.
st.set_page_config(page_title="OntoLearner Benchmark")

# CSS that stretches the document and Streamlit's block container to the full
# viewport, removing margins and leaving a 1rem inner padding.
_FULL_PAGE_CSS = """
<style>
html, body, .main {
height: 100%;
width: 100%;
margin: 0;
padding: 0;
}
.block-container {
padding: 1rem;
margin: 0;
height: 100%;
width: 100%;
}
</style>
"""

# Raw HTML has to be explicitly allowed for the <style> tag to be emitted.
st.markdown(_FULL_PAGE_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
# Page heading.  The separator had been mis-encoded (it showed up as "β");
# it is restored here as an em dash.
# NOTE(review): confirm whether an en dash was intended instead.
st.title("OntoLearner Benchmark — Ontology Metrics Dashboard")

# Centered project logo: a flex container defined inline plus the <img> tag.
_LOGO_HTML = """
<style>
.logo-container {
display: flex;
justify-content: center;
align-items: center;
margin-top: 10px;
}
</style>
<div class="logo-container">
<img src="https://raw.githubusercontent.com/sciknoworg/OntoLearner/main/images/logo.png" width="300">
</div>
"""

# NOTE(review): only the logo is placed inside the container; the elements
# below are emitted right after it, so the on-page order is unchanged either
# way -- confirm against the original layout if the container is styled.
with st.container():
    st.markdown(_LOGO_HTML, unsafe_allow_html=True)

# Short project description followed by a horizontal rule.
st.markdown(
    """
**OntoLearner** is an automated ontology learning framework designed to extract, structure, and enrich knowledge from textual data. It provides a modular pipeline combining NLP, machine learning, and ontology engineering
principles to generate OWL ontologies with high interpretability and consistency.
"""
)
st.markdown("---")
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data
def load_metrics(path="metrics.xlsx"):
    """Read the ontology metrics workbook into a DataFrame.

    The loader is wrapped in ``st.cache_data`` so the Excel file is parsed
    once per ``path`` rather than on every rerun of this script.

    Args:
        path: Location of the Excel workbook, resolved relative to the
            current working directory when not absolute.

    Returns:
        The workbook's first sheet as a ``pandas.DataFrame``.
    """
    return pd.read_excel(path)


# Source table for every section below.
df = load_metrics()

# NOTE(review): the leading glyph in the heading looks like a mis-encoded
# emoji; it is kept unchanged because the original cannot be recovered.
st.subheader("π Ontology Metrics Table")
st.dataframe(df, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.subheader("π Statistical Summary")

st.write("#### Distribution of ontologies per domain.")

# Number of ontologies per domain, plus each domain's share of the total.
domain_stats = df["Domain"].value_counts().reset_index()
domain_stats.columns = ["Domain", "count"]
domain_stats["percentage"] = (domain_stats["count"] / domain_stats["count"].sum()) * 100

# The seaborn style must be active *before* the axes are created; setting it
# afterwards leaves this figure with the previous style.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))

# Draw explicitly on the axes created above instead of relying on pyplot's
# notion of the "current" axes.
sns.barplot(
    data=domain_stats,
    x="Domain",
    y="count",
    palette="viridis",
    ax=ax,
)

# Label every bar with "<count>\n(<percentage>%)", lifted above the bar top by
# 2% of the tallest bar.  Bars sit at positions 0..n-1 in row order.
label_offset = 0.02 * domain_stats["count"].max()
for i, (count, percentage) in enumerate(
    zip(domain_stats["count"], domain_stats["percentage"])
):
    ax.text(
        i,
        count + label_offset,
        f"{count}\n({percentage:.1f}%)",
        ha="center",
        fontsize=9,
    )

ax.set_title("Number and Percentage of Ontologies per Domain", fontsize=14)
ax.set_xlabel("Domain", fontsize=12)
ax.set_ylabel("Number of Ontologies", fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=90, ha="right")
fig.tight_layout()

st.pyplot(fig)
# The script is re-executed on every rerun; close the figure so pyplot does
# not keep accumulating open figures.
plt.close(fig)
|
|
|
|
|
|
|
|
st.write("#### Statistical summary of key metrics.")

# Columns included in the descriptive-statistics table.
metric_cols = [
    "total_nodes",
    "num_classes",
    "num_properties",
    "num_individuals",
    "avg_depth",
    "avg_breadth",
    "Processing Time (s)",
]
df_metrics = df[metric_cols]

# describe() returns one column per metric; transpose it so every metric
# becomes a row, add the per-metric number of missing values, and round to
# two decimals.
summary = df_metrics.describe().transpose()
summary["missing"] = df_metrics.isna().sum()
summary = summary.round(2)

# Only this subset of the statistics is shown in the dashboard.
_shown_stats = ['mean', 'std', '25%', '50%', '75%', 'max']
slight_summary = summary.loc[:, _shown_stats]
st.dataframe(slight_summary, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Section heading for the complexity-score part of the dashboard.
# NOTE(review): the leading glyphs look like a mis-encoded emoji; left as-is
# because the original character cannot be determined from here.
st.subheader("βοΈ Complexity Score Computation")
|
|
|
|
|
# Metric groups.  Every metric belongs to exactly one group, and each group
# carries a single weight that is applied to all of its members.
graph_metrics = ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"]
coverage_metrics = ["num_classes", "num_properties", "num_individuals"]
hierarchy_metrics = ["max_depth", "min_depth", "avg_depth", "depth_variance"]
breadth_metrics = ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"]
llms4ol_metrics = ["num_term_types", "num_taxonomic_relations", "num_non_taxonomic_relations", "avg_terms"]

# Full, ordered list of metrics that feed the complexity score: the groups
# concatenated in the order above.
metrics = [
    *graph_metrics,
    *coverage_metrics,
    *hierarchy_metrics,
    *breadth_metrics,
    *llms4ol_metrics,
]

# (group, weight) pairs, listed in the same order as the groups appear in
# `metrics` so that `weights` keeps the metric order.
_GROUP_WEIGHTS = (
    (graph_metrics, 0.3),
    (coverage_metrics, 0.25),
    (hierarchy_metrics, 0.10),
    (breadth_metrics, 0.20),
    (llms4ol_metrics, 0.15),
)

# Per-metric weight lookup: metric name -> weight of the group it belongs to.
weights = {
    name: group_weight
    for group, group_weight in _GROUP_WEIGHTS
    for name in group
}
|
|
|
|
|
def log_normalize(x):
    """Return ``log(1 + x)`` for a scalar or array-like ``x``.

    Uses ``numpy.log1p``, so an input of 0 maps to 0.
    """
    compressed = np.log1p(x)
    return compressed
|
|
|
|
|
def complexity_score(onto_metric, a=0.4, b=6.0, eps=1e-12):
    """Compute the complexity score of one ontology.

    Each metric listed in the module-level ``metrics`` is passed through
    ``log_normalize``; the results are combined as a weighted mean using the
    module-level ``weights`` and then squashed with a logistic function.

    Args:
        onto_metric: Mapping from metric name to value; it must provide every
            name in ``metrics``.
        a: Multiplier applied to the centred weighted mean inside the
            logistic function (controls its steepness).
        b: Value subtracted from the weighted mean before scaling (the
            logistic midpoint).
        eps: Small constant added to the exponent.

    Returns:
        ``1 / (1 + exp(-a * (weighted_mean - b) + eps))``.
    """
    log_values = {name: log_normalize(onto_metric[name]) for name in metrics}

    # Weighted mean of the log-scaled metrics, summed in the order of `weights`.
    weighted_total = sum(log_values[name] * weights[name] for name in weights)
    aggregate = weighted_total / sum(weights.values())

    # Logistic squashing of the aggregate.
    exponent = -a * (aggregate - b) + eps
    return 1.0 / (1.0 + np.exp(exponent))
|
|
|
|
|
# One complexity score per ontology; each table row is handed to
# complexity_score as a {column name: value} mapping.
cs = [complexity_score(record) for record in df.to_dict("records")]

# Work on a copy so the source table stays untouched.
df_out = df.copy()
df_out["complexity_score"] = cs

# Rank 1 is the highest score; tied scores share the smallest rank of their
# group.  Rows whose score is NaN (a metric was missing) are ranked last:
# with the default na_option their rank would be NaN and the integer cast
# below would raise.
df_out["complexity_rank"] = (
    df_out["complexity_score"]
    .rank(method="min", ascending=False, na_option="bottom")
    .astype(int)
)

st.write("The following table represents the ontologies with complexity score and their ranking based on this score.")
st.dataframe(df_out, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.subheader("π Complexity Score Visualizations")

# Number of ontologies shown in the ranking panel.
top_n_val = 15

# All panels live on a 3x2 grid; only the first two rows are used here.
fig = plt.figure(figsize=(14, 11))

# --- Panel 1 (top row, full width): highest-scoring ontologies -------------
ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
topn = df_out.sort_values("complexity_score", ascending=False).head(top_n_val)

ax1.barh(topn["Ontology ID"].astype(str), topn["complexity_score"], color="#4C72B0")
ax1.invert_yaxis()  # put the highest score at the top of the chart
ax1.set_title(f"Top {top_n_val} Ontologies by Complexity Score")
ax1.set_xlabel("Complexity Score", fontsize=12)

# --- Panel 2 (middle left): processing time against complexity -------------
ax2 = plt.subplot2grid((3, 2), (1, 0))
ax2.scatter(df_out["complexity_score"], df_out["Processing Time (s)"],
            alpha=0.4, s=60, edgecolor="black", linewidth=0.5, color="#DD8452")

x = df_out["complexity_score"].values
y = df_out["Processing Time (s)"].values

# Polynomial trend line.  np.polyfit cannot handle NaN/inf, so fit only the
# rows where both coordinates are finite, and skip the curve entirely when
# there are not more points than the polynomial degree.
fit_degree = 4
x_num = np.asarray(x, dtype=float)
y_num = np.asarray(y, dtype=float)
finite = np.isfinite(x_num) & np.isfinite(y_num)
if finite.sum() > fit_degree:
    coeffs = np.polyfit(x_num[finite], y_num[finite], fit_degree)
    poly = np.poly1d(coeffs)
    xs = np.linspace(x_num[finite].min(), x_num[finite].max(), 300)
    ax2.plot(xs, poly(xs), color="#DD8452", linewidth=1.5)

ax2.set_title("Processing Time vs Complexity")
ax2.set_xlabel("Complexity Score", fontsize=12)
ax2.set_ylabel("Processing Time (s)", fontsize=12)

# --- Panel 3 (middle right): distribution of the scores --------------------
ax3 = plt.subplot2grid((3, 2), (1, 1))

# Histogram and density curve share the same NaN-free sample.
values = df_out["complexity_score"].dropna().values
if values.size:
    ax3.hist(values, bins=20, color="#55A868", edgecolor="black", alpha=0.8, density=True)

# A kernel density estimate needs at least two distinct values.
if values.size > 1 and np.ptp(values) > 0:
    kde = gaussian_kde(values)
    xx = np.linspace(values.min(), values.max(), 1000)
    ax3.plot(xx, kde(xx), linewidth=1.5, color="green")

ax3.set_title("Distribution of Complexity Scores")
ax3.set_xlabel("Complexity Score", fontsize=12)
ax3.set_ylabel("Density", fontsize=12)

plt.tight_layout(rect=[0, 0, 1, 0.97])

st.pyplot(fig)
# The script is re-executed on every rerun; close the figure so pyplot does
# not keep accumulating open figures.
plt.close(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.subheader("π‘ Domain-Wise Correlations")

# Missing domain labels are dropped first: NaN cannot be ordered against
# strings, so sorting the raw unique values could raise a TypeError.
domains = sorted(df["Domain"].dropna().unique())
n_domains = len(domains)

# Two rows of panels with as many columns as needed (at least one, so the
# figure can still be created when there is no domain at all).
n_rows = 2
n_cols = max(1, math.ceil(n_domains / n_rows))

# squeeze=False guarantees a 2-D array of axes for every grid size.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 3.5, n_rows * 3.5), squeeze=False
)
axes = axes.flatten()

for i, dom in enumerate(domains):
    # Pairwise correlation of the complexity metrics within this domain.
    sub = df[df["Domain"] == dom][metrics]
    corr = sub.corr()

    # A fixed [-1, 1] colour range keeps the colours comparable between
    # panels, which matters because no colour bar is drawn.
    sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, square=True, cbar=False,
                linewidths=0.2, xticklabels=False, yticklabels=False, ax=axes[i])
    axes[i].set_title(dom, fontsize=13)

# Hide the grid cells that did not receive a heatmap.
for unused_ax in axes[n_domains:]:
    unused_ax.axis("off")

plt.tight_layout(rect=[0, 0, 1, 1])
st.pyplot(fig)
# The script is re-executed on every rerun; close the figure so pyplot does
# not keep accumulating open figures.
plt.close(fig)
|
|
|
|
|
# Vertical spacing before the footer.
st.write("\n\n")

# Footer: project links, citation entries and licence.
# - The first BibTeX entry needs a literal backslash in S{\"o}ren.  In a
#   non-raw string literal `\"` collapses to a plain quote, so the backslash
#   is escaped explicitly.
# - Mis-encoded author names ("Sören", "D'Souza") are restored.
# - The licence link had empty link text and rendered as nothing; it now
#   carries a visible label.
# NOTE(review): the glyphs in front of the heading and the first three bullets
# look like mis-encoded emoji; they are kept unchanged because the original
# characters cannot be recovered from here.
st.markdown(
    """
## π Useful Links
- π¦ GitHub Repository: [sciknoworg/OntoLearner](https://github.com/sciknoworg/OntoLearner)
- π Documentation: [ontolearner.readthedocs.io](https://ontolearner.readthedocs.io/)
- π‘ Acknowledgements: OntoLearner is developed and maintained by the **SciKnow Research Group**.

Moreover:

- If you encounter any issues or have questions, please submit them in the [GitHub issues tracker](https://github.com/sciknoworg/OntoLearner/issues).
- If you find this repository helpful or use OntoLearner in your work or research, feel free to cite our publication:

```bibtex
@inproceedings{babaei2023llms4ol,
  title={LLMs4OL: Large language models for ontology learning},
  author={Babaei Giglou, Hamed and D'Souza, Jennifer and Auer, S{\\"o}ren},
  booktitle={International Semantic Web Conference},
  pages={408--427},
  year={2023},
  organization={Springer}
}
```
or:
```bibtex
@software{babaei_giglou_2025_15399783,
  author = {Babaei Giglou, Hamed and D'Souza, Jennifer and Aioanei, Andrei and Mihindukulasooriya, Nandana and Auer, Sören},
  title = {OntoLearner: A Modular Python Library for Ontology Learning with LLMs},
  month = may,
  year = 2025,
  publisher = {Zenodo},
  version = {v1.3.0},
  doi = {10.5281/zenodo.15399783},
  url = {https://doi.org/10.5281/zenodo.15399783},
}
```

------------
This OntoLearner is licensed under [MIT](https://opensource.org/licenses/MIT).
"""
)
|
|
|
|
|
|