# =====================
# IMPORTS
# =====================
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from scipy.stats import gaussian_kde

# =====================
# STREAMLIT CONFIG
# =====================
# Point Streamlit at a project-local config directory so the app neither
# depends on nor pollutes the user's home-directory configuration.
os.environ["STREAMLIT_CONFIG_DIR"] = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(os.environ["STREAMLIT_CONFIG_DIR"], exist_ok=True)

st.set_page_config(page_title="OntoLearner Benchmark")
st.title("OntoLearner Benchmark – Ontology Metrics Dashboard")

# About banner at the top of the page.
with st.container():
    st.markdown(
        """
**OntoLearner** is an automated ontology learning framework designed to extract,
structure, and enrich knowledge from textual data. It provides a modular pipeline
combining NLP, machine learning, and ontology engineering principles to generate
OWL ontologies with high interpretability and consistency.
"""
    )
st.markdown("---")

# =====================
# LOAD DATA
# =====================
# NOTE(review): reading .xlsx requires the `openpyxl` engine at runtime.
df = pd.read_excel("metrics.xlsx")

st.subheader("πŸ“˜ Ontology Metrics Table")
st.dataframe(df, use_container_width=True)

# =====================
# SUMMARY STATISTICS
# =====================
st.subheader("πŸ“Š Statistical Summary")
st.write("#### Distribution of ontologies per domain.")

# Absolute count and percentage share of ontologies per domain.
domain_stats = df["Domain"].value_counts().reset_index()
domain_stats.columns = ["Domain", "count"]
domain_stats["percentage"] = (domain_stats["count"] / domain_stats["count"].sum()) * 100

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))
# FIX: draw on the explicitly created axes — the original created `fig, ax`
# but let seaborn draw on the implicit "current" axes and rebound `ax`.
sns.barplot(data=domain_stats, x="Domain", y="count", palette="viridis", ax=ax)

# Annotate every bar with its count and percentage, slightly above the bar top.
label_offset = 0.02 * domain_stats["count"].max()
for i, row in domain_stats.iterrows():
    ax.text(
        i,
        row["count"] + label_offset,
        f"{row['count']}\n({row['percentage']:.1f}%)",
        ha="center",
        fontsize=9,
    )

ax.set_title("Number and Percentage of Ontologies per Domain", fontsize=14)
ax.set_xlabel("Domain", fontsize=12)
ax.set_ylabel("Number of Ontologies", fontsize=12)
ax.tick_params(axis="x", rotation=90)
fig.tight_layout()
st.pyplot(fig)

st.write("#### Statistical summary of key metrics.")
metric_cols = [
    "total_nodes",
    "num_classes",
    "num_properties",
    "num_individuals",
    "avg_depth",
    "avg_breadth",
    "Processing Time (s)",
]
df_metrics = df[metric_cols]
summary = df_metrics.describe().T
summary["missing"] = df_metrics.isnull().sum()
summary = summary.round(2)
# Show only the most informative columns of the describe() output.
slight_summary = summary[["mean", "std", "25%", "50%", "75%", "max"]]
st.dataframe(slight_summary, use_container_width=True)
# =====================
# COMPLEXITY SCORE
# =====================
st.subheader("βš™οΈ Complexity Score Computation")

# Metric groups contributing to the composite complexity score.
graph_metrics = ["total_nodes", "total_edges", "num_root_nodes", "num_leaf_nodes"]
coverage_metrics = ["num_classes", "num_properties", "num_individuals"]
hierarchy_metrics = ["max_depth", "min_depth", "avg_depth", "depth_variance"]
breadth_metrics = ["max_breadth", "min_breadth", "avg_breadth", "breadth_variance"]
llms4ol_metrics = [
    "num_term_types",
    "num_taxonomic_relations",
    "num_non_taxonomic_relations",
    "avg_terms",
]

# FIX: derive the full metric list from the group lists instead of
# duplicating every name in a second hand-written list (kept in the same
# order as the original).
metrics = (
    graph_metrics
    + coverage_metrics
    + hierarchy_metrics
    + breadth_metrics
    + llms4ol_metrics
)

# Per-metric weight, assigned per (disjoint) group. Replaces the original
# loop of chained membership tests with a direct dict build.
weights = {}
for group, group_weight in (
    (graph_metrics, 0.3),
    (coverage_metrics, 0.25),
    (hierarchy_metrics, 0.10),
    (breadth_metrics, 0.20),
    (llms4ol_metrics, 0.15),
):
    for m in group:
        weights[m] = group_weight


def log_normalize(x):
    """Compress heavy-tailed raw counts with log(1 + x)."""
    return np.log1p(x)


def complexity_score(onto_metric, a=0.4, b=6.0, eps=1e-12):
    """Composite complexity score in (0, 1) for a single ontology.

    Each raw metric is log-normalized, combined into a weighted mean, then
    squashed through a logistic curve centred at ``b`` with steepness ``a``.

    Args:
        onto_metric: Mapping (e.g. a DataFrame row as a dict) containing every
            key listed in ``metrics``; a missing key raises ``KeyError``.
        a: Logistic steepness.
        b: Logistic midpoint, in weighted-log-mean units.
        eps: Tiny additive term inside the exponent. NOTE(review): at its
            default value this is numerically a no-op; it is kept only for
            backward compatibility of the signature.

    Returns:
        float score in the open interval (0, 1).
    """
    norm_metric = {m: log_normalize(onto_metric[m]) for m in metrics}
    weighted = {m: norm_metric[m] * weights[m] for m in weights}
    c_score = sum(weighted.values()) / sum(weights.values())
    return 1.0 / (1.0 + np.exp(-a * (c_score - b) + eps))


# Score every ontology and rank them (rank 1 = most complex).
cs = [complexity_score(dict(row)) for _, row in df.iterrows()]
df_out = df.copy()
df_out["complexity_score"] = cs
df_out["complexity_rank"] = (
    df_out["complexity_score"].rank(method="min", ascending=False).astype(int)
)

st.write(
    "The following table represents the ontologies with complexity score "
    "and their ranking based on this score."
)
st.dataframe(df_out, use_container_width=True)

# =====================
# VISUALIZATION
# =====================
st.subheader("πŸ“ˆ Complexity Score Visualizations")
top_n_val = 15
fig = plt.figure(figsize=(14, 11))

# PANEL 1 — TOP N BY COMPLEXITY
ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
topn = df_out.sort_values("complexity_score", ascending=False).head(top_n_val)
ax1.barh(topn["Ontology ID"].astype(str), topn["complexity_score"], color="#4C72B0")
ax1.invert_yaxis()  # highest score on top
ax1.set_title(f"Top {top_n_val} Ontologies by Complexity Score")
ax1.set_xlabel("Complexity Score", fontsize=12)

# PANEL 2 — PROCESSING TIME VS COMPLEXITY
ax2 = plt.subplot2grid((3, 2), (1, 0))
x = df_out["complexity_score"].values
y = df_out["Processing Time (s)"].values
ax2.scatter(x, y, alpha=0.4, s=60, edgecolor="black", linewidth=0.5, color="#DD8452")
# FIX: cap the polynomial degree by the sample count — the original
# unconditional degree-4 fit raises when fewer than 5 ontologies are loaded.
deg = min(4, max(1, len(x) - 1))
poly = np.poly1d(np.polyfit(x, y, deg))
xs = np.linspace(x.min(), x.max(), 300)
ax2.plot(xs, poly(xs), color="#DD8452", linewidth=1.5)
ax2.set_title("Processing Time vs Complexity")
ax2.set_xlabel("Complexity Score", fontsize=12)
ax2.set_ylabel("Processing Time (s)", fontsize=12)

# PANEL 3 — DISTRIBUTION (histogram + KDE overlay)
ax3 = plt.subplot2grid((3, 2), (1, 1))
ax3.hist(
    df_out["complexity_score"],
    bins=20,
    color="#55A868",
    edgecolor="black",
    alpha=0.8,
    density=True,
)
values = df_out["complexity_score"].dropna().values
kde = gaussian_kde(values)
xx = np.linspace(values.min(), values.max(), 1000)
ax3.plot(xx, kde(xx), linewidth=1.5, color="green")
ax3.set_title("Distribution of Complexity Scores")
ax3.set_xlabel("Complexity Score", fontsize=12)
ax3.set_ylabel("Density", fontsize=12)

plt.tight_layout(rect=[0, 0, 1, 0.97])
st.pyplot(fig)

# =====================
# CORRELATIONS
# =====================
st.subheader("πŸ“‘ Domain-Wise Correlations")

domains = sorted(df["Domain"].unique())
n_domains = len(domains)
n_rows = 2
n_cols = math.ceil(n_domains / n_rows)
# FIX: reuse the already-computed n_cols (the original recomputed it inline
# with np.ceil) and force a 2-D axes array with squeeze=False so ravel()
# works even when there is a single subplot.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 3.5, n_rows * 3.5), squeeze=False
)
axes = axes.ravel()

# One compact (unlabelled) metric-correlation heatmap per domain.
for i, dom in enumerate(domains):
    corr = df[df["Domain"] == dom][metrics].corr()
    sns.heatmap(
        corr,
        cmap="coolwarm",
        square=True,
        cbar=False,
        linewidths=0.2,
        xticklabels=False,
        yticklabels=False,
        ax=axes[i],
    )
    axes[i].set_title(dom, fontsize=13)

# Blank out any unused grid cells.
for j in range(n_domains, len(axes)):
    axes[j].axis("off")

plt.tight_layout(rect=[0, 0, 1, 1])
st.pyplot(fig)

st.write("\n\n")
st.markdown(
    """
## πŸ”— Useful Links

- πŸ“¦ GitHub Repository: [sciknoworg/OntoLearner](https://github.com/sciknoworg/OntoLearner)
- πŸ“š Documentation: [ontolearner.readthedocs.io](https://ontolearner.readthedocs.io/)
- πŸ’‘ Acknowledgements: OntoLearner is developed and maintained by the **SciKnow Research Group**.

Moreover:

- If you encounter any issues or have questions, please submit them in the [GitHub issues tracker](https://github.com/sciknoworg/OntoLearner/issues).
- If you find this repository helpful or use OntoLearner in your work or research, feel free to cite our publication:

```bibtex
@inproceedings{babaei2023llms4ol,
    title={LLMs4OL: Large language models for ontology learning},
    author={Babaei Giglou, Hamed and D’Souza, Jennifer and Auer, S{\"o}ren},
    booktitle={International Semantic Web Conference},
    pages={408--427},
    year={2023},
    organization={Springer}
}
```

or:

```bibtex
@software{babaei_giglou_2025_15399783,
  author       = {Babaei Giglou, Hamed and D'Souza, Jennifer and Aioanei, Andrei and Mihindukulasooriya, Nandana and Auer, SΓΆren},
  title        = {OntoLearner: A Modular Python Library for Ontology Learning with LLMs},
  month        = may,
  year         = 2025,
  publisher    = {Zenodo},
  version      = {v1.3.0},
  doi          = {10.5281/zenodo.15399783},
  url          = {https://doi.org/10.5281/zenodo.15399783},
}
```

------------

This OntoLearner is licensed under [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT).
"""
)