Spaces:

Metin
/

DataMiningProjectDemo

Sleeping

File size: 9,916 Bytes

import time

import networkx as nx
import numpy as np
import pandas as pd
import streamlit as st
from src.config import config
from src.embedding import Embedder
from src.utils import (create_graph_from_df, gather_neighbors,
                       get_unique_article_titles)
from src.heuristic import predict_topic_nth_degree
from src.gnn import GNNClassifier, load_data, infer_new_node
from st_link_analysis import EdgeStyle, NodeStyle, st_link_analysis
from src.visualization import get_edge_styles, get_node_styles

import torch

# Configure the browser tab title, a wide page layout and an initially
# expanded sidebar.
st.set_page_config(
    page_title="Semantic Article Graph",
    layout="wide",
    initial_sidebar_state="expanded",
)

def _init_gnn(version, weights_path):
    """Load one GNN variant and cache its artefacts in ``st.session_state``.

    Args:
        version: Dataset variant passed to ``load_data`` (``"undirected"`` or
            ``"no_edge"`` in this script). It is also the prefix of the
            session-state keys written below.
        weights_path: Path of the saved state dict to load into the model.

    Side effects:
        Stores ``<version>_gnn_model``, ``<version>_graph_data``,
        ``<version>_title_to_id`` and ``<version>_label_mapping`` in
        ``st.session_state``.
    """
    graph_data, title_to_id, label_mapping = load_data(version=version)
    model = GNNClassifier(
        input_dim=768,
        hidden_dim=128,
        layers=2,
        # One output unit per known label.
        output_dim=len(label_mapping),
        dropout_rate=0.5,
    )
    model.load_state_dict(
        torch.load(weights_path, map_location=torch.device("cpu"))
    )
    # The app only performs inference, so switch the module to evaluation
    # mode; otherwise train-time layers (the model is built with
    # dropout_rate=0.5) stay active. Assumes GNNClassifier is a
    # torch.nn.Module, which load_state_dict() implies.
    model.eval()

    st.session_state[f"{version}_gnn_model"] = model
    st.session_state[f"{version}_graph_data"] = graph_data
    st.session_state[f"{version}_title_to_id"] = title_to_id
    st.session_state[f"{version}_label_mapping"] = label_mapping


# One-time, per-session initialisation. Streamlit re-executes the script on
# every interaction, so the expensive objects are cached in session state and
# the whole block is skipped once "setup_complete" has been set.
if "setup_complete" not in st.session_state:
    # Placeholder that hosts the progress UI and is cleared when setup ends.
    loader = st.empty()

    with loader.container():
        st.subheader("🚀 Starting...")

        with st.status("Loading...", expanded=True) as status:
            st.write("Initializing Embedding Model...")
            st.session_state.embedder = Embedder(path=config.EMBEDDING_MODEL_PATH)

            st.write("Initializing GNN Model (Undirected)...")
            _init_gnn("undirected", config.GNN_MODEL_PATH)

            st.write("Initializing GNN Model (No Edges)...")
            # NOTE(review): the no-edge weights path is derived from the
            # undirected one by substring replacement. If GNN_MODEL_PATH does
            # not contain "undirected_gnn", str.replace returns it unchanged
            # and both variants load the same weights file.
            _init_gnn(
                "no_edge",
                config.GNN_MODEL_PATH.replace("undirected_gnn", "no_edge_gnn"),
            )

            st.write("Reading training data...")
            training_data = pd.read_parquet(config.TRAINING_DATA_PATH)
            # NOTE(security): eval() executes arbitrary Python found in the
            # "embedding" column. This is only acceptable while the parquet
            # file is a trusted, project-local artefact; prefer
            # ast.literal_eval/json.loads if the column holds plain literals.
            training_data["embedding"] = training_data["embedding"].apply(lambda x: eval(x))
            st.session_state.training_data = training_data

            st.write("Creating graph for visualization...")
            st.session_state.directed_graph = create_graph_from_df(
                training_data, directed=True
            )
            st.session_state.undirected_graph = create_graph_from_df(
                training_data, directed=False
            )

            status.update(label="Done!", state="complete", expanded=False)

        # Keep the "Done!" status visible briefly before clearing the loader.
        time.sleep(0.5)

    loader.empty()
    st.session_state.setup_complete = True


# Style definitions handed to the st_link_analysis graph component.
node_styles = get_node_styles()
edge_styles = get_edge_styles()

# Compute the list of selectable article titles once per session and reuse
# it on later reruns.
if "existing_nodes" not in st.session_state:
    st.session_state.existing_nodes = get_unique_article_titles(
        st.session_state.training_data
    )

# Class names are the keys of the icon mapping defined in the config.
CLASSES = list(config.ICON_MAPPING.keys())


def get_dummy_probabilities():
    """Return placeholder class scores drawn at random.

    One sample is drawn from a flat Dirichlet distribution with one component
    per entry of ``CLASSES``. The result is a DataFrame with the columns
    ``"Class"`` and ``"Score"``, ordered by descending score and limited to
    the first ten rows.
    """
    scores = np.random.dirichlet(np.ones(len(CLASSES)), size=1)[0]
    table = pd.DataFrame({"Class": CLASSES, "Score": scores})
    ranked = table.sort_values(by="Score", ascending=False)
    return ranked.head(10)


# Page header followed by a horizontal rule.
st.title("📄 Semantic Article Graph")
st.markdown("---")

# Two columns with a 1:2 width ratio: the input form and the results view.
col_input, col_vis = st.columns([1, 2], gap="large")

with col_input:
    # --- Step 1: description of the node to add ---
    st.subheader("1. New Node Details")

    new_title = st.text_input("Node Title", placeholder="e.g., Istanbul")
    new_content = st.text_area(
        "Content",
        height=150,
        placeholder="Paste content here...",
    )
    references = st.multiselect(
        "References (Select existing nodes)",
        options=st.session_state.existing_nodes,
        help="Search and select multiple papers this node cites.",
    )

    # --- Step 2: choice of classifier and its options ---
    st.markdown("---")
    st.subheader("2. Methodology Configuration")

    method = st.selectbox(
        "Select Classification Method",
        ["GNN (Graph Neural Network)", "Rule-Based"],
    )

    # Values used when the selected method does not expose the option.
    max_depth = 2
    is_directed = False
    model_params = {}

    if method == "GNN (Graph Neural Network)":
        # Read by the results column only when the GNN method is selected.
        use_edges = st.checkbox("Use Graph Edges", value=True)
    elif method == "Rule-Based":
        max_depth = st.slider("Max Depth", 1, 3, 1)
        is_weighted = st.checkbox("Apply Weights", value=True)
        is_directed = st.checkbox("Use Directed Graph", value=False)
        # Forwarded as keyword arguments to the rule-based predictor.
        model_params = dict(max_depth=max_depth, is_weighted=is_weighted)
    else:
        st.warning("Please select a valid method.")

    st.markdown("---")

    # True only on the rerun triggered by clicking the button.
    run_inference = st.button(
        "Add Node & Run Inference",
        type="primary",
        width="stretch",
    )


with col_vis:
    # Results column: renders the neighborhood graph of the new node and the
    # class predictions once the button is clicked, a placeholder otherwise.

    # A title made only of whitespace is treated as missing, and surrounding
    # whitespace is dropped before the title is used as a node name.
    node_title = new_title.strip()

    if run_inference and not node_title:
        st.error("Please enter a title for the node.")
    elif run_inference:
        # The visualization and the rule-based predictor share the same
        # graph variant, so select it once.
        graph = (
            st.session_state.directed_graph
            if is_directed
            else st.session_state.undirected_graph
        )

        st.subheader("🌐 Graph Neighborhood (k-hop)")

        with st.spinner("Updating Graph Topology..."):
            time.sleep(1)

            graph_container = st.container(border=True)
            with graph_container:
                elements = gather_neighbors(
                    graph, node_title, references, depth=max_depth
                )
                st_link_analysis(elements, "cose", node_styles, edge_styles)
                st.caption(
                    f"Visualizing neighbors for: **{node_title}** with {len(references)} connections."
                )

        st.markdown("---")
        st.subheader("📊 Classification Results")

        with st.spinner(f"Running {method}..."):
            time.sleep(1.5)
            embedding = st.session_state.embedder.generate_embedding(new_content)

            if method == "GNN (Graph Neural Network)":
                # Use the artefacts of the variant matching the checkbox;
                # they were cached under these key prefixes during setup.
                variant = "undirected" if use_edges else "no_edge"
                df_results = infer_new_node(
                    base_data=st.session_state[f"{variant}_graph_data"],
                    model=st.session_state[f"{variant}_gnn_model"],
                    new_embedding=embedding,
                    referenced_titles=references,
                    title_to_id=st.session_state[f"{variant}_title_to_id"],
                    label_mapping=st.session_state[f"{variant}_label_mapping"],
                    device=torch.device("cpu"),
                    make_undirected_for_new_node=not is_directed,
                    use_edges=use_edges,
                )
            elif method == "Rule-Based":
                df_results = predict_topic_nth_degree(
                    new_article_title=node_title,
                    new_article_embedding=embedding,
                    edges=references,
                    G=graph,
                    decay_factor=1.0,
                    **model_params,
                )
            else:
                st.error("Invalid method selected.")
                st.stop()

            if df_results.empty:
                # Without this guard, .iloc[0] below raises IndexError when
                # the predictor returns no rows.
                st.warning("No classification results were returned for this node.")
            else:
                # The first row is reported as the predicted class.
                top_class = df_results.iloc[0]
                st.success(
                    f"**Predicted Class:** {top_class['Class']} ({top_class['Score']:.2%})"
                )

                st.dataframe(
                    df_results,
                    column_config={
                        "Class": "Class Name",
                        "Score": st.column_config.ProgressColumn(
                            "Confidence",
                            help="The model's confidence score",
                            format="%.2f",
                            min_value=0,
                            max_value=1,
                        ),
                    },
                    hide_index=True,
                    width="stretch",
                )

    else:
        st.info(
            "👈 Enter node details on the left and click 'Add' to see the graph and predictions."
        )
        st.markdown(
            """
            <div style="height: 600px; border: 2px dashed #ccc; border-radius: 10px; 
            display: flex; align-items: center; justify-content: center; color: #ccc;">
                Waiting for input...
            </div>
            """,
            unsafe_allow_html=True,
        )