File size: 9,916 Bytes
d97a439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5813aac
d97a439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928a132
d97a439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import time

import networkx as nx
import numpy as np
import pandas as pd
import streamlit as st
from src.config import config
from src.embedding import Embedder
from src.utils import (create_graph_from_df, gather_neighbors,
                       get_unique_article_titles)
from src.heuristic import predict_topic_nth_degree
from src.gnn import GNNClassifier, load_data, infer_new_node
from st_link_analysis import EdgeStyle, NodeStyle, st_link_analysis
from src.visualization import get_edge_styles, get_node_styles

import torch

# Browser-tab title, full-width layout, and a sidebar that starts expanded.
st.set_page_config(
    page_title="Semantic Article Graph",
    layout="wide",
    initial_sidebar_state="expanded",
)

def _load_gnn_bundle(version, weights_path):
    """Load the graph data for ``version`` and a GNN restored from ``weights_path``.

    Args:
        version: Dataset variant forwarded to ``load_data`` (this app uses
            ``"undirected"`` and ``"no_edge"``).
        weights_path: Location of the saved ``state_dict`` for that variant.

    Returns:
        Tuple ``(model, graph_data, title_to_id, label_mapping)``. The model's
        weights are mapped to the CPU and the model is in evaluation mode.
    """
    graph_data, title_to_id, label_mapping = load_data(version=version)
    model = GNNClassifier(
        input_dim=768,
        hidden_dim=128,
        layers=2,
        output_dim=len(label_mapping),
        dropout_rate=0.5,
    )
    model.load_state_dict(torch.load(weights_path, map_location=torch.device("cpu")))
    # This app only runs inference. Switching to eval mode here disables
    # dropout up front, so predictions do not depend on the inference helper
    # remembering to do it.
    model.eval()
    return model, graph_data, title_to_id, label_mapping


if "setup_complete" not in st.session_state:
    # Everything below is loaded once and cached in st.session_state; the flag
    # set at the end makes later script reruns skip this branch.
    loader = st.empty()

    with loader.container():
        st.subheader("🚀 Starting...")

        with st.status("Loading...", expanded=True) as status:
            st.write("Initializing Embedding Model...")
            st.session_state.embedder = Embedder(path=config.EMBEDDING_MODEL_PATH)

            st.write("Initializing GNN Model (Undirected)...")
            (
                st.session_state.undirected_gnn_model,
                st.session_state.undirected_graph_data,
                st.session_state.undirected_title_to_id,
                st.session_state.undirected_label_mapping,
            ) = _load_gnn_bundle("undirected", config.GNN_MODEL_PATH)

            st.write("Initializing GNN Model (No Edges)...")
            # NOTE(review): the no-edge weights path is derived by substring
            # substitution. If GNN_MODEL_PATH does not contain
            # "undirected_gnn" this silently loads the undirected weights
            # again -- confirm against the config.
            no_edge_weights_path = config.GNN_MODEL_PATH.replace(
                "undirected_gnn", "no_edge_gnn"
            )
            (
                st.session_state.no_edge_gnn_model,
                st.session_state.no_edge_graph_data,
                st.session_state.no_edge_title_to_id,
                st.session_state.no_edge_label_mapping,
            ) = _load_gnn_bundle("no_edge", no_edge_weights_path)

            st.write("Reading training data...")
            training_data = pd.read_parquet(config.TRAINING_DATA_PATH)
            # SECURITY NOTE(review): eval() executes whatever expression is
            # stored in the "embedding" column, so this is only acceptable
            # while TRAINING_DATA_PATH is a trusted artifact. If the cells are
            # plain list literals, ast.literal_eval (or storing a native list
            # column) would be a safe replacement -- verify the stored format
            # before switching. The lambda is kept so eval resolves names in
            # this module's globals.
            training_data["embedding"] = training_data["embedding"].apply(lambda x: eval(x))
            st.session_state.training_data = training_data

            st.write("Creating graph for visualization...")
            st.session_state.directed_graph = create_graph_from_df(
                training_data, directed=True
            )
            st.session_state.undirected_graph = create_graph_from_df(
                training_data, directed=False
            )

            status.update(label="Done!", state="complete", expanded=False)

        # Short pause before the loader placeholder is cleared.
        time.sleep(0.5)

    loader.empty()
    st.session_state.setup_complete = True


# Node/edge styles handed to the st_link_analysis widget further down.
node_styles, edge_styles = get_node_styles(), get_edge_styles()

# Cache the selectable article titles in the session the first time through.
if "existing_nodes" not in st.session_state:
    st.session_state.existing_nodes = get_unique_article_titles(
        st.session_state.training_data
    )

# Class names are the keys of the icon mapping from the project config.
CLASSES = [*config.ICON_MAPPING.keys()]


def get_dummy_probabilities():
    """Return a placeholder score table with one random score per class.

    The scores are a single draw from a Dirichlet distribution with all
    concentration parameters equal to one, one entry per item in ``CLASSES``.
    The result is a DataFrame with columns ``Class`` and ``Score`` holding at
    most the ten highest-scoring rows, ordered from highest to lowest score.
    """
    concentration = np.ones(len(CLASSES))
    scores = np.random.dirichlet(concentration, size=1)[0]
    table = pd.DataFrame({"Class": CLASSES, "Score": scores})
    ranked = table.sort_values(by="Score", ascending=False)
    return ranked.head(10)


st.title("📄 Semantic Article Graph")
st.markdown("---")

# Two-column layout: the input form on the left, and a column twice as wide
# on the right for the graph and the classification results.
col_input, col_vis = st.columns([1, 2], gap="large")

with col_input:
    st.subheader("1. New Node Details")

    new_title = st.text_input("Node Title", placeholder="e.g., Istanbul")
    new_content = st.text_area(
        "Content", height=150, placeholder="Paste content here..."
    )

    references = st.multiselect(
        "References (Select existing nodes)",
        options=st.session_state.existing_nodes,
        help="Search and select multiple papers this node cites.",
    )

    st.markdown("---")
    st.subheader("2. Methodology Configuration")

    method = st.selectbox(
        "Select Classification Method",
        ["GNN (Graph Neural Network)", "Rule-Based"],
    )

    # Defaults for every method-specific option. Each branch below only
    # renders (and therefore only rebinds) its own widgets, so code that runs
    # afterwards can read any of these names without hitting a NameError.
    model_params = {}
    is_directed = False
    max_depth = 2
    use_edges = True
    is_weighted = True

    if method == "GNN (Graph Neural Network)":
        use_edges = st.checkbox("Use Graph Edges", value=True)

    elif method == "Rule-Based":
        max_depth = st.slider("Max Depth", 1, 3, 1)
        is_weighted = st.checkbox("Apply Weights", value=True)
        is_directed = st.checkbox("Use Directed Graph", value=False)
        model_params = {"max_depth": max_depth, "is_weighted": is_weighted}
    else:
        st.warning("Please select a valid method.")

    st.markdown("---")

    # True only on the rerun triggered by clicking the button.
    run_inference = st.button(
        "Add Node & Run Inference", type="primary", width="stretch"
    )


with col_vis:
    if run_inference:
        # A title made only of whitespace is treated the same as no title.
        if not new_title.strip():
            st.error("Please enter a title for the node.")
        else:
            # The same graph is used for the neighbourhood view and for the
            # rule-based predictor, so pick it once.
            graph = (
                st.session_state.directed_graph
                if is_directed
                else st.session_state.undirected_graph
            )

            st.subheader("🌐 Graph Neighborhood (k-hop)")

            with st.spinner("Updating Graph Topology..."):
                time.sleep(1)

                graph_container = st.container(border=True)
                with graph_container:
                    elements = gather_neighbors(
                        graph, new_title, references, depth=max_depth
                    )
                    st_link_analysis(elements, "cose", node_styles, edge_styles)
                    st.caption(
                        f"Visualizing neighbors for: **{new_title}** with {len(references)} connections."
                    )

            st.markdown("---")
            st.subheader("📊 Classification Results")

            with st.spinner(f"Running {method}..."):
                time.sleep(1.5)
                embedding = st.session_state.embedder.generate_embedding(new_content)
                if method == "GNN (Graph Neural Network)":
                    # Setup stores both GNN variants under parallel
                    # session-state keys that differ only in this prefix.
                    variant = "undirected" if use_edges else "no_edge"
                    df_results = infer_new_node(
                        base_data=st.session_state[f"{variant}_graph_data"],
                        model=st.session_state[f"{variant}_gnn_model"],
                        new_embedding=embedding,
                        referenced_titles=references,
                        title_to_id=st.session_state[f"{variant}_title_to_id"],
                        label_mapping=st.session_state[f"{variant}_label_mapping"],
                        device=torch.device("cpu"),
                        make_undirected_for_new_node=not is_directed,
                        use_edges=use_edges,
                    )
                elif method == "Rule-Based":
                    df_results = predict_topic_nth_degree(
                        new_article_title=new_title,
                        new_article_embedding=embedding,
                        edges=references,
                        G=graph,
                        decay_factor=1.0,
                        **model_params,
                    )
                else:
                    st.error("Invalid method selected.")
                    st.stop()

                # Guard against an empty result so .iloc[0] cannot raise.
                if df_results is None or df_results.empty:
                    st.warning("No prediction could be produced for this node.")
                else:
                    # Assumes the predictor returns rows ordered best-first
                    # with "Class" and "Score" columns -- TODO confirm.
                    top_class = df_results.iloc[0]
                    st.success(
                        f"**Predicted Class:** {top_class['Class']} ({top_class['Score']:.2%})"
                    )

                    st.dataframe(
                        df_results,
                        column_config={
                            "Class": "Class Name",
                            "Score": st.column_config.ProgressColumn(
                                "Confidence",
                                help="The model's confidence score",
                                format="%.2f",
                                min_value=0,
                                max_value=1,
                            ),
                        },
                        hide_index=True,
                        width="stretch",
                    )

    else:
        # Idle state: show a hint and a dashed placeholder where the graph
        # and the results will appear.
        st.info(
            "👈 Enter node details on the left and click 'Add' to see the graph and predictions."
        )
        st.markdown(
            """
            <div style="height: 600px; border: 2px dashed #ccc; border-radius: 10px; 
            display: flex; align-items: center; justify-content: center; color: #ccc;">
                Waiting for input...
            </div>
            """,
            unsafe_allow_html=True,
        )