File size: 4,431 Bytes
d97a439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import ast
import sys

import networkx as nx
import pandas as pd


def get_unique_article_titles(df: pd.DataFrame) -> list[str]:
    """Return the distinct values of ``article_title_processed`` in ascending order.

    Args:
        df: Frame that contains an ``article_title_processed`` column.

    Returns:
        A new list holding each distinct title once, sorted.
    """
    titles = df["article_title_processed"].unique().tolist()
    titles.sort()
    return titles


def _parse_references(raw_links):
    """Return the iterable of link titles held in one ``links_processed`` cell.

    String cells are parsed as a Python literal (e.g. ``"['a', 'b']"``).
    Blank strings, ``None`` and NaN (a float) are treated as "no links".
    Any other value is returned unchanged and is expected to be iterable.
    """
    if isinstance(raw_links, str):
        if not raw_links.strip():
            return []
        # NOTE(security): this used to be eval(), which executes arbitrary
        # code found in the data file. literal_eval only accepts Python
        # literals, so a cell that is not a plain literal now raises
        # ValueError/SyntaxError instead of being executed.
        return ast.literal_eval(raw_links)
    if raw_links is None or isinstance(raw_links, float):
        return []
    return raw_links


def create_graph_from_df(df, directed: bool = False) -> nx.Graph:
    """Build an article link graph from a DataFrame.

    Every row becomes a node keyed by ``article_title_processed`` with the
    attributes ``label`` (from ``predicted_topic``) and ``embedding``.
    An edge ``title -> ref`` is added for every entry of the row's
    ``links_processed`` that is itself a node; self-links are skipped.

    Args:
        df: Frame with the columns ``article_title_processed``,
            ``predicted_topic``, ``embedding`` and ``links_processed``.
        directed: When True an ``nx.DiGraph`` is built so edge direction
            (article -> referenced article) is kept. When False (default)
            an undirected ``nx.Graph`` is built.

    Returns:
        The populated graph (``nx.DiGraph`` is a subclass of ``nx.Graph``).
    """
    graph = nx.DiGraph() if directed else nx.Graph()

    # First pass: register every node. The raw link cells are remembered so
    # edges can be added afterwards without iterating the frame a second time.
    pending_links = []
    for _, row in df.iterrows():
        title = row["article_title_processed"]
        graph.add_node(title, label=row["predicted_topic"], embedding=row["embedding"])
        pending_links.append((title, row["links_processed"]))

    # Second pass: all nodes exist now, so links to articles that appear
    # later in the frame are resolved too.
    for title, raw_links in pending_links:
        for ref in _parse_references(raw_links):
            if ref in graph and ref != title:
                graph.add_edge(title, ref)

    return graph


def gather_neighbors(
    graph: nx.Graph, node_title: str, references: list[str], depth: int = 1
) -> dict:
    """Return the visualizer neighborhood of an article linked into ``graph``.

    The node ``node_title`` is added to a copy of ``graph`` and connected to
    every entry of ``references`` that is already a node of the graph
    (self-references are skipped). The neighborhood of ``node_title`` is then
    produced by ``get_neighbors_for_visualizer``.

    Args:
        graph: Existing article graph; it is not modified.
        node_title: Title of the article to attach and explore from.
        references: Titles the article links to.
        depth: Number of hops to include around ``node_title``.

    Returns:
        The ``{"nodes": [...], "edges": [...]}`` dictionary returned by
        ``get_neighbors_for_visualizer``.
    """
    # Work on a copy so the caller's graph stays untouched.
    augmented = graph.copy()
    augmented.add_node(node_title)

    for ref in references:
        if ref in augmented and ref != node_title:
            augmented.add_edge(node_title, ref)

    return get_neighbors_for_visualizer(augmented, node_title, depth=depth)


def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
    """
    Serialize the neighborhood of ``start_node`` as node/edge dictionaries.

    The neighborhood is ``nx.ego_graph(graph, start_node, radius=depth)``.
    Nodes receive consecutive integer ids starting at 1; edge ids continue
    counting after the last node id, so no id is used twice.

    Args:
        graph (nx.Graph): The source NetworkX graph.
        start_node: The title/ID of the node to start from.
        depth (int): Radius, in hops, passed to ``nx.ego_graph``.

    Returns:
        dict: ``{"nodes": [...], "edges": [...]}``. Each entry is
        ``{"data": {...}}``. Node data has ``id``, ``label`` and ``name``;
        edge data has ``id``, ``label``, ``source`` and ``target``.
        Both lists are empty when ``start_node`` is not in ``graph``.
    """
    # Guard: an unknown start node yields an empty payload instead of an error.
    if start_node not in graph:
        return {"nodes": [], "edges": []}

    ego = nx.ego_graph(graph, start_node, radius=depth)

    # Titles are mapped to 1-based integer ids, in the ego graph's node order.
    id_of = {title: idx for idx, title in enumerate(ego.nodes(), start=1)}

    # Only the 'label' attribute is exported; other node attributes
    # (such as 'embedding') are left out of the payload.
    node_entries = [
        {
            "data": {
                "id": idx,
                "label": ego.nodes[title].get("label", "Unknown"),
                "name": str(title),
            }
        }
        for title, idx in id_of.items()
    ]

    # Edge ids start right after the highest node id.
    edge_entries = []
    for edge_id, (src, dst) in enumerate(ego.edges(), start=len(id_of) + 1):
        attrs = ego.edges[src, dst]
        edge_entries.append(
            {
                "data": {
                    "id": edge_id,
                    "label": attrs.get("label", "CITES"),
                    "source": id_of[src],
                    "target": id_of[dst],
                }
            }
        )

    return {"nodes": node_entries, "edges": edge_entries}


if __name__ == "__main__":
    # Manual smoke test: build the graph from a parquet file, attach a sample
    # article with three references and print its 2-hop neighborhood.
    #
    # The parquet path can be given as the first command-line argument; when
    # omitted, the original development path below is used.
    default_path = r"C:\Users\pc\Desktop\Projects\Masters\data_mining\semantic_knowledge_graph\demo\input\train_data_with_embeddings.parquet"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    data = pd.read_parquet(data_path)
    graph = create_graph_from_df(data)

    test_title = "Sample Article Title"
    test_references = ["finansal matematik", "genel yapay zekâ", "andrej karpathy"]

    neighbors = gather_neighbors(graph, test_title, test_references, depth=2)

    print(f"Neighbors of '{test_title}': {neighbors}")