Spaces:
Sleeping
Sleeping
File size: 4,431 Bytes
d97a439 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import networkx as nx
import pandas as pd
def get_unique_article_titles(df: pd.DataFrame) -> list[str]:
    """Return the distinct article titles of *df* in ascending order.

    Args:
        df: DataFrame that has an ``article_title_processed`` column.

    Returns:
        A new list holding each distinct value of that column once, sorted.
    """
    titles = df["article_title_processed"].unique().tolist()
    titles.sort()
    return titles
def create_graph_from_df(df, directed: bool = False) -> nx.Graph:
    """Build an article-reference graph from a DataFrame.

    Every row becomes a node keyed by ``article_title_processed`` carrying a
    ``label`` attribute (taken from ``predicted_topic``) and an ``embedding``
    attribute. An edge is then added from each article to every entry of its
    ``links_processed`` value that is itself a node; links to unknown titles
    and self-links are skipped.

    Args:
        df: DataFrame with the columns ``article_title_processed``,
            ``predicted_topic``, ``embedding`` and ``links_processed``.
            ``links_processed`` is either a string containing a Python list
            literal or an already-materialised iterable of titles.
        directed: When True, return an ``nx.DiGraph`` whose edges point from
            the citing article to the referenced one. When False (default),
            return an undirected ``nx.Graph``.

    Returns:
        The populated graph (``nx.DiGraph`` is a subclass of ``nx.Graph``).

    Raises:
        ValueError, SyntaxError: If a ``links_processed`` string is not a
            valid Python literal.
    """
    import ast  # local import so this function does not depend on a file-level edit

    # Previously the graph was always undirected, so ``directed`` had no effect.
    G = nx.DiGraph() if directed else nx.Graph()

    # Pass 1: register every article first, so that pass 2 can tell whether a
    # link target is a known article regardless of row order.
    for _, row in df.iterrows():
        G.add_node(
            row["article_title_processed"],
            label=row["predicted_topic"],
            embedding=row["embedding"],
        )

    # Pass 2: connect each article to the known articles it references.
    for _, row in df.iterrows():
        node_title = row["article_title_processed"]
        raw_links = row["links_processed"]
        # SECURITY NOTE: this used to be eval(), which executes arbitrary code
        # found in the data file. literal_eval only accepts Python literals.
        if isinstance(raw_links, str):
            references = ast.literal_eval(raw_links)
        else:
            references = raw_links
        for ref in references:
            if ref in G and ref != node_title:
                # On an undirected graph one add_edge already covers both
                # directions, so no explicit reverse edge is needed.
                G.add_edge(node_title, ref)
    return G
def gather_neighbors(
    graph: nx.Graph, node_title: str, references: list[str], depth: int = 1
) -> dict:
    """Return the visualizer neighborhood of an article attached to a copy of *graph*.

    The input graph is not modified: the node and its edges are added to a
    copy, and the neighborhood is extracted from that copy.

    Args:
        graph: Existing article graph (directed or undirected).
        node_title: Title of the article to attach; added as a node if absent.
        references: Titles the article links to. Only titles already present
            in the graph, and different from ``node_title``, produce an edge.
        depth: Number of hops around ``node_title`` to include.

    Returns:
        The ``{"nodes": [...], "edges": [...]}`` dictionary produced by
        ``get_neighbors_for_visualizer``.
    """
    # Work on a copy so the caller's graph keeps its original nodes and edges.
    modified_graph = graph.copy()
    modified_graph.add_node(node_title)
    for ref in references:
        if ref in modified_graph and ref != node_title:
            modified_graph.add_edge(node_title, ref)
    return get_neighbors_for_visualizer(modified_graph, node_title, depth=depth)
def get_neighbors_for_visualizer(graph: nx.Graph, start_node, depth=1):
    """
    Collect the neighborhood of a node and serialise it in a format
    compatible with Cytoscape-style visualizers.

    Args:
        graph (nx.Graph): The source NetworkX graph.
        start_node: The title/ID of the node to start from.
        depth (int): How many hops (degrees of separation) to traverse.

    Returns:
        dict: ``{"nodes": [...], "edges": [...]}``. Each entry is a dict with
        a single ``"data"`` key. Node data holds ``id``, ``label`` and
        ``name``; edge data holds ``id``, ``label``, ``source`` and
        ``target``. Both lists are empty when ``start_node`` is not in
        ``graph``.
    """
    # Unknown start node: nothing to show.
    if start_node not in graph:
        return {"nodes": [], "edges": []}

    ego = nx.ego_graph(graph, start_node, radius=depth)

    # The output format uses 1-based integer IDs instead of node titles.
    id_of = {title: idx for idx, title in enumerate(ego.nodes(), start=1)}

    # Only the 'label' attribute is exported; other node attributes
    # (e.g. 'embedding') are deliberately left out.
    nodes_payload = [
        {
            "data": {
                "id": idx,
                "label": ego.nodes[title].get("label", "Unknown"),
                "name": str(title),
            }
        }
        for title, idx in id_of.items()
    ]

    # Edge IDs continue after the last node ID so that all IDs are unique.
    first_edge_id = len(id_of) + 1
    edges_payload = [
        {
            "data": {
                "id": edge_id,
                "label": ego.edges[src, dst].get("label", "CITES"),
                "source": id_of[src],
                "target": id_of[dst],
            }
        }
        for edge_id, (src, dst) in enumerate(ego.edges(), start=first_edge_id)
    ]

    return {"nodes": nodes_payload, "edges": edges_payload}
if __name__ == "__main__":
    import sys

    # Demo: build the graph from a parquet file and print the depth-2
    # neighborhood of a made-up article. The parquet path may be passed as the
    # first command-line argument; without one, the original development path
    # is used.
    default_path = r"C:\Users\pc\Desktop\Projects\Masters\data_mining\semantic_knowledge_graph\demo\input\train_data_with_embeddings.parquet"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    data = pd.read_parquet(data_path)
    graph = create_graph_from_df(data)
    test_title = "Sample Article Title"
    test_references = ["finansal matematik", "genel yapay zekâ", "andrej karpathy"]
    neighbors = gather_neighbors(graph, test_title, test_references, depth=2)
    print(f"Neighbors of '{test_title}': {neighbors}")
|