Em4e committed on
Commit
9f08712
·
verified ·
1 Parent(s): fbf7416

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -110
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- import grape
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  import plotly.express as px
@@ -25,9 +25,9 @@ st.set_page_config(
25
  if 'www_graph_cache' not in st.session_state:
26
  st.session_state.www_graph_cache = None
27
 
28
- def load_graph_from_csv_grape(file_content, file_name):
29
  """
30
- Load page links from CSV file using Grape.
31
  """
32
  try:
33
  # Read CSV content
@@ -60,20 +60,14 @@ def load_graph_from_csv_grape(file_content, file_name):
60
  all_nodes = list(set(df['FROM'].tolist() + df['TO'].tolist()))
61
  node_to_idx = {node: i for i, node in enumerate(all_nodes)}
62
 
63
- # Create edge list with indices
64
- edge_list = []
 
 
65
  for _, row in df.iterrows():
66
  source_idx = node_to_idx[row['FROM']]
67
  target_idx = node_to_idx[row['TO']]
68
- edge_list.append((source_idx, target_idx))
69
-
70
- # Create Grape graph
71
- G = grape.Graph.from_edge_list(
72
- edge_list=edge_list,
73
- directed=True,
74
- node_names=[str(i) for i in range(len(all_nodes))],
75
- name=f"graph_{file_name}"
76
- )
77
 
78
  return G, all_nodes, node_to_idx
79
 
@@ -82,9 +76,9 @@ def load_graph_from_csv_grape(file_content, file_name):
82
  st.info("💡 **Tip**: Make sure your file is a valid CSV with FROM and TO columns for page links")
83
  return None, None, None
84
 
85
- def create_www_graph_grape(n_nodes, m_edges, seed=42):
86
  """
87
- Create a realistic internet simulation using Grape.
88
  """
89
  cache_key = (n_nodes, m_edges, seed)
90
 
@@ -96,61 +90,31 @@ def create_www_graph_grape(n_nodes, m_edges, seed=42):
96
  random.seed(seed)
97
  np.random.seed(seed)
98
 
99
- # Create Barabási-Albert graph manually since Grape doesn't have this built-in
100
- # Start with a complete graph of m_edges nodes
101
- edges = []
102
- for i in range(m_edges):
103
- for j in range(i + 1, m_edges):
104
- edges.append((i, j))
105
- edges.append((j, i)) # Make it directed
106
-
107
- # Add remaining nodes with preferential attachment
108
- degrees = [2 * m_edges] * m_edges # Initial degrees
109
-
110
- for new_node in range(m_edges, n_nodes):
111
- # Select m_edges nodes to connect to based on preferential attachment
112
- total_degree = sum(degrees)
113
- targets = set()
114
-
115
- while len(targets) < min(m_edges, new_node):
116
- # Probability proportional to degree
117
- rand_val = random.random() * total_degree
118
- cumsum = 0
119
- for i, degree in enumerate(degrees):
120
- cumsum += degree
121
- if cumsum >= rand_val and i not in targets:
122
- targets.add(i)
123
- break
124
-
125
- # Add edges
126
- for target in targets:
127
- edges.append((new_node, target))
128
- edges.append((target, new_node)) # Bidirectional
129
-
130
- # Update degrees
131
- degrees.append(2 * len(targets))
132
- for target in targets:
133
- degrees[target] += 2
134
-
135
- # Create Grape graph
136
- www_graph = grape.Graph.from_edge_list(
137
- edge_list=edges,
138
- directed=True,
139
- node_names=[str(i) for i in range(n_nodes)],
140
- name="www_simulation"
141
- )
142
 
143
  # Cache the result
144
  st.session_state.www_graph_cache = (cache_key, www_graph)
145
  return www_graph
146
 
147
- def process_configuration_grape(www_graph, kalicube_graph, kalicube_nodes,
148
- min_connections=5, max_connections=50):
149
  """
150
- Test how your page network performs in the real internet using Grape.
151
  """
152
  # Get WWW graph info
153
- www_node_count = www_graph.get_number_of_nodes()
154
  kalicube_node_count = len(kalicube_nodes)
155
 
156
  # Create node mapping for kalicube nodes
@@ -161,20 +125,23 @@ def process_configuration_grape(www_graph, kalicube_graph, kalicube_nodes,
161
  new_node_id = kalicube_offset + i
162
  kalicube_node_mapping[node] = new_node_id
163
 
164
- # Get edges from both graphs
165
- www_edges = www_graph.get_edge_list()
166
- kalicube_edges = kalicube_graph.get_edge_list()
167
 
168
- # Convert kalicube edges to use new node IDs
169
- kalicube_mapped_edges = []
170
- kalicube_idx_to_node = {i: node for node, i in kalicube_graph.get_node_name_to_node_id_map().items()}
171
 
172
- for source_idx, target_idx in kalicube_edges:
173
- source_node = kalicube_idx_to_node[source_idx]
174
- target_node = kalicube_idx_to_node[target_idx]
 
 
 
175
  new_source_id = kalicube_node_mapping[source_node]
176
  new_target_id = kalicube_node_mapping[target_node]
177
- kalicube_mapped_edges.append((new_source_id, new_target_id))
178
 
179
  # Randomly connect kalicube pages to WWW
180
  n_connections = min(min_connections, www_node_count, kalicube_node_count)
@@ -182,35 +149,20 @@ def process_configuration_grape(www_graph, kalicube_graph, kalicube_nodes,
182
  www_sample = random.sample(range(www_node_count), n_connections)
183
  kalicube_sample = random.sample(list(kalicube_node_mapping.values()), n_connections)
184
 
185
- connection_edges = []
186
  for www_node, kalicube_node in zip(www_sample, kalicube_sample):
187
- connection_edges.append((www_node, kalicube_node))
188
-
189
- # Combine all edges
190
- all_edges = list(www_edges) + kalicube_mapped_edges + connection_edges
191
- total_nodes = www_node_count + kalicube_node_count
192
-
193
- # Create merged graph
194
- merged_graph = grape.Graph.from_edge_list(
195
- edge_list=all_edges,
196
- directed=True,
197
- node_names=[str(i) for i in range(total_nodes)],
198
- name="merged_simulation"
199
- )
200
 
201
- # Calculate PageRank
202
  try:
203
- pagerank_values = merged_graph.pagerank(
204
- damping_factor=0.85,
205
- maximum_iterations=100,
206
- tolerance=1e-6
207
- )
208
  except Exception as e:
209
  st.warning(f"PageRank calculation failed: {e}. Using degree centrality instead.")
210
  # Fallback to degree centrality
211
- degrees = merged_graph.get_node_degrees()
212
- total_degree = sum(degrees)
213
- pagerank_values = [deg / total_degree if total_degree > 0 else 0 for deg in degrees]
214
 
215
  # Extract PageRank values for kalicube nodes
216
  pagerank_dict = {}
@@ -279,15 +231,15 @@ def run_single_simulation(simulation_id, kalicube_graph_old, kalicube_graph_new,
279
  np.random.seed(sim_seed)
280
 
281
  # Create internet simulation
282
- www_graph = create_www_graph_grape(www_nodes, www_edges, sim_seed)
283
 
284
  # Test original setup
285
- importance_old_dict = process_configuration_grape(
286
  www_graph, kalicube_graph_old, kalicube_nodes_old, min_conn, max_conn
287
  )
288
 
289
  # Test new setup
290
- importance_new_dict = process_configuration_grape(
291
  www_graph, kalicube_graph_new, kalicube_nodes_new, min_conn, max_conn
292
  )
293
 
@@ -393,7 +345,7 @@ def create_simple_visualizations(results_df, all_comparisons_df, confidence_thre
393
  delta="per test")
394
 
395
  def main():
396
- st.title("🔗 Page Link Impact Analyzer (Powered by Grape)")
397
  st.markdown("**Find out if your page link changes will help or hurt your search rankings**")
398
 
399
  # Simple intro
@@ -404,7 +356,7 @@ def main():
404
 
405
  **What you need:** Two CSV files - one with your current page links, one with your planned changes.
406
 
407
- 🍇 **Now powered by Grape** - A high-performance graph library for faster and more efficient analysis!
408
  """)
409
 
410
  # Sidebar - simplified
@@ -470,10 +422,10 @@ def main():
470
  # Load and validate files
471
  with st.spinner("Reading your files..."):
472
  kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = \
473
- load_graph_from_csv_grape(old_content, old_file.name)
474
 
475
  kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = \
476
- load_graph_from_csv_grape(new_content, new_file.name)
477
 
478
  if kalicube_graph_old is not None and kalicube_graph_new is not None:
479
  # Show what we found
@@ -484,14 +436,14 @@ def main():
484
  st.info(f"""
485
  **Current Setup:**
486
  - {len(kalicube_nodes_old)} pages
487
- - {kalicube_graph_old.get_number_of_edges()} links between them
488
  """)
489
 
490
  with info_col2:
491
  st.info(f"""
492
  **Planned Setup:**
493
  - {len(kalicube_nodes_new)} pages
494
- - {kalicube_graph_new.get_number_of_edges()} links between them
495
  """)
496
 
497
  # Big, obvious run button
@@ -652,8 +604,8 @@ def main():
652
  ### 🎯 **Why This Works**
653
  Instead of guessing, you get data-driven confidence about your page link changes!
654
 
655
- ### 🍇 **Powered by Grape**
656
- This version uses Grape, a high-performance graph library that's much faster than traditional tools for analyzing large networks.
657
  """)
658
 
659
  with st.expander("❓ **Common Questions**"):
@@ -662,7 +614,7 @@ def main():
662
  A: The tool shows trends and probabilities, not exact predictions. It's like weather forecasting - very useful for planning!
663
 
664
  **Q: How long does it take?**
665
- A: Usually 30 seconds to 2 minutes, depending on your settings. Grape makes it faster than before!
666
 
667
  **Q: What if I get yellow results?**
668
  A: Yellow means proceed carefully. Consider running more tests, getting expert advice, or monitoring closely if you implement.
@@ -676,8 +628,8 @@ def main():
676
  **Q: What's the difference between pages and websites?**
677
  A: Pages are specific URLs (like mysite.com/about), while websites are domains (like mysite.com). This tool analyzes individual page links.
678
 
679
- **Q: What's new with Grape?**
680
- A: Grape is a high-performance graph library that makes calculations much faster and can handle larger datasets more efficiently than NetworkX.
681
  """)
682
 
683
  if __name__ == "__main__":
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import networkit as nk
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  import plotly.express as px
 
25
  if 'www_graph_cache' not in st.session_state:
26
  st.session_state.www_graph_cache = None
27
 
28
+ def load_graph_from_csv_networkit(file_content, file_name):
29
  """
30
+ Load page links from CSV file using NetworKit.
31
  """
32
  try:
33
  # Read CSV content
 
60
  all_nodes = list(set(df['FROM'].tolist() + df['TO'].tolist()))
61
  node_to_idx = {node: i for i, node in enumerate(all_nodes)}
62
 
63
+ # Create NetworKit graph
64
+ G = nk.Graph(n=len(all_nodes), weighted=False, directed=True)
65
+
66
+ # Add edges
67
  for _, row in df.iterrows():
68
  source_idx = node_to_idx[row['FROM']]
69
  target_idx = node_to_idx[row['TO']]
70
+ G.addEdge(source_idx, target_idx)
 
 
 
 
 
 
 
 
71
 
72
  return G, all_nodes, node_to_idx
73
 
 
76
  st.info("💡 **Tip**: Make sure your file is a valid CSV with FROM and TO columns for page links")
77
  return None, None, None
78
 
79
+ def create_www_graph_networkit(n_nodes, m_edges, seed=42):
80
  """
81
+ Create a realistic internet simulation using NetworKit.
82
  """
83
  cache_key = (n_nodes, m_edges, seed)
84
 
 
90
  random.seed(seed)
91
  np.random.seed(seed)
92
 
93
+ # Create Barabási-Albert graph using NetworKit's generator
94
+ generator = nk.generators.BarabasiAlbertGenerator(k=m_edges, nMax=n_nodes, n0=m_edges)
95
+ generator.setSeed(seed, False)
96
+ www_graph = generator.generate()
97
+
98
+ # Make it directed
99
+ if not www_graph.isDirected():
100
+ # Convert to directed by creating a new directed graph
101
+ directed_graph = nk.Graph(n=www_graph.numberOfNodes(), weighted=False, directed=True)
102
+ for u, v in www_graph.iterEdges():
103
+ directed_graph.addEdge(u, v)
104
+ directed_graph.addEdge(v, u) # Make bidirectional
105
+ www_graph = directed_graph
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # Cache the result
108
  st.session_state.www_graph_cache = (cache_key, www_graph)
109
  return www_graph
110
 
111
+ def process_configuration_networkit(www_graph, kalicube_graph, kalicube_nodes,
112
+ min_connections=5, max_connections=50):
113
  """
114
+ Test how your page network performs in the real internet using NetworKit.
115
  """
116
  # Get WWW graph info
117
+ www_node_count = www_graph.numberOfNodes()
118
  kalicube_node_count = len(kalicube_nodes)
119
 
120
  # Create node mapping for kalicube nodes
 
125
  new_node_id = kalicube_offset + i
126
  kalicube_node_mapping[node] = new_node_id
127
 
128
+ # Create merged graph
129
+ total_nodes = www_node_count + kalicube_node_count
130
+ merged_graph = nk.Graph(n=total_nodes, weighted=False, directed=True)
131
 
132
+ # Add WWW edges
133
+ for u, v in www_graph.iterEdges():
134
+ merged_graph.addEdge(u, v)
135
 
136
+ # Add kalicube edges with new node IDs
137
+ kalicube_idx_to_node = {i: node for i, node in enumerate(kalicube_nodes)}
138
+
139
+ for u, v in kalicube_graph.iterEdges():
140
+ source_node = kalicube_idx_to_node[u]
141
+ target_node = kalicube_idx_to_node[v]
142
  new_source_id = kalicube_node_mapping[source_node]
143
  new_target_id = kalicube_node_mapping[target_node]
144
+ merged_graph.addEdge(new_source_id, new_target_id)
145
 
146
  # Randomly connect kalicube pages to WWW
147
  n_connections = min(min_connections, www_node_count, kalicube_node_count)
 
149
  www_sample = random.sample(range(www_node_count), n_connections)
150
  kalicube_sample = random.sample(list(kalicube_node_mapping.values()), n_connections)
151
 
 
152
  for www_node, kalicube_node in zip(www_sample, kalicube_sample):
153
+ merged_graph.addEdge(www_node, kalicube_node)
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
+ # Calculate PageRank using NetworKit
156
  try:
157
+ pagerank_algo = nk.centrality.PageRank(merged_graph, damp=0.85, tol=1e-6)
158
+ pagerank_algo.run()
159
+ pagerank_values = pagerank_algo.scores()
 
 
160
  except Exception as e:
161
  st.warning(f"PageRank calculation failed: {e}. Using degree centrality instead.")
162
  # Fallback to degree centrality
163
+ degree_algo = nk.centrality.DegreeCentrality(merged_graph, normalized=True)
164
+ degree_algo.run()
165
+ pagerank_values = degree_algo.scores()
166
 
167
  # Extract PageRank values for kalicube nodes
168
  pagerank_dict = {}
 
231
  np.random.seed(sim_seed)
232
 
233
  # Create internet simulation
234
+ www_graph = create_www_graph_networkit(www_nodes, www_edges, sim_seed)
235
 
236
  # Test original setup
237
+ importance_old_dict = process_configuration_networkit(
238
  www_graph, kalicube_graph_old, kalicube_nodes_old, min_conn, max_conn
239
  )
240
 
241
  # Test new setup
242
+ importance_new_dict = process_configuration_networkit(
243
  www_graph, kalicube_graph_new, kalicube_nodes_new, min_conn, max_conn
244
  )
245
 
 
345
  delta="per test")
346
 
347
  def main():
348
+ st.title("🔗 Page Link Impact Analyzer (Powered by NetworKit)")
349
  st.markdown("**Find out if your page link changes will help or hurt your search rankings**")
350
 
351
  # Simple intro
 
356
 
357
  **What you need:** Two CSV files - one with your current page links, one with your planned changes.
358
 
359
+ **Now powered by NetworKit** - A high-performance network analysis toolkit for faster and more efficient analysis!
360
  """)
361
 
362
  # Sidebar - simplified
 
422
  # Load and validate files
423
  with st.spinner("Reading your files..."):
424
  kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = \
425
+ load_graph_from_csv_networkit(old_content, old_file.name)
426
 
427
  kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = \
428
+ load_graph_from_csv_networkit(new_content, new_file.name)
429
 
430
  if kalicube_graph_old is not None and kalicube_graph_new is not None:
431
  # Show what we found
 
436
  st.info(f"""
437
  **Current Setup:**
438
  - {len(kalicube_nodes_old)} pages
439
+ - {kalicube_graph_old.numberOfEdges()} links between them
440
  """)
441
 
442
  with info_col2:
443
  st.info(f"""
444
  **Planned Setup:**
445
  - {len(kalicube_nodes_new)} pages
446
+ - {kalicube_graph_new.numberOfEdges()} links between them
447
  """)
448
 
449
  # Big, obvious run button
 
604
  ### 🎯 **Why This Works**
605
  Instead of guessing, you get data-driven confidence about your page link changes!
606
 
607
+ ### **Powered by NetworKit**
608
+ This version uses NetworKit, a high-performance network analysis toolkit that's much faster than traditional tools for analyzing large networks.
609
  """)
610
 
611
  with st.expander("❓ **Common Questions**"):
 
614
  A: The tool shows trends and probabilities, not exact predictions. It's like weather forecasting - very useful for planning!
615
 
616
  **Q: How long does it take?**
617
+ A: Usually 30 seconds to 2 minutes, depending on your settings. NetworKit makes it faster than before!
618
 
619
  **Q: What if I get yellow results?**
620
  A: Yellow means proceed carefully. Consider running more tests, getting expert advice, or monitoring closely if you implement.
 
628
  **Q: What's the difference between pages and websites?**
629
  A: Pages are specific URLs (like mysite.com/about), while websites are domains (like mysite.com). This tool analyzes individual page links.
630
 
631
+ **Q: What's NetworKit?**
632
+ A: NetworKit is a high-performance network analysis toolkit with optimized C++ algorithms that makes calculations much faster and can handle larger datasets more efficiently.
633
  """)
634
 
635
  if __name__ == "__main__":