Update app.py
app.py
CHANGED
@@ -27,11 +27,16 @@ if 'www_graph_cache' not in st.session_state:
 
 def load_graph_from_csv_networkit(file_content, file_name):
     """
-    Load page links from CSV file using NetworKit.
+    Load page links from CSV file using NetworKit - OPTIMIZED VERSION.
     """
     try:
-        # Read CSV content
-        df = pd.read_csv(StringIO(file_content))
+        # Read CSV content with optimized settings
+        df = pd.read_csv(
+            StringIO(file_content),
+            dtype={'FROM': 'string', 'TO': 'string'},  # Specify types upfront
+            na_filter=True,         # Enable NA filtering
+            skip_blank_lines=True   # Skip empty lines
+        )
 
         # Check required columns with user-friendly names
        required_cols = ['FROM', 'TO']
@@ -47,27 +52,36 @@ def load_graph_from_csv_networkit(file_content, file_name):
             """)
             return None, None, None
 
-        # …
-        …
-        df['FROM'] = df['FROM'].astype(str)
-        df['TO'] = df['TO'].astype(str)
+        # Fast data cleaning - vectorized operations
+        initial_rows = len(df)
+        df = df.dropna(subset=['FROM', 'TO'])  # Remove rows with missing values
 
         if len(df) == 0:
             st.error(f"❌ No valid page links found in {file_name}")
             return None, None, None
 
-        # …
-        …
+        # Show cleaning stats if significant data was removed
+        if initial_rows - len(df) > initial_rows * 0.1:  # More than 10% removed
+            st.warning(f"⚠️ Removed {initial_rows - len(df)} rows with missing data from {file_name}")
+
+        # OPTIMIZED: Get unique nodes using pandas operations (much faster)
+        all_nodes_series = pd.concat([df['FROM'], df['TO']]).drop_duplicates()
+        all_nodes = all_nodes_series.tolist()
+
+        # OPTIMIZED: Create node mapping
         node_to_idx = {node: i for i, node in enumerate(all_nodes)}
 
         # Create NetworKit graph
         G = nk.Graph(n=len(all_nodes), weighted=False, directed=True)
 
-        # …
-        …
-        …
-        …
-        …
+        # OPTIMIZED: Vectorized edge addition (MAJOR SPEEDUP)
+        # Convert node names to indices using vectorized operations
+        source_indices = df['FROM'].map(node_to_idx).values
+        target_indices = df['TO'].map(node_to_idx).values
+
+        # Bulk add edges using numpy arrays (much faster than iterrows)
+        for src_idx, tgt_idx in zip(source_indices, target_indices):
+            G.addEdge(int(src_idx), int(tgt_idx))
 
         return G, all_nodes, node_to_idx
 
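For orientation, a minimal sketch of how the loader above might be called from the Streamlit app; the uploader widget and labels are assumptions for illustration, not part of this commit. Note also that, despite the "bulk add" comment, edges are still inserted one `addEdge` call at a time inside a Python loop; numpy only vectorizes the index lookup.

```python
import streamlit as st

# Hypothetical caller for load_graph_from_csv_networkit (widget label assumed).
uploaded = st.file_uploader("Upload page links (CSV with FROM/TO columns)", type="csv")
if uploaded is not None:
    G, all_nodes, node_to_idx = load_graph_from_csv_networkit(
        uploaded.getvalue().decode("utf-8"), uploaded.name
    )
    if G is not None:
        st.success(f"Loaded {G.numberOfNodes()} pages and {G.numberOfEdges()} links")
```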
@@ -418,14 +432,26 @@ def main():
     if www_nodes >= 500000:
         st.sidebar.warning(f"""
         ⚠️ **Performance Warning**:
-        {internet_size} will be very slow!
-        Expect …
+        {internet_size} with Barabási-Albert will be very slow!
+        Expect 4-15 minutes per test.
         Consider using fewer tests.
         """)
     elif www_nodes >= 250000:
         st.sidebar.info(f"""
-        ℹ️ **Note**: {internet_size} may take
-        30-…
+        ℹ️ **Note**: {internet_size} with Barabási-Albert may take
+        30-90 seconds per test.
+        """)
+
+    # Add Barabási-Albert info
+    with st.sidebar.expander("🔬 About Barabási-Albert Model"):
+        st.markdown("""
+        **Why Barabási-Albert?**
+        - Creates **scale-free networks** like the real web
+        - **Preferential attachment**: Popular pages get more links
+        - **Power-law distribution**: Most realistic web simulation
+        - Slower than other models but much more accurate
+
+        **Perfect for**: Testing how link changes affect rankings in realistic web conditions.
         """)
 
     # Advanced settings (hidden by default)
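The expander above describes the model in prose; a minimal sketch of generating such a graph with NetworKit's built-in Barabási-Albert generator, with assumed parameter values:

```python
import networkit as nk

# Assumed values: k = 5 attachment edges per new node, nMax = 250_000 nodes.
gen = nk.generators.BarabasiAlbertGenerator(5, 250_000)
G = gen.generate()
print(G.numberOfNodes(), G.numberOfEdges())  # roughly 250k nodes, ~1.25M edges
```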
@@ -654,14 +680,16 @@ def main():
     Instead of guessing, you get data-driven confidence about your page link changes!
 
     ### ⚡ **Powered by NetworKit**
-    This version uses NetworKit, a high-performance network analysis toolkit that's much faster than traditional tools for analyzing large networks. It …
+    This version uses NetworKit, a high-performance network analysis toolkit that's much faster than traditional tools for analyzing large networks. It uses the **Barabási-Albert model** to create realistic scale-free networks that mimic the actual structure of the web!
 
-    ### 🔬 **Large-Scale Simulations**
+    ### 🔬 **Large-Scale Barabási-Albert Simulations**
     - **100K sites**: ~10-30 seconds per test
-    - **250K sites**: ~30-…
-    - **500K sites**: ~…
-    - **750K sites**: ~…
-    - **1M sites**: ~…
+    - **250K sites**: ~30-90 seconds per test
+    - **500K sites**: ~2-5 minutes per test
+    - **750K sites**: ~4-8 minutes per test
+    - **1M sites**: ~6-15 minutes per test
+
+    **Note**: Barabási-Albert is more computationally intensive than other generators but produces the most realistic web-like structure with power-law degree distributions.
     """)
 
     with st.expander("❓ **Common Questions**"):
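The power-law claim in the note above is easy to check empirically; a sketch, assuming a smaller test graph:

```python
import networkit as nk

# Generate a Barabási-Albert graph and inspect its degree distribution.
G = nk.generators.BarabasiAlbertGenerator(5, 100_000).generate()
degrees = sorted((G.degree(v) for v in G.iterNodes()), reverse=True)
# Scale-free signature: a few very high-degree hubs, a long tail of low-degree pages.
print(degrees[:5], degrees[-5:])
```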
@@ -685,13 +713,16 @@ def main():
     A: Pages are specific URLs (like mysite.com/about), while websites are domains (like mysite.com). This tool analyzes individual page links.
 
     **Q: What's NetworKit?**
-    A: NetworKit is a high-performance network analysis toolkit with optimized C++ algorithms …
+    A: NetworKit is a high-performance network analysis toolkit with optimized C++ algorithms. This tool specifically uses the **Barabási-Albert model** to generate scale-free networks that accurately represent real web topology.
+
+    **Q: Why Barabási-Albert specifically?**
+    A: The Barabási-Albert model creates "scale-free" networks with preferential attachment - meaning popular pages get more links, just like the real web. This produces the most realistic simulation of how link changes affect rankings.
 
     **Q: Which simulation size should I choose?**
-    A: Start with 100K for testing. Use 250K-500K for realistic results. Only use 750K+ if you have time and want maximum realism.
+    A: Start with 100K for testing. Use 250K-500K for realistic results. Only use 750K+ if you have time and want maximum realism. Larger = more realistic but much slower.
 
-    **Q: Why does …**
-    A: …
+    **Q: Why does Barabási-Albert take longer than other generators?**
+    A: Barabási-Albert builds networks step-by-step with preferential attachment, which is more computationally intensive but produces much more realistic web-like structures than faster alternatives.
     """)
 
 if __name__ == "__main__":
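To make the FAQ's ranking claim concrete, a hedged sketch of a before/after PageRank comparison on a generated graph; the node ids and parameters below are made up for illustration:

```python
import networkit as nk

G = nk.generators.BarabasiAlbertGenerator(5, 100_000).generate()
before = nk.centrality.PageRank(G, damp=0.85).run().scores()

# Hypothetical link change: point page 0 at page 42, then re-rank.
G.addEdge(0, 42)
after = nk.centrality.PageRank(G, damp=0.85).run().scores()
print(f"PageRank shift for page 42: {after[42] - before[42]:+.2e}")
```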