# Phishing-Detection-System / scripts/data_collection/crawl_tranco_subpages.py
# Provenance: uploaded by rb1337 ("Upload 50 files", commit 2cc7f91, verified)
#!/usr/bin/env python3
"""
Script to crawl subpages from Tranco URLs:
- Reads URLs from tranco_processed.csv
- Crawls each domain to find up to 10 subpages
- Creates new dataset with subpage URLs and label 0
"""
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import os
from tqdm import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Configure module-wide logging: timestamped, leveled messages on the root handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def get_domain(url):
    """Return only the scheme and host portion of *url* (e.g. ``https://example.com``)."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
def is_same_domain(url, base_url):
    """Return True when *url* and *base_url* share the same network location (host)."""
    candidate_host = urlparse(url).netloc
    base_host = urlparse(base_url).netloc
    return candidate_host == base_host
def crawl_subpages(base_url, max_subpages=10, timeout=10):
    """
    Fetch *base_url* and collect up to *max_subpages* same-site links from it.

    Args:
        base_url: Page to fetch and scan for <a href> links.
        max_subpages: Maximum number of distinct subpage URLs to return.
        timeout: Per-request timeout in seconds.

    Returns:
        List of absolute subpage URLs with fragments stripped; an empty list
        on any request failure (failures are logged, never raised).
    """
    subpages = set()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Get the main page (following redirects).
        response = requests.get(base_url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        # BUG FIX: resolve links against the *final* URL after redirects.
        # The original joined relative hrefs with the pre-redirect base_url,
        # which mis-resolves links whenever the site redirects (for example
        # example.com -> www.example.com).
        final_url = response.url
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            if len(subpages) >= max_subpages:
                break
            # Convert relative URLs to absolute ones.
            full_url = urljoin(final_url, str(link['href']))
            # Only keep URLs on the same host as the page that served the HTML.
            # This also filters out mailto:/javascript:/tel: links (empty netloc).
            if not is_same_domain(full_url, final_url):
                continue
            # Strip the fragment; keep path and query string.
            parsed = urlparse(full_url)
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.query:
                clean_url += f"?{parsed.query}"
            # Skip the page itself, in both its requested and redirected forms
            # (set membership already prevents duplicates).
            if clean_url not in (base_url, final_url):
                subpages.add(clean_url)
        return list(subpages)[:max_subpages]
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while crawling {base_url}")
        return []
    except requests.exceptions.RequestException as e:
        logger.warning(f"Error crawling {base_url}: {str(e)}")
        return []
    except Exception as e:
        logger.warning(f"Unexpected error crawling {base_url}: {str(e)}")
        return []
def crawl_dataset(input_file, output_file, max_subpages_per_url=10, max_urls=None, delay=1, num_threads=10):
    """
    Crawl every URL in *input_file* and write the discovered subpages to CSV.

    Args:
        input_file: Path to input CSV file (must contain a 'url' column).
        output_file: Path to output CSV file ('url' and 'label' columns).
        max_subpages_per_url: Maximum subpages to collect per URL.
        max_urls: Maximum number of URLs to process (None for all).
        delay: Delay between requests in seconds, per worker.
        num_threads: Number of concurrent threads for crawling.

    Returns:
        DataFrame of collected subpages with 'url' and 'label' columns
        (label 0 = legitimate).
    """
    # Read input file
    logger.info(f"Reading {input_file}...")
    df = pd.read_csv(input_file)
    if max_urls:
        df = df.head(max_urls)
        logger.info(f"Processing first {max_urls} URLs")
    logger.info(f"Dataset contains {len(df)} URLs")
    logger.info(f"Using {num_threads} threads for concurrent crawling")
    # Collected result rows; the lock guards extend() against concurrent use.
    all_subpages = []
    lock = threading.Lock()

    def process_url(row):
        """Crawl a single source URL; return one result row per subpage found."""
        base_url = row['url']
        logger.info(f"Crawling {base_url}...")
        subpages = crawl_subpages(base_url, max_subpages=max_subpages_per_url)
        results = []
        if subpages:
            logger.info(f"Found {len(subpages)} subpages for {base_url}")
            for subpage in subpages:
                results.append({
                    'url': subpage,
                    'label': 0,  # Legitimate
                })
        else:
            logger.warning(f"No subpages found for {base_url}")
        # Delay to be respectful to servers
        time.sleep(delay)
        return results

    # Use ThreadPoolExecutor for concurrent crawling
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(process_url, row): row['url']
                         for _, row in df.iterrows()}
        # Process completed tasks with progress bar
        with tqdm(total=len(df), desc="Crawling URLs") as pbar:
            for future in as_completed(future_to_url):
                try:
                    results = future.result()
                    with lock:
                        all_subpages.extend(results)
                except Exception as e:
                    url = future_to_url[future]
                    logger.error(f"Error processing {url}: {str(e)}")
                finally:
                    pbar.update(1)
    # Build output with explicit columns so an empty crawl still writes a CSV
    # with the expected header instead of a column-less file.
    result_df = pd.DataFrame(all_subpages, columns=['url', 'label'])
    logger.info(f"\nTotal subpages collected: {len(result_df)}")
    logger.info(f"Saving to {output_file}...")
    # Save to CSV
    result_df.to_csv(output_file, index=False)
    logger.info("Crawling complete!")
    logger.info(f"\nFirst few rows:\n{result_df.head(10)}")
    logger.info(f"\nDataset statistics:")
    logger.info(f"Total URLs: {len(result_df)}")
    # BUG FIX: the original read result_df['source_url'], but that column is
    # no longer emitted by process_url, so the script crashed with a KeyError
    # *after* finishing the entire crawl. Derive the domain count from the
    # collected URLs instead.
    if len(result_df):
        logger.info(f"Unique domains: {result_df['url'].map(get_domain).nunique()}")
    return result_df
if __name__ == "__main__":
    # Resolve data paths relative to this script's own location.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): this assumes the script sits exactly one directory below
    # the project root. If it actually lives in scripts/data_collection/,
    # project_root resolves to scripts/ and the data/raw paths below point to
    # the wrong place — confirm against the repository layout.
    project_root = os.path.dirname(script_dir)
    input_file = os.path.join(project_root, 'data', 'raw', 'tranco_processed2.csv')
    output_file = os.path.join(project_root, 'data', 'raw', 'tranco_subpages2.csv')
    # Crawl the full dataset; re-enable max_urls below to limit the run for a
    # quick test (it is currently commented out, so ALL URLs are processed).
    crawl_dataset(
        input_file=input_file,
        output_file=output_file,
        max_subpages_per_url=10,
        # max_urls=100,
        delay=1,
        num_threads=10  # Adjust based on your needs (10-20 is usually good)
    )