# NOTE(review): removed paste artifact ("Spaces:" / "Runtime error" lines). The
# reported runtime error matches the KeyError raised by accessing the missing
# 'source_url' column at the end of crawl_dataset.
#!/usr/bin/env python3
"""
Script to crawl subpages from Tranco URLs:
- Reads URLs from tranco_processed.csv
- Crawls each domain to find up to 10 subpages
- Creates new dataset with subpage URLs and label 0
"""
import logging
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Configure module-wide logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def get_domain(url):
    """Return the scheme + network location of *url*, e.g. 'https://example.com'."""
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)
def is_same_domain(url, base_url):
    """Return True when *url* and *base_url* share the same network location (host[:port])."""
    candidate_host = urlparse(url).netloc
    reference_host = urlparse(base_url).netloc
    return candidate_host == reference_host
def crawl_subpages(base_url, max_subpages=10, timeout=10):
    """
    Crawl a website to find subpages.

    Args:
        base_url: Base URL to crawl
        max_subpages: Maximum number of subpages to collect
        timeout: Request timeout in seconds

    Returns:
        List of up to *max_subpages* same-domain subpage URLs in document
        order (deduplicated, fragments stripped); empty list on any error.
    """
    # Preserve discovery order — the original returned list(set)[:n], which
    # made the selected subpages non-deterministic. `seen` keeps O(1) dedupe.
    subpages = []
    seen = set()
    # Normalize the base URL so "https://x.com" and "https://x.com/" are both
    # recognized as the page we started from and never counted as a subpage.
    base_normalized = base_url.rstrip('/')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Get the main page
        response = requests.get(base_url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all links
        for link in soup.find_all('a', href=True):
            if len(subpages) >= max_subpages:
                break
            # Convert relative URLs to absolute
            full_url = urljoin(base_url, str(link['href']))
            parsed = urlparse(full_url)
            # Skip non-web schemes (mailto:, javascript:, tel:, ...)
            if parsed.scheme not in ('http', 'https'):
                continue
            # Only include URLs from the same domain
            if not is_same_domain(full_url, base_url):
                continue
            # Remove fragments; keep path and query
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.query:
                clean_url += f"?{parsed.query}"
            # Avoid duplicates and the base URL itself (slash-insensitive)
            if clean_url.rstrip('/') != base_normalized and clean_url not in seen:
                seen.add(clean_url)
                subpages.append(clean_url)
        return subpages
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while crawling {base_url}")
        return []
    except requests.exceptions.RequestException as e:
        logger.warning(f"Error crawling {base_url}: {str(e)}")
        return []
    except Exception as e:
        logger.warning(f"Unexpected error crawling {base_url}: {str(e)}")
        return []
def crawl_dataset(input_file, output_file, max_subpages_per_url=10, max_urls=None, delay=1, num_threads=10):
    """
    Crawl all URLs in dataset to find subpages.

    Args:
        input_file: Path to input CSV file (must contain a 'url' column)
        output_file: Path to output CSV file
        max_subpages_per_url: Maximum subpages to collect per URL
        max_urls: Maximum number of URLs to process (None for all)
        delay: Delay between requests in seconds (applied per URL, per worker)
        num_threads: Number of concurrent threads for crawling

    Returns:
        DataFrame of collected subpages with columns 'url' and 'label'
        (label 0 = legitimate).
    """
    # Read input file
    logger.info(f"Reading {input_file}...")
    df = pd.read_csv(input_file)
    if max_urls:
        df = df.head(max_urls)
        logger.info(f"Processing first {max_urls} URLs")
    logger.info(f"Dataset contains {len(df)} URLs")
    logger.info(f"Using {num_threads} threads for concurrent crawling")

    # Collect all subpages
    all_subpages = []
    # extend() below runs only on the main thread, but the lock keeps the
    # accumulation safe if collection ever moves into worker callbacks.
    lock = threading.Lock()

    def process_url(row):
        """Crawl one base URL; return a list of {'url', 'label'} records."""
        base_url = row['url']
        logger.info(f"Crawling {base_url}...")
        subpages = crawl_subpages(base_url, max_subpages=max_subpages_per_url)
        results = []
        if subpages:
            logger.info(f"Found {len(subpages)} subpages for {base_url}")
            for subpage in subpages:
                results.append({
                    'url': subpage,
                    'label': 0,  # Legitimate
                })
        else:
            logger.warning(f"No subpages found for {base_url}")
        # Delay to be respectful to servers
        time.sleep(delay)
        return results

    # Crawling is I/O-bound, so fan the URLs out over a thread pool.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(process_url, row): row['url']
                         for _, row in df.iterrows()}
        # Process completed tasks with progress bar
        with tqdm(total=len(df), desc="Crawling URLs") as pbar:
            for future in as_completed(future_to_url):
                try:
                    results = future.result()
                    with lock:
                        all_subpages.extend(results)
                except Exception as e:
                    url = future_to_url[future]
                    logger.error(f"Error processing {url}: {str(e)}")
                finally:
                    pbar.update(1)

    # Build the output frame; give it explicit columns when nothing was
    # collected so the CSV and downstream code still see the expected schema.
    if all_subpages:
        result_df = pd.DataFrame(all_subpages)
    else:
        result_df = pd.DataFrame(columns=['url', 'label'])

    logger.info(f"\nTotal subpages collected: {len(result_df)}")
    logger.info(f"Saving to {output_file}...")
    # Save to CSV
    result_df.to_csv(output_file, index=False)

    logger.info("Crawling complete!")
    logger.info(f"\nFirst few rows:\n{result_df.head(10)}")
    logger.info(f"\nDataset statistics:")
    logger.info(f"Total URLs: {len(result_df)}")
    # BUG FIX: the original unconditionally read result_df['source_url'], but
    # that column is never written (it is commented out in process_url), so
    # every successful run crashed with KeyError. Report the stat only when
    # the column actually exists.
    if 'source_url' in result_df.columns:
        logger.info(f"Unique source domains: {result_df['source_url'].nunique()}")
    return result_df
| if __name__ == "__main__": | |
| # Define paths | |
| script_dir = os.path.dirname(os.path.abspath(__file__)) | |
| project_root = os.path.dirname(script_dir) | |
| input_file = os.path.join(project_root, 'data', 'raw', 'tranco_processed2.csv') | |
| output_file = os.path.join(project_root, 'data', 'raw', 'tranco_subpages2.csv') | |
| # Crawl dataset | |
| # Process first 100 URLs for testing (remove max_urls=100 to process all) | |
| crawl_dataset( | |
| input_file=input_file, | |
| output_file=output_file, | |
| max_subpages_per_url=10, | |
| # max_urls=100, | |
| delay=1, | |
| num_threads=10 # Adjust based on your needs (10-20 is usually good) | |
| ) | |