# NOTE(review): removed paste artifact ("Spaces:" / "Runtime error" lines). The
# reported runtime error matches the KeyError raised by accessing the missing
# 'source_url' column at the end of crawl_dataset.
#!/usr/bin/env python3
"""
Script to crawl subpages from Tranco URLs:
- Reads URLs from tranco_processed.csv
- Crawls each domain to find up to 10 subpages
- Creates new dataset with subpage URLs and label 0
"""
import logging
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Configure module-wide logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def get_domain(url):
    """Return the scheme + network location of *url*, e.g. 'https://example.com'."""
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)
def is_same_domain(url, base_url):
    """Return True when *url* and *base_url* share the same network location (host[:port])."""
    candidate_host = urlparse(url).netloc
    reference_host = urlparse(base_url).netloc
    return candidate_host == reference_host
def crawl_subpages(base_url, max_subpages=10, timeout=10):
    """
    Crawl a website to find subpages.

    Args:
        base_url: Base URL to crawl
        max_subpages: Maximum number of subpages to collect
        timeout: Request timeout in seconds

    Returns:
        List of up to *max_subpages* same-domain subpage URLs in document
        order (deduplicated, fragments stripped); empty list on any error.
    """
    # Preserve discovery order — the original returned list(set)[:n], which
    # made the selected subpages non-deterministic. `seen` keeps O(1) dedupe.
    subpages = []
    seen = set()
    # Normalize the base URL so "https://x.com" and "https://x.com/" are both
    # recognized as the page we started from and never counted as a subpage.
    base_normalized = base_url.rstrip('/')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Get the main page
        response = requests.get(base_url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all links
        for link in soup.find_all('a', href=True):
            if len(subpages) >= max_subpages:
                break
            # Convert relative URLs to absolute
            full_url = urljoin(base_url, str(link['href']))
            parsed = urlparse(full_url)
            # Skip non-web schemes (mailto:, javascript:, tel:, ...)
            if parsed.scheme not in ('http', 'https'):
                continue
            # Only include URLs from the same domain
            if not is_same_domain(full_url, base_url):
                continue
            # Remove fragments; keep path and query
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.query:
                clean_url += f"?{parsed.query}"
            # Avoid duplicates and the base URL itself (slash-insensitive)
            if clean_url.rstrip('/') != base_normalized and clean_url not in seen:
                seen.add(clean_url)
                subpages.append(clean_url)
        return subpages
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while crawling {base_url}")
        return []
    except requests.exceptions.RequestException as e:
        logger.warning(f"Error crawling {base_url}: {str(e)}")
        return []
    except Exception as e:
        logger.warning(f"Unexpected error crawling {base_url}: {str(e)}")
        return []
def crawl_dataset(input_file, output_file, max_subpages_per_url=10, max_urls=None, delay=1, num_threads=10):
    """
    Crawl all URLs in dataset to find subpages.

    Args:
        input_file: Path to input CSV file (must contain a 'url' column)
        output_file: Path to output CSV file
        max_subpages_per_url: Maximum subpages to collect per URL
        max_urls: Maximum number of URLs to process (None for all)
        delay: Delay between requests in seconds (applied per URL, per worker)
        num_threads: Number of concurrent threads for crawling

    Returns:
        DataFrame of collected subpages with columns 'url' and 'label'
        (label 0 = legitimate).
    """
    # Read input file
    logger.info(f"Reading {input_file}...")
    df = pd.read_csv(input_file)
    if max_urls:
        df = df.head(max_urls)
        logger.info(f"Processing first {max_urls} URLs")
    logger.info(f"Dataset contains {len(df)} URLs")
    logger.info(f"Using {num_threads} threads for concurrent crawling")

    # Collect all subpages
    all_subpages = []
    # extend() below runs only on the main thread, but the lock keeps the
    # accumulation safe if collection ever moves into worker callbacks.
    lock = threading.Lock()

    def process_url(row):
        """Crawl one base URL; return a list of {'url', 'label'} records."""
        base_url = row['url']
        logger.info(f"Crawling {base_url}...")
        subpages = crawl_subpages(base_url, max_subpages=max_subpages_per_url)
        results = []
        if subpages:
            logger.info(f"Found {len(subpages)} subpages for {base_url}")
            for subpage in subpages:
                results.append({
                    'url': subpage,
                    'label': 0,  # Legitimate
                })
        else:
            logger.warning(f"No subpages found for {base_url}")
        # Delay to be respectful to servers
        time.sleep(delay)
        return results

    # Crawling is I/O-bound, so fan the URLs out over a thread pool.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(process_url, row): row['url']
                         for _, row in df.iterrows()}
        # Process completed tasks with progress bar
        with tqdm(total=len(df), desc="Crawling URLs") as pbar:
            for future in as_completed(future_to_url):
                try:
                    results = future.result()
                    with lock:
                        all_subpages.extend(results)
                except Exception as e:
                    url = future_to_url[future]
                    logger.error(f"Error processing {url}: {str(e)}")
                finally:
                    pbar.update(1)

    # Build the output frame; give it explicit columns when nothing was
    # collected so the CSV and downstream code still see the expected schema.
    if all_subpages:
        result_df = pd.DataFrame(all_subpages)
    else:
        result_df = pd.DataFrame(columns=['url', 'label'])

    logger.info(f"\nTotal subpages collected: {len(result_df)}")
    logger.info(f"Saving to {output_file}...")
    # Save to CSV
    result_df.to_csv(output_file, index=False)

    logger.info("Crawling complete!")
    logger.info(f"\nFirst few rows:\n{result_df.head(10)}")
    logger.info(f"\nDataset statistics:")
    logger.info(f"Total URLs: {len(result_df)}")
    # BUG FIX: the original unconditionally read result_df['source_url'], but
    # that column is never written (it is commented out in process_url), so
    # every successful run crashed with KeyError. Report the stat only when
    # the column actually exists.
    if 'source_url' in result_df.columns:
        logger.info(f"Unique source domains: {result_df['source_url'].nunique()}")
    return result_df
| if __name__ == "__main__": | |
| # Define paths | |
| script_dir = os.path.dirname(os.path.abspath(__file__)) | |
| project_root = os.path.dirname(script_dir) | |
| input_file = os.path.join(project_root, 'data', 'raw', 'tranco_processed2.csv') | |
| output_file = os.path.join(project_root, 'data', 'raw', 'tranco_subpages2.csv') | |
| # Crawl dataset | |
| # Process first 100 URLs for testing (remove max_urls=100 to process all) | |
| crawl_dataset( | |
| input_file=input_file, | |
| output_file=output_file, | |
| max_subpages_per_url=10, | |
| # max_urls=100, | |
| delay=1, | |
| num_threads=10 # Adjust based on your needs (10-20 is usually good) | |
| ) | |