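The script below crawls a site with Selenium (headless Chrome, with the driver binary fetched by webdriver-manager), parses each rendered page with BeautifulSoup, and recursively follows same-domain links up to a user-supplied depth, fanning the work out across a ThreadPoolExecutor. A lock protects the shared `visited` set so no URL is scraped twice.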
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Lock guarding the shared visited set across worker threads
visited_lock = threading.Lock()
visited = set()

def get_all_links(url, max_depth, current_depth=0):
    """Scrape one page and return same-domain links not yet visited."""
    if current_depth > max_depth:
        return []

    driver = None
    try:
        print(f"Scraping: {url} at depth {current_depth}")

        # Set up headless Chrome
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate and give the page time to render (adjust the sleep if needed)
        driver.get(url)
        time.sleep(5)

        # Parse the rendered page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Collect links on the same domain that haven't been visited yet
        links = set()
        base_netloc = urlparse(url).netloc
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])
            with visited_lock:
                if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)
        return list(links)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []
    finally:
        # Always close the browser, even if scraping failed
        if driver is not None:
            driver.quit()

def scrape_recursive(urls, max_depth, current_depth, executor):
    """Scrape a batch of URLs in parallel, then recurse into the new links."""
    if current_depth > max_depth:
        return set()

    # Submit one scraping task per URL to the thread pool
    futures = [executor.submit(get_all_links, url, max_depth, current_depth)
               for url in urls]
    all_links = set()
    for future in as_completed(futures):
        try:
            all_links.update(future.result())
        except Exception as e:
            print(f"Error in thread: {e}")

    # Recurse into the newly discovered links, one level deeper
    if current_depth + 1 <= max_depth:
        all_links.update(scrape_recursive(all_links, max_depth,
                                          current_depth + 1, executor))
    return all_links

def main():
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # Mark the starting URL as visited so links back to it are skipped
    visited.add(input_url)

    with ThreadPoolExecutor(max_workers=10) as executor:
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt
    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")
    print(f"\nFound {len(all_links)} links. Saved to links.txt.")

if __name__ == "__main__":
    main()
```
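A few practical notes: the script assumes Chrome itself is installed locally (webdriver-manager only downloads the matching ChromeDriver binary), and the dependencies can be installed with `pip install selenium webdriver-manager beautifulsoup4`. The fixed `time.sleep(5)` is a blunt wait; for pages of varying load times, Selenium's explicit waits are usually a better fit. A minimal sketch, as a drop-in replacement for the sleep (the 10-second timeout and waiting on the first `<a>` tag are assumptions, not part of the original script):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until at least one anchor tag is present, or raise after 10 seconds
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, "a"))
)
```

Launching a fresh browser per URL is also expensive; reusing one driver per worker thread (for example via `threading.local()`) is a natural optimization if the crawl grows beyond a handful of pages.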