import json
import logging
import threading
import time
import traceback
from functools import wraps
from queue import Empty, Queue
from urllib.parse import urljoin, urlparse

from flask import Flask, request, jsonify
from scrapy import Spider, Request
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks, returnValue, Deferred

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Module-level fallback queue. Request handlers should pass their own queue to
# the run_* helpers so that concurrent or timed-out requests cannot receive
# each other's results; this shared queue is only used when none is supplied.
result_queue = Queue()

# How long (in seconds) the /scrape endpoint waits for the crawl to finish.
SCRAPE_TIMEOUT_SECONDS = 60


def _is_http_url(value):
    """Return True if *value* is a string holding an absolute http(s) URL."""
    if not isinstance(value, str):
        return False
    parts = urlparse(value)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)


class URLSpider(Spider):
    """Collect same-domain links reachable from a start URL, up to a limit."""

    name = 'url_spider'
    # Retained only so existing references to ``URLSpider.found_urls`` keep
    # resolving. Every instance shadows it with its own set in ``__init__`` so
    # that separate crawls no longer share (and pollute) one global set.
    found_urls = set()

    def __init__(self, start_url=None, max_urls=10, *args, found_urls=None, **kwargs):
        """Set up the crawl.

        Args:
            start_url: Page the crawl starts from; its network location is the
                only one whose links are collected.
            max_urls: Maximum number of distinct URLs to collect.
            found_urls: Optional set that collected URLs are added to. The
                caller can keep a reference to it to read the results after
                the crawl. A fresh set is created when omitted.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url] if start_url else []
        self.allowed_domain = urlparse(start_url or '').netloc
        self.max_urls = max_urls
        self.url_count = 0
        self.found_urls = set() if found_urls is None else found_urls
        logger.info(f"Starting spider for URL: {start_url} with max_urls={max_urls}")

    def start_requests(self):
        """Yield the initial requests, routing failures to ``handle_error``."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)

    def handle_error(self, failure):
        """Log a failed request; the crawl continues with the other requests."""
        logger.error(f"Request failed: {failure.value}")
        return None

    def parse(self, response):
        """Record same-domain links on *response* and follow them.

        Stops recording and following as soon as ``max_urls`` distinct URLs
        have been collected.
        """
        # Best-effort: any error while handling one page is logged and the
        # page is skipped instead of propagating out of the callback.
        try:
            if self.url_count >= self.max_urls:
                logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
                return

            links = response.css('a::attr(href)').getall()
            logger.info(f"Found {len(links)} links on {response.url}")

            for link in links:
                if self.url_count >= self.max_urls:
                    return

                absolute_url = urljoin(response.url, link)
                if urlparse(absolute_url).netloc != self.allowed_domain:
                    continue
                if absolute_url in self.found_urls:
                    continue

                self.found_urls.add(absolute_url)
                self.url_count += 1
                logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")

                # Only follow while there is still room for more URLs.
                if self.url_count < self.max_urls:
                    logger.info(f"Following link: {absolute_url}")
                    yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
        except Exception as e:
            logger.exception(f"Error in parse method: {str(e)}")


def run_spider(url, max_urls, result_sink=None):
    """Start a URLSpider crawl and publish its outcome to a queue.

    Intended to be called in the reactor thread (e.g. through
    ``reactor.callFromThread``).

    Args:
        url: Start URL for the crawl.
        max_urls: Maximum number of URLs to collect.
        result_sink: Queue that receives exactly one result dict when the
            crawl ends. Defaults to the module-level ``result_queue``.

    Returns:
        The crawl's Deferred, or None if the crawl could not be started.
        On success the published dict has ``status``, ``urls`` and ``count``;
        on failure it has ``status``, ``error`` and an empty ``urls`` list.
    """
    sink = result_queue if result_sink is None else result_sink
    # Per-crawl result set handed to the spider, so that results of different
    # crawls are never mixed together.
    found_urls = set()

    def report_error(message):
        sink.put({'status': 'error', 'error': message, 'urls': []})

    def crawler_callback(result):
        # Sorted so the response is deterministic for a given set of URLs.
        urls = sorted(found_urls)
        logger.info(f"Crawling completed. Found {len(urls)} URLs.")
        sink.put({'status': 'success', 'urls': urls, 'count': len(urls)})
        return result

    def crawler_errback(failure):
        logger.error(f"Crawl failed: {failure.value}")
        report_error(str(failure.value))

    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)

        logger.info("Starting crawler...")
        deferred = runner.crawl(
            URLSpider,
            start_url=url,
            max_urls=max_urls,
            found_urls=found_urls,
        )
    except Exception as e:
        logger.exception(f"Error in run_spider: {str(e)}")
        report_error(str(e))
        return None

    deferred.addCallback(crawler_callback)
    # Added after the callback so it also reports a failure inside it.
    deferred.addErrback(crawler_errback)
    return deferred


@app.route('/scrape', methods=['POST'])
def scrape_url():
    """Crawl a site and return the same-domain URLs that were found.

    Expects a JSON object with ``url`` (required, absolute http(s) URL) and
    ``max_urls`` (optional positive integer, default 50). Responds with 400
    on invalid input and 500 on crawl failure or timeout.
    """
    try:
        # silent=True yields None for a missing/malformed body instead of
        # raising, so bad input is answered with 400 rather than 500.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400

        url = data.get('url')
        max_urls = data.get('max_urls', 50)

        if not url:
            logger.error("No URL provided in request")
            return jsonify({'error': 'URL is required'}), 400
        if not _is_http_url(url):
            logger.error(f"Invalid URL provided in request: {url!r}")
            return jsonify({'error': 'URL must be an absolute http(s) URL'}), 400
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(max_urls, bool) or not isinstance(max_urls, int) or max_urls <= 0:
            logger.error(f"Invalid max_urls provided in request: {max_urls!r}")
            return jsonify({'error': 'max_urls must be a positive integer'}), 400

        logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")

        # A private queue per request: a late result from a timed-out crawl
        # or a result of a concurrent request can never be delivered here.
        reply_queue = Queue()

        # Run the spider in the reactor thread
        reactor.callFromThread(run_spider, url, max_urls, reply_queue)

        # Wait for results with timeout
        try:
            result = reply_queue.get(timeout=SCRAPE_TIMEOUT_SECONDS)
        except Empty:
            logger.error(f"Timeout waiting for results after {SCRAPE_TIMEOUT_SECONDS}s")
            return jsonify({'error': 'Scraping timed out'}), 500

        if 'error' in result:
            logger.error(f"Scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.exception(f"Error during scraping: {str(e)}")
        return jsonify({'error': str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always answers ``{"status": "ok"}``."""
    return jsonify({'status': 'ok'})


def run_reactor():
    """Run the Twisted reactor; meant to be the target of a background thread."""
    # Signal handlers can only be installed from the main thread.
    reactor.run(installSignalHandlers=False)


# Start reactor in a separate thread when the app starts
if not reactor.running:
    reactor_thread = threading.Thread(target=run_reactor, daemon=True)
    reactor_thread.start()

# How long (in seconds) the /scrape-content endpoint waits for the crawl.
CONTENT_SCRAPE_TIMEOUT_SECONDS = 60

# Maximum number of characters of page text kept per URL.
MAX_CONTENT_CHARS = 20000


class ContentSpider(Spider):
    """Fetch a fixed list of URLs and extract each page's title and text."""

    name = 'content_spider'
    # Retained only so existing references to ``ContentSpider.content_results``
    # keep resolving. Every instance shadows it with its own dict in
    # ``__init__`` so that results of separate crawls are no longer merged.
    content_results = {}

    def __init__(self, urls=None, *args, content_results=None, **kwargs):
        """Set up the crawl.

        Args:
            urls: URLs to fetch.
            content_results: Optional dict that per-URL results are written
                to. The caller can keep a reference to it to read the results
                after the crawl. A fresh dict is created when omitted.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = list(urls) if urls else []
        self.content_results = {} if content_results is None else content_results
        logger.info(f"Starting content spider for {len(self.start_urls)} URLs")

    def parse(self, response):
        """Store the title and paragraph/heading text of *response*.

        The entry is keyed by ``response.url``. Text longer than
        ``MAX_CONTENT_CHARS`` is truncated and suffixed with ``'...'``. If
        extraction fails, an entry with ``status`` ``'error'`` is stored.
        """
        try:
            # Extract title
            title = response.css('title::text').get() or ''

            # Extract main content (this is a simple example, adjust selectors as needed)
            content = ' '.join(
                response.css(
                    'p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text'
                ).getall()
            )
            if len(content) > MAX_CONTENT_CHARS:
                content = content[:MAX_CONTENT_CHARS] + '...'

            # Store the result
            self.content_results[response.url] = {
                'title': title,
                'content': content,
                'status': 'success'
            }
            logger.info(f"Scraped content from {response.url}")
        except Exception as e:
            logger.exception(f"Error scraping content from {response.url}: {str(e)}")
            self.content_results[response.url] = {
                'title': '',
                'content': '',
                'status': 'error',
                'error': str(e)
            }


def run_content_spider(urls, result_sink=None):
    """Start a ContentSpider crawl and publish its outcome to a queue.

    Intended to be called in the reactor thread (e.g. through
    ``reactor.callFromThread``).

    Args:
        urls: URLs whose content should be scraped.
        result_sink: Queue that receives exactly one result dict when the
            crawl ends. Defaults to the module-level ``result_queue``.

    Returns:
        The crawl's Deferred, or None if the crawl could not be started.
        On success the published dict has ``status`` and ``results``; on
        failure it has ``status``, ``error`` and an empty ``results`` dict.
    """
    sink = result_queue if result_sink is None else result_sink
    # Per-crawl result dict handed to the spider, so that results of
    # different crawls are never mixed together.
    collected = {}

    def report_error(message):
        sink.put({'status': 'error', 'error': message, 'results': {}})

    def content_crawler_callback(result):
        logger.info(f"Content scraping completed for {len(collected)} URLs.")
        sink.put({'status': 'success', 'results': dict(collected)})
        return result

    def content_crawler_errback(failure):
        logger.error(f"Content crawl failed: {failure.value}")
        report_error(str(failure.value))

    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)

        logger.info("Starting content crawler...")
        deferred = runner.crawl(ContentSpider, urls=urls, content_results=collected)
    except Exception as e:
        logger.exception(f"Error in run_content_spider: {str(e)}")
        report_error(str(e))
        return None

    deferred.addCallback(content_crawler_callback)
    # Added after the callback so it also reports a failure inside it.
    deferred.addErrback(content_crawler_errback)
    return deferred


@app.route('/scrape-content', methods=['POST'])
def scrape_content():
    """Scrape title and text content for a list of URLs.

    Expects a JSON object with ``urls``: a non-empty list of absolute
    http(s) URLs. Responds with 400 on invalid input and 500 on crawl
    failure or timeout.
    """
    # Imported here so this handler does not depend on the file's top-level
    # import list providing ``Empty``.
    from queue import Empty

    try:
        # silent=True yields None for a missing/malformed body instead of
        # raising, so bad input is answered with 400 rather than 500.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400

        urls = data.get('urls', [])
        if not urls:
            logger.error("No URLs provided in request")
            return jsonify({'error': 'URLs are required'}), 400

        # A bare string would otherwise be iterated character by character.
        urls_valid = isinstance(urls, list) and all(
            isinstance(u, str)
            and urlparse(u).scheme in ('http', 'https')
            and urlparse(u).netloc
            for u in urls
        )
        if not urls_valid:
            logger.error("Invalid URLs provided in request")
            return jsonify({'error': 'urls must be a list of absolute http(s) URLs'}), 400

        logger.info(f"Received content scrape request for {len(urls)} URLs")

        # A private queue per request: a late result from a timed-out crawl
        # or a result of a concurrent request can never be delivered here.
        reply_queue = Queue()

        # Run the content spider in the reactor thread
        reactor.callFromThread(run_content_spider, urls, reply_queue)

        # Wait for results with timeout
        try:
            result = reply_queue.get(timeout=CONTENT_SCRAPE_TIMEOUT_SECONDS)
        except Empty:
            logger.error(
                f"Timeout waiting for content results after {CONTENT_SCRAPE_TIMEOUT_SECONDS}s"
            )
            return jsonify({'error': 'Content scraping timed out'}), 500

        if 'error' in result:
            logger.error(f"Content scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.exception(f"Error during content scraping: {str(e)}")
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    logger.info("Starting URL Scraper API on port 5001")
    app.run(host='0.0.0.0', port=5001, threaded=True, use_reloader=False)