Spaces:
Paused
Paused
| from flask import Flask, request, jsonify | |
| from scrapy import Spider, Request | |
| from scrapy.crawler import CrawlerRunner | |
| from scrapy.utils.project import get_project_settings | |
| from twisted.internet import reactor | |
| from twisted.internet.defer import inlineCallbacks, returnValue, Deferred | |
| from urllib.parse import urljoin, urlparse | |
| import json | |
| import threading | |
| import time | |
| import logging | |
| import traceback | |
| from queue import Queue | |
| from functools import wraps | |
# Configure logging: INFO level, timestamped "time - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by the spiders, crawl runners and endpoints below.
logger = logging.getLogger(__name__)

# Flask application object; started by app.run() in the __main__ guard.
app = Flask(__name__)

# Thread-safe queue for results: the crawl callbacks put result dicts here
# and the request handlers block on get() to receive them.
# NOTE(review): this single queue is shared by every request and both
# endpoints — confirm that is intended when requests overlap.
result_queue = Queue()
class URLSpider(Spider):
    """Collect up to ``max_urls`` same-domain links starting from one page.

    The spider fetches ``start_url``, extracts every ``<a href>`` target,
    keeps the ones whose network location matches the start URL's, and
    follows them until ``max_urls`` distinct URLs have been recorded.

    Attributes:
        found_urls: Set of absolute URLs recorded by this crawl.
        allowed_domain: ``netloc`` of ``start_url``; links elsewhere are skipped.
        max_urls: Upper bound on the number of URLs to record.
        url_count: Number of URLs recorded so far.
    """

    name = 'url_spider'
    # Class-level handle on the most recent crawl's URL set. It is rebound to
    # a fresh set in __init__ (never mutated in place across crawls) so code
    # that reads ``URLSpider.found_urls`` after a crawl sees only that crawl's
    # links instead of everything accumulated since the process started.
    found_urls = set()

    def __init__(self, start_url=None, max_urls=10, *args, **kwargs):
        """Initialise the spider.

        Args:
            start_url: Absolute URL to begin crawling from (required).
            max_urls: Maximum number of URLs to record; coerced with ``int()``.

        Raises:
            ValueError: If ``start_url`` is empty or ``max_urls`` is not
                convertible to an integer.
        """
        super(URLSpider, self).__init__(*args, **kwargs)
        if not start_url:
            # Fail here with a clear message instead of later inside Request().
            raise ValueError("start_url is required")
        self.start_urls = [start_url]
        self.allowed_domain = urlparse(start_url).netloc
        # Values arriving from JSON or spider arguments may be strings; the
        # comparisons in parse() need a real integer.
        self.max_urls = int(max_urls)
        self.url_count = 0
        # Fresh per-crawl storage, also published on the class (see above).
        self.found_urls = set()
        URLSpider.found_urls = self.found_urls
        logger.info(f"Starting spider for URL: {start_url} with max_urls={self.max_urls}")

    def start_requests(self):
        """Yield the initial request(s), bypassing the duplicate filter."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)

    def handle_error(self, failure):
        """Log a failed request; the crawl continues with other requests."""
        logger.error(f"Request failed: {failure.value}")
        return None

    def parse(self, response):
        """Record same-domain links found in ``response`` and follow them.

        Yields:
            ``Request`` objects for newly recorded URLs while the limit has
            not been reached.
        """
        try:
            if self.url_count >= self.max_urls:
                logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
                return

            links = response.css('a::attr(href)').getall()
            logger.info(f"Found {len(links)} links on {response.url}")

            for link in links:
                if self.url_count >= self.max_urls:
                    return

                absolute_url = urljoin(response.url, link)
                # Skip off-site links (this also drops mailto:, javascript:
                # and similar targets, whose netloc is empty).
                if urlparse(absolute_url).netloc != self.allowed_domain:
                    continue
                if absolute_url in self.found_urls:
                    continue

                self.found_urls.add(absolute_url)
                self.url_count += 1
                logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")

                # Only follow the link if there is still room for more URLs.
                if self.url_count < self.max_urls:
                    logger.info(f"Following link: {absolute_url}")
                    yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
        except Exception as e:
            # Best-effort: a page that cannot be parsed (e.g. a non-HTML
            # response) must not abort the whole crawl. logger.exception
            # records the traceback through the configured logging handlers.
            logger.exception(f"Error in parse method: {str(e)}")
def run_spider(url, max_urls, out_queue=None):
    """Start a ``URLSpider`` crawl and deliver its outcome to a queue.

    Intended to be invoked on the Twisted reactor thread (``scrape_url``
    schedules it with ``reactor.callFromThread``).

    Args:
        url: Start URL for the crawl.
        max_urls: Maximum number of URLs the spider should record.
        out_queue: Queue that receives exactly one result dict. Defaults to
            the module-level ``result_queue`` for backward compatibility;
            passing a dedicated queue keeps results of overlapping requests
            from being delivered to the wrong caller.

    Returns:
        The crawl's ``Deferred``, or ``None`` if the crawl could not be
        started (an error dict is put on the queue in that case).

    The result dict is ``{'status': 'success', 'urls': [...], 'count': n}``
    on success, or ``{'status': 'error', 'error': msg, 'urls': []}``.
    """
    sink = result_queue if out_queue is None else out_queue

    def report_error(message):
        # One shape for every failure path so consumers can rely on the keys.
        sink.put({
            'status': 'error',
            'error': message,
            'urls': []
        })

    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)
        # Create the crawler explicitly so the callback can read the results
        # from *this* crawl's spider instance rather than shared class state.
        crawler = runner.create_crawler(URLSpider)

        def crawler_callback(result):
            try:
                spider = getattr(crawler, 'spider', None)
                found = URLSpider.found_urls if spider is None else spider.found_urls
                urls = list(found)
                logger.info(f"Crawling completed. Found {len(urls)} URLs.")
                sink.put({
                    'status': 'success',
                    'urls': urls,
                    'count': len(urls)
                })
            except Exception as e:
                logger.exception(f"Error in crawler_callback: {str(e)}")
                report_error(str(e))
            return result

        def crawler_errback(failure):
            report_error(str(failure.value))

        logger.info("Starting crawler...")
        try:
            deferred = runner.crawl(crawler, start_url=url, max_urls=max_urls)
            deferred.addCallback(crawler_callback)
            deferred.addErrback(crawler_errback)
            return deferred
        except Exception as e:
            logger.exception(f"Error starting crawler: {str(e)}")
            report_error(str(e))
            return None
    except Exception as e:
        logger.exception(f"Error in run_spider: {str(e)}")
        report_error(str(e))
        return None


def scrape_url():
    """Handle a URL-discovery request and return the crawl result as JSON.

    Expects a JSON object body with ``url`` (required, absolute http/https
    URL) and ``max_urls`` (optional integer, default 50).

    Returns:
        ``(response, status)`` tuples for errors (400 for invalid input, 500
        for crawl failure or timeout) or a JSON response with the crawl
        result on success.

    NOTE(review): no ``@app.route`` decorator is visible on this function in
    this view of the file — confirm how it is registered with the Flask app.
    """
    # Function-scope import: the file's import block only brings in Queue.
    from queue import Empty

    try:
        # silent=True: malformed or non-JSON bodies yield None and are
        # reported as a 400 below instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not data:
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400
        if not isinstance(data, dict):
            logger.error("JSON body is not an object")
            return jsonify({'error': 'JSON body must be an object'}), 400

        url = data.get('url')
        max_urls = data.get('max_urls', 50)
        if not url:
            logger.error("No URL provided in request")
            return jsonify({'error': 'URL is required'}), 400

        # The URL comes from an untrusted client. Restrict it to web URLs so
        # the crawler cannot be pointed at other schemes (e.g. file://).
        parsed = urlparse(url) if isinstance(url, str) else None
        if parsed is None or parsed.scheme not in ('http', 'https') or not parsed.netloc:
            logger.error("Invalid URL provided in request")
            return jsonify({'error': 'URL must be an absolute http or https URL'}), 400

        try:
            max_urls = int(max_urls)
        except (TypeError, ValueError):
            logger.error("Invalid max_urls provided in request")
            return jsonify({'error': 'max_urls must be an integer'}), 400

        logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")

        # Dedicated queue for this request: a result from another request (or
        # a late result from one that already timed out) can never be picked
        # up here by mistake.
        reply_queue = Queue()

        # Run the spider in the reactor thread
        reactor.callFromThread(run_spider, url, max_urls, reply_queue)

        # Wait for results with timeout
        try:
            result = reply_queue.get(timeout=60)
        except Empty:
            logger.error("Timeout waiting for results")
            return jsonify({'error': 'Scraping timed out'}), 500

        if 'error' in result:
            logger.error(f"Scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.exception(f"Error during scraping: {str(e)}")
        return jsonify({'error': str(e)}), 500
def health_check():
    """Return a JSON response with body ``{"status": "ok"}``.

    NOTE(review): no ``@app.route`` decorator is visible on this function in
    this view of the file — confirm how it is registered with the Flask app.
    """
    payload = {'status': 'ok'}
    return jsonify(payload)
def run_reactor():
    """Run the Twisted reactor without installing its signal handlers.

    Used as the target of the background thread started right after this
    definition, so the call blocks that thread for as long as the reactor
    runs.
    """
    reactor.run(installSignalHandlers=False)
# Start reactor in a separate thread when the app starts.
# This runs at import time. The thread is a daemon, so it does not keep the
# process alive once the main thread exits.
if not reactor.running:
    reactor_thread = threading.Thread(target=run_reactor, daemon=True)
    reactor_thread.start()
class ContentSpider(Spider):
    """Fetch a list of URLs and extract each page's title and text content.

    Attributes:
        content_results: Mapping of URL to a result dict with ``title``,
            ``content``, ``status`` and, on failure, ``error``.
        max_content_length: Maximum number of characters of extracted text
            kept per page; longer text is cut and suffixed with ``'...'``.
    """

    name = 'content_spider'
    # Class-level handle on the most recent crawl's results. It is rebound to
    # a fresh dict in __init__ so code that reads
    # ``ContentSpider.content_results`` after a crawl sees only that crawl's
    # pages instead of every page scraped since the process started.
    content_results = {}
    max_content_length = 20000

    def __init__(self, urls=None, *args, **kwargs):
        """Initialise the spider.

        Args:
            urls: Iterable of URLs to scrape; ``None`` or empty means no work.
        """
        super(ContentSpider, self).__init__(*args, **kwargs)
        self.start_urls = urls if urls else []
        # Fresh per-crawl storage, also published on the class (see above).
        self.content_results = {}
        ContentSpider.content_results = self.content_results
        logger.info(f"Starting content spider for {len(self.start_urls)} URLs")

    def start_requests(self):
        """Yield one request per URL, with an errback so failures are recorded."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)

    def handle_error(self, failure):
        """Record a failed download so the URL still appears in the results."""
        logger.error(f"Request failed: {failure.value}")
        failed_request = getattr(failure, 'request', None)
        if failed_request is not None:
            self.content_results[failed_request.url] = {
                'title': '',
                'content': '',
                'status': 'error',
                'error': str(failure.value)
            }
        return None

    def parse(self, response):
        """Extract the title and heading/paragraph text from ``response``."""
        try:
            # Extract title
            title = response.css('title::text').get() or ''

            # Extract main content (this is a simple example, adjust selectors as needed)
            content = ' '.join(response.css('p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text').getall())

            # Limit content length
            if len(content) > self.max_content_length:
                content = content[:self.max_content_length] + '...'

            # Store the result
            self.content_results[response.url] = {
                'title': title,
                'content': content,
                'status': 'success'
            }
            logger.info(f"Scraped content from {response.url}")
        except Exception as e:
            # Best-effort: record the failure for this page and carry on.
            logger.exception(f"Error scraping content from {response.url}: {str(e)}")
            self.content_results[response.url] = {
                'title': '',
                'content': '',
                'status': 'error',
                'error': str(e)
            }
def run_content_spider(urls, out_queue=None):
    """Start a ``ContentSpider`` crawl and deliver its outcome to a queue.

    Intended to be invoked on the Twisted reactor thread (``scrape_content``
    schedules it with ``reactor.callFromThread``).

    Args:
        urls: URLs whose content should be scraped.
        out_queue: Queue that receives exactly one result dict. Defaults to
            the module-level ``result_queue`` for backward compatibility;
            passing a dedicated queue keeps results of overlapping requests
            from being delivered to the wrong caller.

    Returns:
        The crawl's ``Deferred``, or ``None`` if the crawl could not be
        started (an error dict is put on the queue in that case).

    The result dict is ``{'status': 'success', 'results': {...}}`` on
    success, or ``{'status': 'error', 'error': msg, 'results': {}}``.
    """
    sink = result_queue if out_queue is None else out_queue

    def report_error(message):
        # One shape for every failure path so consumers can rely on the keys.
        sink.put({
            'status': 'error',
            'error': message,
            'results': {}
        })

    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)
        # Create the crawler explicitly so the callback can read the results
        # from *this* crawl's spider instance rather than shared class state.
        crawler = runner.create_crawler(ContentSpider)

        def content_crawler_callback(result):
            try:
                spider = getattr(crawler, 'spider', None)
                source = ContentSpider.content_results if spider is None else spider.content_results
                # Copy so the payload handed to the request thread cannot be
                # changed by a later crawl.
                content_results = dict(source)
                logger.info(f"Content scraping completed for {len(content_results)} URLs.")
                sink.put({
                    'status': 'success',
                    'results': content_results
                })
            except Exception as e:
                logger.exception(f"Error in content_crawler_callback: {str(e)}")
                report_error(str(e))
            return result

        def content_crawler_errback(failure):
            report_error(str(failure.value))

        logger.info("Starting content crawler...")
        try:
            deferred = runner.crawl(crawler, urls=urls)
            deferred.addCallback(content_crawler_callback)
            deferred.addErrback(content_crawler_errback)
            return deferred
        except Exception as e:
            logger.exception(f"Error starting content crawler: {str(e)}")
            report_error(str(e))
            return None
    except Exception as e:
        logger.exception(f"Error in run_content_spider: {str(e)}")
        report_error(str(e))
        return None


def scrape_content():
    """Handle a content-scraping request and return the results as JSON.

    Expects a JSON object body with ``urls``: a non-empty list of absolute
    http/https URLs.

    Returns:
        ``(response, status)`` tuples for errors (400 for invalid input, 500
        for crawl failure or timeout) or a JSON response with the per-URL
        results on success.

    NOTE(review): no ``@app.route`` decorator is visible on this function in
    this view of the file — confirm how it is registered with the Flask app.
    """
    # Function-scope import: the file's import block only brings in Queue.
    from queue import Empty

    try:
        # silent=True: malformed or non-JSON bodies yield None and are
        # reported as a 400 below instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not data:
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400
        if not isinstance(data, dict):
            logger.error("JSON body is not an object")
            return jsonify({'error': 'JSON body must be an object'}), 400

        urls = data.get('urls', [])
        if not urls:
            logger.error("No URLs provided in request")
            return jsonify({'error': 'URLs are required'}), 400

        # The URLs come from an untrusted client. Require a list of web URLs
        # so the crawler cannot be pointed at other schemes (e.g. file://),
        # and so a bare string is not iterated character by character.
        urls_valid = isinstance(urls, list)
        if urls_valid:
            for candidate in urls:
                parsed = urlparse(candidate) if isinstance(candidate, str) else None
                if parsed is None or parsed.scheme not in ('http', 'https') or not parsed.netloc:
                    urls_valid = False
                    break
        if not urls_valid:
            logger.error("Invalid URLs provided in request")
            return jsonify({'error': 'URLs must be a list of absolute http or https URLs'}), 400

        logger.info(f"Received content scrape request for {len(urls)} URLs")

        # Dedicated queue for this request: a result from another request (or
        # a late result from one that already timed out) can never be picked
        # up here by mistake.
        reply_queue = Queue()

        # Run the content spider in the reactor thread
        reactor.callFromThread(run_content_spider, urls, reply_queue)

        # Wait for results with timeout
        try:
            result = reply_queue.get(timeout=60)
        except Empty:
            logger.error("Timeout waiting for content results")
            return jsonify({'error': 'Content scraping timed out'}), 500

        if 'error' in result:
            logger.error(f"Content scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.exception(f"Error during content scraping: {str(e)}")
        return jsonify({'error': str(e)}), 500
| if __name__ == '__main__': | |
| logger.info("Starting URL Scraper API on port 5001") | |
| app.run(host='0.0.0.0', port=5001, threaded=True, use_reloader=False) |