url-scraper / app.py
NanobotzAI's picture
Update app.py
97ec75c verified
import json
import logging
import threading
import time
import traceback
from functools import wraps
from queue import Empty, Queue
from urllib.parse import urljoin, urlparse

from flask import Flask, request, jsonify
from scrapy import Spider, Request
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks, returnValue, Deferred
# Configure logging
# Root logging is set to INFO with a "timestamp - LEVEL - message" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by the spiders, the crawl runners and the routes.
logger = logging.getLogger(__name__)
# Flask application object; the @app.route decorators below register on it.
app = Flask(__name__)
# Thread-safe queue for results
# The crawl callbacks put their result dicts here and the request handlers
# block on get(); the same queue is shared by both endpoints.
result_queue = Queue()
class URLSpider(Spider):
    """Spider that collects same-domain links reachable from one start URL.

    Every ``<a href>`` whose network location equals that of the start URL
    is recorded in ``found_urls`` and followed, until ``max_urls`` distinct
    URLs have been recorded.
    """

    name = 'url_spider'
    # Class-level so the results can be read through the class itself
    # (``URLSpider.found_urls``) once the crawl has finished.
    found_urls = set()

    def __init__(self, start_url=None, max_urls=10, *args, **kwargs):
        """Prepare a crawl of ``start_url`` limited to ``max_urls`` links.

        Args:
            start_url: Page the crawl starts from; only links on the same
                network location are recorded and followed.
            max_urls: Maximum number of URLs to record. Coerced with
                ``int()`` so a numeric string is accepted as well.

        Raises:
            TypeError, ValueError: If ``max_urls`` cannot be converted to
                an integer.
        """
        super(URLSpider, self).__init__(*args, **kwargs)
        # Start every crawl with a fresh set; otherwise URLs recorded by an
        # earlier crawl would still be in the class-level set and would be
        # reported again as part of this crawl's results.
        URLSpider.found_urls = set()
        self.start_urls = [start_url]
        self.allowed_domain = urlparse(start_url).netloc
        # The limit is compared against an int counter in parse(), so a
        # string value would raise TypeError on every response.
        self.max_urls = int(max_urls)
        self.url_count = 0
        logger.info(f"Starting spider for URL: {start_url} with max_urls={max_urls}")

    def start_requests(self):
        """Yield the initial request(s), routing failures to ``handle_error``."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)

    def handle_error(self, failure):
        """Log a failed request; the crawl continues with the other requests."""
        logger.error(f"Request failed: {failure.value}")
        return None

    def parse(self, response):
        """Record the same-domain links found in ``response`` and follow them.

        Yields:
            A ``Request`` for each newly recorded URL, as long as the limit
            has not been reached after recording it.
        """
        try:
            if self.url_count >= self.max_urls:
                logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
                return
            links = response.css('a::attr(href)').getall()
            logger.info(f"Found {len(links)} links on {response.url}")
            for link in links:
                # Re-check inside the loop: the limit can be reached while
                # this page's links are still being processed.
                if self.url_count >= self.max_urls:
                    return
                absolute_url = urljoin(response.url, link)
                if urlparse(absolute_url).netloc != self.allowed_domain:
                    continue
                if absolute_url in self.found_urls:
                    continue
                self.found_urls.add(absolute_url)
                self.url_count += 1
                logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")
                # The URL that fills the quota is recorded but not followed.
                if self.url_count < self.max_urls:
                    logger.info(f"Following link: {absolute_url}")
                    yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
        except Exception as e:
            # Best effort: a page that cannot be parsed is logged and skipped
            # so that it does not abort the rest of the crawl.
            logger.error(f"Error in parse method: {str(e)}")
            traceback.print_exc()
def run_spider(url, max_urls):
    """Start a ``URLSpider`` crawl and report its outcome on ``result_queue``.

    Exactly one dict is put on ``result_queue`` per call: a ``success``
    payload with the collected URLs once the crawl finishes, or an ``error``
    payload if the crawl fails or cannot be started.

    Args:
        url: Start URL handed to the spider.
        max_urls: Maximum number of URLs the spider should record.

    Returns:
        The Deferred returned by ``CrawlerRunner.crawl``, or ``None`` when
        the crawl could not be started.
    """
    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)

        def crawler_callback(result):
            """Publish the URLs gathered by the finished crawl."""
            try:
                # Get URLs from the spider's class variable
                urls = list(URLSpider.found_urls)
                logger.info(f"Crawling completed. Found {len(urls)} URLs.")
                # Put the results in the queue
                result_queue.put({
                    'status': 'success',
                    'urls': urls,
                    'count': len(urls)
                })
                return result
            except Exception as e:
                logger.error(f"Error in crawler_callback: {str(e)}")
                traceback.print_exc()
                result_queue.put({
                    'status': 'error',
                    'error': str(e),
                    'urls': []
                })
                return result

        def crawler_errback(failure):
            """Publish an error payload when the crawl itself fails."""
            result_queue.put({
                'status': 'error',
                'error': str(failure.value),
                'urls': []
            })

        # Run the spider
        logger.info("Starting crawler...")
        try:
            deferred = runner.crawl(URLSpider, start_url=url, max_urls=max_urls)
            deferred.addCallback(crawler_callback)
            deferred.addErrback(crawler_errback)
            return deferred
        except Exception as e:
            logger.error(f"Error starting crawler: {str(e)}")
            traceback.print_exc()
            result_queue.put({
                'status': 'error',
                'error': str(e),
                'urls': []
            })
            return None
    except Exception as e:
        logger.error(f"Error in run_spider: {str(e)}")
        traceback.print_exc()
        # Same shape as every other error payload produced by this function.
        result_queue.put({
            'status': 'error',
            'error': str(e),
            'urls': []
        })
        return None
@app.route('/scrape', methods=['POST'])
def scrape_url():
    """Crawl a site for same-domain links and return them as JSON.

    Expects a JSON object with:
        url: Start URL (required, string).
        max_urls: Maximum number of URLs to collect (optional integer,
            default 50).

    Returns:
        The crawl result payload on success; a 400 response for an invalid
        request body; a 500 response when the crawl fails or does not
        produce a result within 60 seconds.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None, so it is answered with the 400 below instead of raising
        # into the generic 500 handler.
        data = request.get_json(silent=True)
        if not data:
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400
        if not isinstance(data, dict):
            logger.error("JSON body is not an object")
            return jsonify({'error': 'JSON body must be an object'}), 400
        url = data.get('url')
        max_urls = data.get('max_urls', 50)
        if not url:
            logger.error("No URL provided in request")
            return jsonify({'error': 'URL is required'}), 400
        if not isinstance(url, str):
            logger.error("URL in request is not a string")
            return jsonify({'error': 'URL must be a string'}), 400
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(max_urls, bool) or not isinstance(max_urls, int) or max_urls < 0:
            logger.error(f"Invalid max_urls in request: {max_urls!r}")
            return jsonify({'error': 'max_urls must be a non-negative integer'}), 400
        logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")
        # A request that timed out earlier may have delivered its result
        # after its handler gave up; drop such leftovers so they are not
        # returned as the answer to this request.
        # NOTE(review): the shared queue still cannot tell concurrent
        # requests apart - confirm whether parallel scrapes must be supported.
        while True:
            try:
                result_queue.get_nowait()
            except Empty:
                break
        # Run the spider in the reactor thread
        reactor.callFromThread(run_spider, url, max_urls)
        # Wait for results with timeout
        try:
            result = result_queue.get(timeout=60)
        except Empty:
            # Only an empty queue means a timeout; any other failure is
            # reported by the outer handler with its real message.
            logger.error("Timeout waiting for results")
            return jsonify({'error': 'Scraping timed out'}), 500
        if 'error' in result:
            logger.error(f"Scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.error(f"Error during scraping: {str(e)}")
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always answers with a JSON ``{"status": "ok"}`` body."""
    payload = {'status': 'ok'}
    return jsonify(payload)
def run_reactor():
    """Run the Twisted reactor; this call blocks until the reactor stops.

    Signal handlers are not installed (``installSignalHandlers=False``):
    Python only allows them to be set from the main thread, and this
    function is the target of a background thread.
    """
    reactor.run(installSignalHandlers=False)
# Start reactor in a separate thread when the app starts
# This runs at import time. The thread is a daemon, so a running reactor
# does not keep the interpreter alive at shutdown.
if not reactor.running:
    reactor_thread = threading.Thread(target=run_reactor, daemon=True)
    reactor_thread.start()
class ContentSpider(Spider):
    """Spider that fetches a list of URLs and extracts title and text.

    For every URL one entry is stored in ``content_results``, keyed by URL,
    with the keys ``title``, ``content`` and ``status`` (plus ``error`` when
    the page could not be downloaded or parsed).
    """

    name = 'content_spider'
    # Class-level so the results can be read through the class itself
    # (``ContentSpider.content_results``) once the crawl has finished.
    content_results = {}

    def __init__(self, urls=None, *args, **kwargs):
        """Prepare a crawl of ``urls``.

        Args:
            urls: URLs to fetch; ``None`` or an empty value means no
                requests are made.
        """
        super(ContentSpider, self).__init__(*args, **kwargs)
        # Rebind (not clear) so each crawl starts with an empty mapping
        # while a dict handed out by a previous crawl stays untouched.
        # Without this, results of earlier crawls would be returned again.
        ContentSpider.content_results = {}
        self.start_urls = urls if urls else []
        logger.info(f"Starting content spider for {len(self.start_urls)} URLs")

    def start_requests(self):
        """Yield one request per URL, routing failures to ``handle_error``."""
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)

    def handle_error(self, failure):
        """Record a failed download so the URL still appears in the results."""
        logger.error(f"Request failed: {failure.value}")
        failed_request = getattr(failure, 'request', None)
        if failed_request is not None:
            self.content_results[failed_request.url] = {
                'title': '',
                'content': '',
                'status': 'error',
                'error': str(failure.value)
            }
        return None

    def parse(self, response):
        """Extract the title and heading/paragraph text of ``response``."""
        try:
            # Extract title
            title = response.css('title::text').get() or ''
            # Extract main content (this is a simple example, adjust selectors as needed)
            content = ' '.join(response.css('p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text').getall())
            # Limit content length; an ellipsis marks truncated text.
            if len(content) > 20000:
                content = content[:20000] + '...'
            # Store the result
            self.content_results[response.url] = {
                'title': title,
                'content': content,
                'status': 'success'
            }
            logger.info(f"Scraped content from {response.url}")
        except Exception as e:
            # Best effort: a page that cannot be parsed is reported as an
            # error entry instead of aborting the whole crawl.
            logger.error(f"Error scraping content from {response.url}: {str(e)}")
            self.content_results[response.url] = {
                'title': '',
                'content': '',
                'status': 'error',
                'error': str(e)
            }
def run_content_spider(urls):
    """Start a ``ContentSpider`` crawl and report its outcome on ``result_queue``.

    Exactly one dict is put on ``result_queue`` per call: a ``success``
    payload with the per-URL results once the crawl finishes, or an
    ``error`` payload if the crawl fails or cannot be started.

    Args:
        urls: URLs whose content should be scraped.

    Returns:
        The Deferred returned by ``CrawlerRunner.crawl``, or ``None`` when
        the crawl could not be started.
    """
    try:
        settings = get_project_settings()
        settings.update({
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'LOG_ENABLED': True,
            'LOG_LEVEL': 'INFO',
            'ROBOTSTXT_OBEY': True,
            'CONCURRENT_REQUESTS': 16,
            'DOWNLOAD_TIMEOUT': 30,
            'RETRY_TIMES': 3,
        })
        runner = CrawlerRunner(settings)

        def content_crawler_callback(result):
            """Publish the content gathered by the finished crawl."""
            try:
                # Get content results from the spider's class variable.
                # A snapshot is queued so the consumer never shares a dict
                # with the spider class.
                content_results = dict(ContentSpider.content_results)
                logger.info(f"Content scraping completed for {len(content_results)} URLs.")
                # Put the results in the queue
                result_queue.put({
                    'status': 'success',
                    'results': content_results
                })
                return result
            except Exception as e:
                logger.error(f"Error in content_crawler_callback: {str(e)}")
                traceback.print_exc()
                result_queue.put({
                    'status': 'error',
                    'error': str(e),
                    'results': {}
                })
                return result

        def content_crawler_errback(failure):
            """Publish an error payload when the crawl itself fails."""
            result_queue.put({
                'status': 'error',
                'error': str(failure.value),
                'results': {}
            })

        # Run the spider
        logger.info("Starting content crawler...")
        try:
            deferred = runner.crawl(ContentSpider, urls=urls)
            deferred.addCallback(content_crawler_callback)
            deferred.addErrback(content_crawler_errback)
            return deferred
        except Exception as e:
            logger.error(f"Error starting content crawler: {str(e)}")
            traceback.print_exc()
            result_queue.put({
                'status': 'error',
                'error': str(e),
                'results': {}
            })
            return None
    except Exception as e:
        logger.error(f"Error in run_content_spider: {str(e)}")
        traceback.print_exc()
        # Same shape as every other error payload produced by this function.
        result_queue.put({
            'status': 'error',
            'error': str(e),
            'results': {}
        })
        return None
@app.route('/scrape-content', methods=['POST'])
def scrape_content():
    """Scrape title and text content for a list of URLs and return JSON.

    Expects a JSON object with:
        urls: Non-empty list of URL strings (required).

    Returns:
        The scrape result payload on success; a 400 response for an invalid
        request body; a 500 response when the scrape fails or does not
        produce a result within 60 seconds.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None, so it is answered with the 400 below instead of raising
        # into the generic 500 handler.
        data = request.get_json(silent=True)
        if not data:
            logger.error("No JSON data provided in request")
            return jsonify({'error': 'No JSON data provided'}), 400
        if not isinstance(data, dict):
            logger.error("JSON body is not an object")
            return jsonify({'error': 'JSON body must be an object'}), 400
        urls = data.get('urls', [])
        if not urls:
            logger.error("No URLs provided in request")
            return jsonify({'error': 'URLs are required'}), 400
        # A bare string would otherwise be iterated character by character
        # when used as the spider's start URLs.
        if not isinstance(urls, list) or not all(isinstance(u, str) and u for u in urls):
            logger.error("Invalid 'urls' value in request")
            return jsonify({'error': 'urls must be a list of non-empty strings'}), 400
        logger.info(f"Received content scrape request for {len(urls)} URLs")
        # A request that timed out earlier may have delivered its result
        # after its handler gave up; drop such leftovers so they are not
        # returned as the answer to this request.
        # NOTE(review): the shared queue still cannot tell concurrent
        # requests apart - confirm whether parallel scrapes must be supported.
        while True:
            try:
                result_queue.get_nowait()
            except Empty:
                break
        # Run the content spider in the reactor thread
        reactor.callFromThread(run_content_spider, urls)
        # Wait for results with timeout
        try:
            result = result_queue.get(timeout=60)
        except Empty:
            # Only an empty queue means a timeout; any other failure is
            # reported by the outer handler with its real message.
            logger.error("Timeout waiting for content results")
            return jsonify({'error': 'Content scraping timed out'}), 500
        if 'error' in result:
            logger.error(f"Content scraping error: {result['error']}")
            return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
        return jsonify(result)
    except Exception as e:
        logger.error(f"Error during content scraping: {str(e)}")
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # Script entry point: serve the API on all interfaces. The reloader is
    # switched off and requests are handled in threads.
    logger.info("Starting URL Scraper API on port 5001")
    server_options = {
        'host': '0.0.0.0',
        'port': 5001,
        'threaded': True,
        'use_reloader': False,
    }
    app.run(**server_options)