Spaces:

sae8d
/

not-google

Running

App Files Files Community

not-google / crawler /settings.py

sae8d

Upload 31 files

3d599c8 verified about 1 month ago

raw

history blame contribute delete

2.84 kB

	import os
	from dotenv import load_dotenv

	# Load variables from a .env file (if one is found) into the process
	# environment. This must run before the os.getenv() lookups for MAX_PAGES
	# and TARGET_DOMAIN at the bottom of this file so they can see those values.
	load_dotenv()

	# Name of the bot implemented by this Scrapy project.
	BOT_NAME = "ir_crawler"

	# Spiders live in a single package: Scrapy looks for spiders in
	# SPIDER_MODULES, and NEWSPIDER_MODULE is where `genspider` creates new ones.
	NEWSPIDER_MODULE = "crawler.spiders"
	SPIDER_MODULES = [NEWSPIDER_MODULE]

	# User-Agent header sent with requests.
	# NOTE(review): this is a desktop Chrome browser string; it does not identify
	# the crawler or its operator.
	USER_AGENT = (
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	    "AppleWebKit/537.36 (KHTML, like Gecko) "
	    "Chrome/120.0.0.0 Safari/537.36"
	)

	# robots.txt rules are NOT obeyed: the check is switched off here.
	ROBOTSTXT_OBEY = False

	# Maximum number of concurrent requests performed by Scrapy
	# (16 is also the Scrapy default).
	CONCURRENT_REQUESTS = 16

	# Delay between requests to the same website (Scrapy default: 0).
	DOWNLOAD_DELAY = 1.5

	# The download delay setting will honor only one of:
	#CONCURRENT_REQUESTS_PER_DOMAIN = 16
	#CONCURRENT_REQUESTS_PER_IP = 16

	# Disable cookies (enabled by default)
	#COOKIES_ENABLED = False

	# Disable Telnet Console (enabled by default)
	#TELNETCONSOLE_ENABLED = False

	# Override the default request headers:
	#DEFAULT_REQUEST_HEADERS = {
	# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	# 'Accept-Language': 'en',
	#}

	# Enable or disable spider middlewares
	# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
	#SPIDER_MIDDLEWARES = {
	# 'crawler.middlewares.IrCrawlerSpiderMiddleware': 543,
	#}

	# Enable or disable downloader middlewares
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	#DOWNLOADER_MIDDLEWARES = {
	# 'crawler.middlewares.IrCrawlerDownloaderMiddleware': 543,
	#}

	# Enable or disable extensions
	# See https://docs.scrapy.org/en/latest/topics/extensions.html
	#EXTENSIONS = {
	# 'scrapy.extensions.telnet.TelnetConsole': None,
	#}

	# Item pipelines to activate, mapped to the integer that orders them.
	# Only the Supabase pipeline is enabled.
	# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
	ITEM_PIPELINES = {"crawler.pipelines.SupabasePipeline": 300}

	# AutoThrottle extension: Scrapy ships with it disabled; it is turned on here.
	# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
	AUTOTHROTTLE_ENABLED = True
	AUTOTHROTTLE_START_DELAY = 1.5  # initial download delay
	AUTOTHROTTLE_MAX_DELAY = 60  # maximum delay to set in case of high latencies
	# Average number of requests Scrapy should send in parallel to each
	# remote server.
	AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
	# Set to True to show throttling stats for every response received:
	#AUTOTHROTTLE_DEBUG = False

	# Enable and configure HTTP caching (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
	#HTTPCACHE_ENABLED = True
	#HTTPCACHE_EXPIRATION_SECS = 0
	#HTTPCACHE_DIR = 'httpcache'
	#HTTPCACHE_IGNORE_HTTP_CODES = []
	#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

	# Crawl scope.
	# DEPTH_LIMIT is the maximum depth Scrapy is allowed to crawl for any site.
	DEPTH_LIMIT = 5
	# MAX_PAGES and TARGET_DOMAIN can be overridden through the environment
	# (including values loaded from .env by load_dotenv() above).
	# NOTE(review): these look like project-specific settings rather than
	# built-in Scrapy ones - presumably read by the spider; verify against callers.
	# `or` is used instead of a getenv() default so that a variable that is set
	# but empty (e.g. `MAX_PAGES=` in .env) falls back to the default rather than
	# crashing in int("") or producing an empty target domain. A non-numeric
	# MAX_PAGES still raises ValueError, as before.
	MAX_PAGES = int(os.getenv("MAX_PAGES") or 3000)
	TARGET_DOMAIN = os.getenv("TARGET_DOMAIN") or "www.cnn.com"