import urllib.parse import scrapy from scrapy.crawler import CrawlerProcess from bs4 import BeautifulSoup import json from module_1_pre.funcs import generate_resume_urls, generate_specific_urls class ResumeSpider(scrapy.Spider): name = "resume_spider" custom_settings = { 'RETRY_HTTP_CODES': [400], 'RETRY_TIMES': 5, 'LOG_ENABLED': False } def __init__(self, base_url=None,*args, **kwargs): super(ResumeSpider, self).__init__(*args, **kwargs) self.base_url = base_url def start_requests(self): urls = generate_resume_urls(self.base_url) for url in urls: yield scrapy.Request(url=url, callback=self.parse_general) def parse_general(self, response): soup = BeautifulSoup(response.text, 'html.parser') main_element = soup.find('main', class_='resume-serp-content') if main_element and main_element.find('div', attrs={'data-resume-id': True}): page_count = self.get_page_count(soup) if page_count < 250: result = { 'url': response.url, 'pages': page_count } with open('resume_urls.json', 'a') as f: json.dump(result, f, ensure_ascii=False, indent=4) f.write('\n') else: self.base_url = response.url specific_urls = generate_specific_urls(self.base_url) for url in specific_urls: yield scrapy.Request(url=url, callback=self.parse_specific) else: print(f'No resumes found on page: {response.url}') def parse_specific(self, response): soup = BeautifulSoup(response.text, 'html.parser') page_count = self.get_page_count(soup) result = { 'url': response.url, 'pages': page_count } with open('resume_urls.json', 'a', encoding='utf-8') as f: json.dump(result, f, ensure_ascii=False, indent=4, separators=(',', ': ')) f.write(',\n') # Add comma and newline for valid JSON array def get_page_count(self, soup): pager = soup.find('div', class_='pager') if pager: last_page_link = pager.find_all('a', attrs={'data-qa': 'pager-page'})[-1] last_page = int(last_page_link.text.strip()) return last_page return 1 def stage1(base_url): process = CrawlerProcess() process.crawl(ResumeSpider,base_url) process.start() # for url in urls: # result = generate_resume_urls(url) # print(result) # print("\n") # lens = len(['https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=unknown&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=not_looking_for_job&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=looking_for_offers&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=active_search&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=has_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=accepted_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=moreThan6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between3And6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between1And3&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=noExperience&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=male&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=female&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=unfinished_higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=master&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=bachelor&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=special_secondary&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80']) # print(lens)