Spaces:
Sleeping
Sleeping
| import urllib.parse | |
| import scrapy | |
| from scrapy.crawler import CrawlerProcess | |
| from bs4 import BeautifulSoup | |
| import json | |
| from module_1_pre.funcs import generate_resume_urls, generate_specific_urls | |
class ResumeSpider(scrapy.Spider):
    """Record how many result pages each resume-search URL has.

    The spider requests every URL returned by
    ``generate_resume_urls(base_url)``. For each response that contains
    resumes it appends a ``{'url': ..., 'pages': ...}`` record to
    ``resume_urls.json``. When a search has ``MAX_PAGES`` pages or more,
    the URL is expanded with ``generate_specific_urls`` and the page count
    of each narrower URL is recorded instead.
    """

    name = "resume_spider"
    custom_settings = {
        'RETRY_HTTP_CODES': [400],
        'RETRY_TIMES': 5,
        'LOG_ENABLED': False
    }

    # Searches with at least this many pages are split into narrower ones.
    # NOTE(review): presumably the site's cap on result pages -- confirm.
    MAX_PAGES = 250

    # File that receives one appended record per recorded URL.
    OUTPUT_FILE = 'resume_urls.json'

    def __init__(self, base_url=None, *args, **kwargs):
        """Store ``base_url``; remaining arguments go to ``scrapy.Spider``."""
        super().__init__(*args, **kwargs)
        self.base_url = base_url

    def start_requests(self):
        """Yield one request per URL generated from ``self.base_url``."""
        for url in generate_resume_urls(self.base_url):
            yield scrapy.Request(url=url, callback=self.parse_general)

    def parse_general(self, response):
        """Record the page count of a search, or split it if it is too big.

        Yields follow-up requests (handled by ``parse_specific``) only when
        the search has ``MAX_PAGES`` pages or more.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        main_element = soup.find('main', class_='resume-serp-content')
        has_resumes = main_element and main_element.find(
            'div', attrs={'data-resume-id': True}
        )
        if not has_resumes:
            # Logging is disabled in custom_settings, so report via stdout.
            print(f'No resumes found on page: {response.url}')
            return

        page_count = self.get_page_count(soup)
        if page_count < self.MAX_PAGES:
            self._append_result(response.url, page_count, '\n')
            return

        self.base_url = response.url
        for url in generate_specific_urls(self.base_url):
            yield scrapy.Request(url=url, callback=self.parse_specific)

    def parse_specific(self, response):
        """Record the page count of a narrowed search URL."""
        soup = BeautifulSoup(response.text, 'html.parser')
        # Records written here end with ',\n' while parse_general ends its
        # records with '\n'. Both terminators are kept exactly as before
        # because readers of the file may rely on them.
        # NOTE(review): no enclosing brackets are ever written, so the file
        # is not a valid JSON array despite the trailing commas.
        self._append_result(response.url, self.get_page_count(soup), ',\n')

    def get_page_count(self, soup):
        """Return the number of result pages shown by the pager in ``soup``.

        Returns 1 when there is no pager, when the pager contains no page
        links, or when none of the page links has a numeric label.
        """
        pager = soup.find('div', class_='pager')
        if not pager:
            return 1
        page_links = pager.find_all('a', attrs={'data-qa': 'pager-page'})
        # Walk backwards so the last numeric label wins; this avoids an
        # IndexError on an empty pager and a ValueError on a non-numeric
        # trailing link.
        for link in reversed(page_links):
            label = link.text.strip()
            if label.isdecimal():
                return int(label)
        return 1

    def _append_result(self, url, pages, terminator):
        """Append one ``{'url', 'pages'}`` record followed by ``terminator``."""
        record = {'url': url, 'pages': pages}
        # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters,
        # which must not depend on the platform's default encoding.
        with open(self.OUTPUT_FILE, 'a', encoding='utf-8') as f:
            f.write(json.dumps(record, ensure_ascii=False, indent=4))
            f.write(terminator)
def stage1(base_url):
    """Run ``ResumeSpider`` for ``base_url`` in a fresh ``CrawlerProcess``.

    ``base_url`` is forwarded positionally to the spider's constructor.
    """
    crawler_process = CrawlerProcess()
    crawler_process.crawl(ResumeSpider, base_url)
    crawler_process.start()
| # for url in urls: | |
| # result = generate_resume_urls(url) | |
| # print(result) | |
| # print("\n") | |
| # lens = len(['https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=unknown&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=not_looking_for_job&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=looking_for_offers&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=active_search&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=has_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=accepted_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=moreThan6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between3And6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between1And3&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=noExperience&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=male&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=female&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=unfinished_higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=master&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=bachelor&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=special_secondary&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80']) | |
| # print(lens) | |