# link_multiplayer / funcs.py
import urllib.parse
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from module_1_pre.funcs import generate_resume_urls, generate_specific_urls
class ResumeSpider(scrapy.Spider):
    """Crawl hh.ru resume-search result pages and record each crawlable URL
    together with its pager page count into ``resume_urls.json``.

    Pages whose pager exceeds the 250-page limit are split into narrower
    filter URLs (via ``generate_specific_urls``) and re-crawled.
    """
    name = "resume_spider"
    custom_settings = {
        # hh.ru appears to answer throttled requests with HTTP 400, so
        # those responses are retried instead of discarded.
        'RETRY_HTTP_CODES': [400],
        'RETRY_TIMES': 5,
        'LOG_ENABLED': False
    }

    def __init__(self, base_url=None, *args, **kwargs):
        super(ResumeSpider, self).__init__(*args, **kwargs)
        # Root search URL; filter variants are derived from it by the
        # module_1_pre helpers.
        self.base_url = base_url

    def start_requests(self):
        """Schedule one request per generated resume-search URL."""
        for url in generate_resume_urls(self.base_url):
            yield scrapy.Request(url=url, callback=self.parse_general)

    def parse_general(self, response):
        """Record a result page, or fan out to narrower filter URLs when
        the page count hits the 250-page pager ceiling.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        main_element = soup.find('main', class_='resume-serp-content')
        # A page counts as non-empty only if it contains at least one
        # resume card (a div carrying a data-resume-id attribute).
        if main_element and main_element.find('div', attrs={'data-resume-id': True}):
            page_count = self.get_page_count(soup)
            if page_count < 250:
                self._append_result(response.url, page_count, terminator='\n')
            else:
                # Too many pages to paginate directly: narrow the query
                # with extra filters and crawl each variant separately.
                self.base_url = response.url
                for url in generate_specific_urls(self.base_url):
                    yield scrapy.Request(url=url, callback=self.parse_specific)
        else:
            print(f'No resumes found on page: {response.url}')

    def parse_specific(self, response):
        """Record the page count of a narrowed-down (filtered) search URL."""
        soup = BeautifulSoup(response.text, 'html.parser')
        page_count = self.get_page_count(soup)
        self._append_result(response.url, page_count, terminator=',\n')

    def _append_result(self, url, pages, terminator='\n'):
        """Append one ``{url, pages}`` record to ``resume_urls.json``.

        Fix: ``parse_general`` previously opened the file without an
        explicit encoding (locale-dependent), while ``parse_specific``
        used UTF-8 — both paths now write UTF-8 consistently.
        Note the output is a stream of JSON objects, not a valid single
        JSON document; consumers must split on the terminators.
        """
        with open('resume_urls.json', 'a', encoding='utf-8') as f:
            json.dump({'url': url, 'pages': pages}, f, ensure_ascii=False, indent=4)
            f.write(terminator)

    def get_page_count(self, soup):
        """Return the highest page number shown in the pager, or 1.

        Fix: the original indexed ``find_all(...)[-1]`` unguarded, so a
        pager with no ``pager-page`` links raised IndexError (and
        non-numeric link text raised ValueError), aborting the callback.
        """
        pager = soup.find('div', class_='pager')
        if pager:
            page_links = pager.find_all('a', attrs={'data-qa': 'pager-page'})
            if page_links:
                try:
                    return int(page_links[-1].text.strip())
                except ValueError:
                    # Unexpected pager markup — fall back to a single page.
                    return 1
        return 1
def stage1(base_url):
    """Run ResumeSpider against *base_url*, blocking until the crawl ends."""
    crawler = CrawlerProcess()
    crawler.crawl(ResumeSpider, base_url)
    # start() spins up the Twisted reactor and returns only when the
    # spider has finished.
    crawler.start()
# for url in urls:
# result = generate_resume_urls(url)
# print(result)
# print("\n")
# lens = len(['https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=unknown&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=not_looking_for_job&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=looking_for_offers&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=active_search&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=has_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=accepted_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=moreThan6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between3And6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between1And3&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=noExperience&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=male&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=female&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=unfinished_higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=master&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=bachelor&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=special_secondary&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80'])
# print(lens)