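# Spaces: Sleeping, Sleeping
# NOTE(review): the line above looks like page-status text captured together with this
# listing rather than part of the module — confirm and remove if so.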
import csv
import logging
import os
import pickle
import sys
import time
import urllib.parse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from exception import MyException
from src.CodeRunAndModelTrain.constants import LINKED_IN_USER_NAME, LINKED_IN_USER_PASSWORD
from src.CodeRunAndModelTrain.entity.artifact_entity import JobFetcherArtifact
from src.CodeRunAndModelTrain.entity.config_entity import JobFetcherConfig
from utils.asyncHandler import asyncHandler

# Module-level logger named after this module; JobFetcher reports its progress,
# warnings and per-job scraping errors through it.
logger = logging.getLogger(__name__)
class JobFetcher:
    """Scrape LinkedIn job listings for a search term into a CSV file.

    The constructor starts a headless Chrome session and opens the configured
    target URL. ``fetch`` is the full pipeline (log in if no cookie file
    exists, load cookies, search, scrape, write CSV); ``get_jobs`` and
    ``save_jobs`` are its later stages and can be called directly.

    An instance is single-use: the browser is shut down when ``save_jobs`` or
    ``fetch`` finishes, whether it succeeded or raised.

    NOTE: the public methods are declared ``async`` but drive Selenium with
    blocking calls and ``time.sleep``, so they occupy the event loop while
    they run.
    """

    # Maximum number of listings scraped per search.
    MAX_JOBS = 20

    # Header row of the CSV produced by ``save_jobs``.
    CSV_HEADER = ["Title", "Company", "Job Link", "Apply Link", "Description", "img_link"]

    # Id of the throw-away <textarea> used to read the clipboard.
    _CLIPBOARD_ELEMENT_ID = "dummy-clipboard-123"

    def __init__(self, job_fetcher_config: "JobFetcherConfig"):
        """Start headless Chrome and navigate to ``job_fetcher_config.target_url``.

        Args:
            job_fetcher_config: supplies ``target_url``, ``web_driver_wait``
                (explicit-wait timeout), ``saved_jobs_file_path`` and
                ``saved_cookie_path``.
        """
        self.job_fetcher_config = job_fetcher_config
        self._driver_closed = False

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, self.job_fetcher_config.web_driver_wait)
        logger.info(f"Initialized JobFetcher with URL: {self.job_fetcher_config.target_url}")
        self.driver.get(self.job_fetcher_config.target_url)

    # ------------------------------------------------------------------
    # Public pipeline
    # ------------------------------------------------------------------

    async def get_jobs(self, cookies, jobtile: str):
        """Install *cookies* in the browser session, refresh, then scrape.

        Args:
            cookies: iterable of cookie dicts as accepted by
                ``WebDriver.add_cookie``; cookies the driver rejects are
                skipped.
            jobtile: search term typed into the search box.

        Returns:
            Path of the CSV file written by ``save_jobs``.
        """
        logger.info("Adding cookies to the session")
        for cookie in cookies:
            try:
                self.driver.add_cookie(cookie)
            except Exception as e:
                # Best effort: one rejected cookie must not abort the run.
                logger.debug(f"Failed to add a cookie: {e}")
        self.driver.refresh()
        logger.info("Session refreshed with cookies")
        return await self.save_jobs(jobtile=jobtile)

    async def save_jobs(self, jobtile: str):
        """Search for *jobtile*, scrape up to ``MAX_JOBS`` results and write them to CSV.

        The CSV is written next to the configured ``saved_jobs_file_path`` as
        ``<jobtile>.csv`` (path separators in *jobtile* are replaced with
        ``_``), and ``job_fetcher_config.saved_jobs_file_path`` is updated to
        that path. A job that fails to scrape is logged and skipped.

        Returns:
            Path of the CSV file.

        Raises:
            Exception: whatever Selenium raises if the search box, the 'Jobs'
                tab or the result list cannot be located in time. The browser
                is shut down in every case.
        """
        try:
            logger.info(f"Starting job search for: {jobtile}")
            jobs = self._search_jobs(jobtile)

            dirname = os.path.dirname(self.job_fetcher_config.saved_jobs_file_path)
            if dirname:
                # os.makedirs("") raises, so only create a real directory part.
                os.makedirs(dirname, exist_ok=True)
            logger.info(f"Found {len(jobs)} potential jobs. Scraping top {self.MAX_JOBS}.")

            csv_path = os.path.join(dirname, self._safe_file_stem(jobtile) + ".csv")
            self.job_fetcher_config.saved_jobs_file_path = csv_path

            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(self.CSV_HEADER)
                for idx, job in enumerate(jobs[: self.MAX_JOBS]):
                    try:
                        row = self._scrape_job(job)
                        logger.info(f"Scraped {idx+1}: {row[0]} | {row[1]}")
                        writer.writerow(row)
                        # Flush per row so already-scraped jobs survive a later crash.
                        f.flush()
                    except Exception as e:
                        logger.error(f"Error scraping job at index {idx}: {e}")

            logger.info(f"All jobs saved to {csv_path}")
            return csv_path
        finally:
            # Previously the browser was only closed on success, leaking a
            # Chrome process whenever the search or scrape raised.
            self._quit_driver()

    async def fetch(self, jobtile: str) -> "JobFetcherArtifact":
        """Run the whole pipeline for *jobtile* and return the resulting artifact.

        If the cookie file at ``job_fetcher_config.saved_cookie_path`` does not
        exist, logs in with the configured credentials and saves the session
        cookies there first. The cookies are then loaded and handed to
        ``get_jobs``.

        Returns:
            ``JobFetcherArtifact`` whose ``saved_jobs_file_path`` is the CSV path.
        """
        logger.info(f"Starting fetch pipeline for job title: {jobtile}")
        try:
            saved_cookie_file_path = self.job_fetcher_config.saved_cookie_path
            if not os.path.exists(saved_cookie_file_path):
                self._login_and_save_cookies(saved_cookie_file_path)

            logger.info("Loading cookies from file")
            # SECURITY: pickle.load executes arbitrary code from the file; the
            # cookie file must only ever be one written by this class.
            with open(saved_cookie_file_path, "rb") as cookie_file:
                cookies = pickle.load(cookie_file)

            saved_job_cs = await self.get_jobs(cookies=cookies, jobtile=jobtile)
        finally:
            # Covers failures before save_jobs is reached (login, cookie load).
            self._quit_driver()
        return JobFetcherArtifact(saved_jobs_file_path=saved_job_cs)

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _job_id_from_url(url):
        """Return the ``currentJobId`` query parameter of *url*, or ``None`` if absent."""
        query_params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        values = query_params.get("currentJobId")
        return values[0] if values else None

    @staticmethod
    def _safe_file_stem(jobtile):
        """Return *jobtile* with path separators replaced so it is a single file name."""
        return jobtile.replace("/", "_").replace("\\", "_")

    @staticmethod
    def _first_success(getters, default="N/A"):
        """Return the result of the first callable in *getters* that does not raise, else *default*."""
        for getter in getters:
            try:
                return getter()
            except Exception as exc:
                logger.debug(f"Lookup attempt failed, trying next fallback: {exc}")
        return default

    # ------------------------------------------------------------------
    # Browser helpers
    # ------------------------------------------------------------------

    def _quit_driver(self) -> None:
        """Shut the browser down once; later calls are no-ops."""
        if getattr(self, "_driver_closed", False):
            return
        self._driver_closed = True
        try:
            self.driver.quit()
        except Exception as exc:
            logger.debug(f"Ignoring error while quitting the driver: {exc}")

    def _login_and_save_cookies(self, saved_cookie_file_path) -> None:
        """Submit the login form and pickle the session cookies to *saved_cookie_file_path*."""
        logger.info("Cookies not found. Initiating LinkedIn login.")
        self.driver.get("https://www.linkedin.com/login")
        username = self.wait.until(
            EC.presence_of_element_located((By.ID, "username"))
        )
        password = self.driver.find_element(By.ID, "password")
        username.send_keys(LINKED_IN_USER_NAME)
        password.send_keys(LINKED_IN_USER_PASSWORD)
        password.send_keys(Keys.RETURN)
        time.sleep(5)

        # NOTE(review): cookies are saved without checking that the login
        # succeeded, so a failed login is cached until the file is deleted.
        cookie_dir = os.path.dirname(saved_cookie_file_path)
        if cookie_dir:
            os.makedirs(cookie_dir, exist_ok=True)
        with open(saved_cookie_file_path, "wb") as cookie_file:
            pickle.dump(self.driver.get_cookies(), cookie_file)
        logger.info(f"Cookies saved to {saved_cookie_file_path}")

    def _search_jobs(self, jobtile):
        """Type *jobtile* into the search box, open the 'Jobs' tab and return the result elements."""
        search_box = self.wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "input[data-testid='typeahead-input']")
            )
        )
        search_box.send_keys(jobtile)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)

        logger.info("Clicking on 'Jobs' tab")
        jobs_tab = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Jobs')]"))
        )
        jobs_tab.click()
        time.sleep(5)

        logger.info("Waiting for job listings to load")
        return self.wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "ul.semantic-search-results-list li")
            )
        )

    def _scrape_job(self, job):
        """Open one result *job* and return its CSV row, in ``CSV_HEADER`` order."""
        self.driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", job
        )
        time.sleep(1)
        job.click()
        time.sleep(4)

        # Extraction order is kept: the job link is needed by the apply-link
        # step, and the apply-link step may open another window, so it runs last.
        job_link = self._extract_job_link()
        title = self._extract_title()
        company = self._extract_company()
        img_link = self._extract_image_link(job)
        description = self._extract_description()
        apply_link = self._extract_apply_link(job_link)
        return [title, company, job_link, apply_link, description, img_link]

    def _extract_job_link(self):
        """Return the job URL via 'Share' -> 'Copy link', falling back to the page URL."""
        try:
            logger.info("Attempting to get job link via 'Share' -> 'Copy link'")
            share_btn = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".job-details-jobs-unified-top-card__top-buttons .social-share__dropdown-trigger, .jobs-unified-top-card__top-buttons .social-share__dropdown-trigger")
            ))
            # Clicks go through JavaScript rather than WebElement.click().
            self.driver.execute_script("arguments[0].click();", share_btn)
            time.sleep(1)

            copy_link_btn = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".social-share__item--copy-link")
            ))
            self.driver.execute_script("arguments[0].click();", copy_link_btn)
            time.sleep(1)

            job_link = self._read_clipboard()
            if not job_link or "linkedin.com" not in job_link:
                raise Exception("Clipboard extraction was empty or invalid")
            logger.info(f"Successfully got job link from share button: {job_link}")
            return job_link
        except Exception as e:
            logger.warning(f"Failed to copy link via share button: {e}. Falling back to URL parsing.")

        current_url = self.driver.current_url
        job_id = self._job_id_from_url(current_url)
        if job_id is None:
            # No job id in the query string: use the URL without its query.
            return current_url.split('?')[0]
        job_link = f"https://www.linkedin.com/jobs/view/{job_id}/"
        logger.info(f"Successfully got job link from URL parsing: {job_link}")
        return job_link

    def _read_clipboard(self):
        """Paste the clipboard into a temporary <textarea> and return its value."""
        element_id = self._CLIPBOARD_ELEMENT_ID
        self.driver.execute_script(
            """
            var input = document.createElement('textarea');
            input.id = arguments[0];
            document.body.appendChild(input);
            input.focus();
            """,
            element_id,
        )
        try:
            dummy_input = self.driver.find_element(By.ID, element_id)
            # NOTE(review): Ctrl+V is assumed to be the paste shortcut of the
            # platform the browser runs on — confirm for the deployment target.
            dummy_input.send_keys(Keys.CONTROL, 'v')
            return dummy_input.get_attribute('value')
        finally:
            # Always remove the helper element, even when the paste failed.
            self.driver.execute_script(
                "var el = document.getElementById(arguments[0]); if (el) { el.remove(); }",
                element_id,
            )

    def _extract_title(self):
        """Return the job title from the detail pane, or "N/A"."""
        return self._first_success([
            lambda: self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".job-details-jobs-unified-top-card__job-title, .jobs-unified-top-card__job-title")
            )).text.strip(),
            lambda: self.driver.find_element(By.CSS_SELECTOR, "h2.t-24").text.strip(),
        ])

    def _extract_company(self):
        """Return the company name from the detail pane, or "N/A"."""
        return self._first_success([
            lambda: self.driver.find_element(
                By.CSS_SELECTOR,
                ".job-details-jobs-unified-top-card__company-name, .jobs-unified-top-card__company-name, .job-details-jobs-unified-top-card__primary-description-container a",
            ).text.strip(),
        ])

    def _extract_image_link(self, job):
        """Return the logo ``src`` from the result card *job*, else from the detail pane, else "N/A"."""
        return self._first_success([
            lambda: job.find_element(
                By.CSS_SELECTOR,
                "img.ivm-view-attr__img--centered, .ivm-view-attr__img-wrapper img, .job-card-container__company-logo img",
            ).get_attribute("src"),
            lambda: self.driver.find_element(
                By.CSS_SELECTOR,
                ".job-details-jobs-unified-top-card__company-logo img, .jobs-unified-top-card__company-logo img",
            ).get_attribute("src"),
        ])

    def _extract_description(self):
        """Return the job description text, or "N/A"."""
        return self._first_success([
            lambda: self.driver.find_element(By.ID, "job-details").text.strip(),
            lambda: self.driver.find_element(
                By.CSS_SELECTOR, ".jobs-description__content, .jobs-box__html-content"
            ).text.strip(),
        ])

    def _extract_apply_link(self, job_link):
        """Return where to apply: *job_link* for an apply <button>, else the external target.

        Falls back to ``"LinkedIn (No direct button found)"`` when neither an
        apply button nor an apply anchor can be resolved.
        """
        # 1. An apply <button> (Easy Apply) means the application happens on the job page itself.
        try:
            self.driver.find_element(By.CSS_SELECTOR, "button.jobs-apply-button")
        except Exception as exc:
            logger.debug(f"No apply button found, checking for an external apply link: {exc}")
        else:
            return job_link

        # 2. An apply <a> leads to an external site, possibly in a new window.
        apply_link = "LinkedIn (No direct button found)"
        try:
            external_btn = self.driver.find_element(By.CSS_SELECTOR, "a.jobs-apply-button")
            # Read href before clicking: the element may be stale afterwards.
            href = external_btn.get_attribute("href")
            original_window = self.driver.current_window_handle
            external_btn.click()
            time.sleep(3)

            new_windows = [
                handle for handle in self.driver.window_handles
                if handle != original_window
            ]
            if new_windows:
                try:
                    self.driver.switch_to.window(new_windows[0])
                    apply_link = self.driver.current_url
                    self.driver.close()
                finally:
                    # Always return to the results window, otherwise every
                    # following job would be scraped from the wrong window.
                    self.driver.switch_to.window(original_window)
            else:
                apply_link = href
        except Exception as exc:
            logger.debug(f"Could not resolve an external apply link: {exc}")
        return apply_link