Spaces:
Sleeping
Sleeping
| import logging | |
| from os import listdir | |
| from os.path import isfile, join | |
| import easygui | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from pathvalidate import sanitize_filename | |
| from xhtml2pdf import pisa | |
| """ | |
| This script takes a LinkedIn job posting URL | |
| and converts the description to a PDF file. | |
| The PDF file is saved in the Data/JobDescription folder. | |
| The name will be OrgName__Job Title_X.pdf, where X is the number of files in the folder. | |
| IMPORTANT: Make sure the URL is to the actual job description, | |
| and not the job search page. | |
| """ | |
def linkedin_to_pdf(job_url: str):
    """Download a LinkedIn job posting and save its description as a PDF.

    The page at ``job_url`` is fetched and parsed; the job title, the
    organization name and the description markup are extracted, and the
    description is rendered to
    ``Data/JobDescription/<organization>__<title>_<N>.pdf``, where ``N`` is
    the number of regular files already present in that folder.

    Args:
        job_url: URL of the job posting page itself (not a job search page).

    Returns:
        None. Returns early, after printing a message, when the server
        answers with a status code other than 200.

    Raises:
        SystemExit: With exit status 1 when fetching, parsing or writing
            fails; the underlying error is logged first.
    """
    from os import makedirs  # local import: only this function needs it

    job_path = "Data/JobDescription/"
    # Create the output folder up front so that neither the file count below
    # nor the final open() fails on a fresh checkout.
    makedirs(job_path, exist_ok=True)
    files_number = sum(1 for f in listdir(job_path) if isfile(join(job_path, f)))

    try:
        # Without a timeout a stalled connection would block forever.
        page = requests.get(job_url, timeout=30)
        if page.status_code != 200:
            print(
                f"Failed to retrieve the job posting at {job_url}. Status code: {page.status_code}"
            )
            return

        # Parse the HTML content of the job posting using BeautifulSoup
        soup = BeautifulSoup(page.text, "html.parser")

        # Find the job title element and get the text
        title_element = soup.find("h1", {"class": "topcard__title"})
        if title_element is None:
            raise ValueError("job title element not found on the page")
        job_title = title_element.text.strip()

        # Find the organization name element (try both selectors)
        organization_element = soup.find("span", {"class": "topcard__flavor"})
        if organization_element is None:
            organization_element = soup.find("a", {"class": "topcard__org-name-link"})
        if organization_element is None:
            raise ValueError("organization name element not found on the page")
        organization = organization_element.text.strip()

        # Find the job description element and concatenate its child nodes
        job_description_element = soup.find(
            "div", {"class": "show-more-less-html__markup"}
        )
        if job_description_element is None:
            # Keep the original best-effort behaviour (an empty document is
            # still written) but make the situation visible in the log.
            logging.warning("No job description found at %s", job_url)
            job_description = ""
        else:
            job_description = "".join(
                str(element) for element in job_description_element.contents
            )

        # Set file_path and sanitize organization name and job title
        file_path = f"{job_path}{sanitize_filename(organization + '__' + job_title)}_{files_number}.pdf"

        # Create a PDF file and write the job description to it
        with open(file_path, "wb") as pdf_file:
            status = pisa.CreatePDF(job_description, dest=pdf_file, encoding="utf-8")

        # CreatePDF reports rendering problems through the returned status
        # object rather than by raising.
        if status.err:
            logging.warning("PDF written to %s with rendering errors", file_path)
        else:
            logging.info("PDF saved to " + file_path)
    except Exception:
        logging.exception("Could not get the description from the URL: %s", job_url)
        # The site-provided exit() is meant for interactive use and would
        # report success (status 0); signal the failure explicitly.
        raise SystemExit(1)
if __name__ == "__main__":
    # Ask the user for the posting URL. enterbox() returns None when the
    # dialog is cancelled, so only strip and convert when a value came back.
    url = easygui.enterbox("Enter the URL of the LinkedIn Job Posting:")
    if url is not None:
        linkedin_to_pdf(url.strip())