Spaces:
Paused
Paused
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def LINKEDIN_Scrapping(job_search, num_jobs, driver):
    """Scrape LinkedIn public job-search results and return one combined DataFrame.

    Parameters
    ----------
    job_search : str
        Search phrase, e.g. ``"data scientist"``.  Words are joined with
        ``%20`` in the search URL (any number of words is supported).
    num_jobs : int
        Number of job cards to collect before parsing stops.
    driver : selenium WebDriver
        An already-constructed driver; it is reused for every page visit
        (the original code leaked a fresh Chrome instance per job URL).

    Returns
    -------
    pandas.DataFrame
        One row per job with columns: Title, Location, URLs, Company_Name,
        post_time, all_about_job, company_logo, plus whatever "job criteria"
        fields (seniority, employment type, ...) each posting page exposes;
        missing criteria are filled with 'Not_Found'.
    """
    # These two libraries are used below but never imported at file level;
    # import them locally so the function is self-contained.
    import requests
    from bs4 import BeautifulSoup

    # Build the search URL.  '%20'.join generalises the original hard-coded
    # two-word handling to queries of any length.
    keywords = '%20'.join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords
             + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
               '&position=1&pageNum=0')

    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5

    # --- Load job cards until at least num_jobs are present -----------------
    l1 = []
    ll = []
    while True:
        l1 = driver.find_elements(
            By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(
            By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)

        # Scroll to the bottom repeatedly until the page height stops growing
        # (i.e. lazy-loading has caught up).
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Click "see more jobs" to load the next batch of cards.
        # NOTE: the original also called options.add_argument(...) here on an
        # undefined name ('options'), which would raise NameError; removed.
        WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[2]/button'))
        ).click()
        time.sleep(2)

    # --- Parse the visible text of the first num_jobs cards -----------------
    # Card text layout (observed): [dup title, title, company, location,
    # optional badge, posting time]; a 5-line card has no badge.
    for info in l1[:num_jobs]:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            # Guard against short cards: the original indexed [5] blindly,
            # which raises IndexError on unexpected layouts.
            post_time.append(info_tot[5] if len(info_tot) > 5 else info_tot[-1])

    # Collect each card's hyperlink.
    links = [a.get_attribute('href') for a in ll[:num_jobs]]

    df_ml = pd.DataFrame({
        'Title': title,
        'Location': location,
        'URLs': links,
        'Company_Name': company_name,
        'post_time': post_time,
    })

    # --- Per-job description text and company logo --------------------------
    def _all_description_logo(urls):
        """Visit each job page, expand 'show more', collect logo URL + description lines."""
        description = []
        logo = []
        for url in urls:
            # Reuse the shared driver: the original created (and never quit)
            # a new Chrome instance per URL — a resource leak.
            driver.get(url)
            # Expand the truncated job description.
            WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]'
                               '/section[1]/div/div/section/button[1]'))
            ).click()
            img = driver.find_element(
                By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            logo.append(img.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(
                By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]'
                          '/section[1]/div/div/section/div')
            text = t.text
            # Drop a trailing "show more" label if present (the original
            # detected it by reversing the string).
            if text[::-1][:9] == "erom wohs":
                text = text[:len(text) - 9]
            # Split into non-empty lines.
            description.append([line for line in text.split("\n") if line])
        return pd.DataFrame({'all_about_job': description, 'company_logo': logo})

    E = _all_description_logo(links)

    # --- Per-job "criteria" fields via plain HTTP ---------------------------
    def _other(urls):
        """Scrape each posting's 'job criteria' list (seniority, type, ...) with requests."""
        rows = []
        for url in urls:
            resp = requests.get(url)
            soup = BeautifulSoup(resp.content, 'html.parser')
            criteria = soup.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            row = {}
            if criteria is not None:  # page may render without the criteria list
                headers = criteria.find_all('h3')
                values = criteria.find_all('span')  # hoisted out of the loop
                for h, v in zip(headers, values):
                    row[h.text.replace('\n', ' ').strip()] = \
                        v.text.replace('\n', ' ').strip()
            rows.append(row)
        # DataFrame(rows) replaces the per-row DataFrame.append(...) calls,
        # which were removed in pandas 2.x; missing keys become NaN.
        return pd.DataFrame(rows)

    df = _other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Combine main info, descriptions/logos, and criteria side by side.
    return pd.concat([df_ml, E, df], axis=1)