# JOBScraper / app.py
# Rohit-Katkar2003's picture
# Update app.py
# 5ec0cca verified
import html
import random
import shutil  # Needed to find the binary
import time
import traceback
from urllib.parse import parse_qs, quote_plus, urlparse

import gradio as gr
import pandas as pd
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# --- LinkedIn Scraper Functions ---
def linkedin_job_search_engine(field, location=None, date_posted=None, experience_level=None):
    """Search LinkedIn's public job search page and scrape the result cards.

    Args:
        field: Search keywords; URL-encoded into the ``keywords`` parameter.
        location: Optional location text, sent as ``location``.
        date_posted: Optional "Posted Within" label. Known labels are
            translated into an ``f_TPR`` filter; unknown labels add nothing.
        experience_level: Optional integer 0-9, sent as ``f_E=<level + 1>``.
            Any other value adds no filter.

    Returns:
        A ``(message, jobs)`` tuple. ``jobs`` is a list of dicts with the keys
        ``Title``, ``Company``, ``Location`` and ``Job Link``. It is empty when
        the request fails or LinkedIn answers with a non-200 status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    # Dropdown label -> f_TPR value ("r" followed by a number of seconds).
    date_filters = {
        "Any Time": "",
        "Recent (Last 24 hours)": "r86400",
        # NOTE(review): 28800 s is 8 hours although the label says 7 - confirm intent.
        "7 Hour ago": "r28800",
        "12 hour ago": "r43200",
        "Past Week": "r604800",
        "Past Month": "r2592000",
    }
    # UI index 0-9 -> f_E=1..10 (same mapping the dropdown has always used).
    exp_level_map = {level: f"f_E={level + 1}" for level in range(10)}

    search_url = f"https://www.linkedin.com/jobs/search/?keywords={quote_plus(field)}"
    if location:
        search_url += f"&location={quote_plus(location)}"
    if date_posted:
        time_filter = date_filters.get(date_posted, "")
        if time_filter:
            search_url += f"&f_TPR={time_filter}"
    if experience_level is not None:
        exp_filter = exp_level_map.get(experience_level)
        # Only append when the level is known, so the URL never ends in a bare "&".
        if exp_filter:
            search_url += f"&{exp_filter}"

    try:
        # A timeout keeps a stalled connection from hanging the UI callback forever.
        response = requests.get(search_url, headers=headers, timeout=15)
    except requests.RequestException as exc:
        return f"Failed to fetch jobs. Request error: {exc}", []
    if response.status_code != 200:
        return f"Failed to fetch jobs. Status code: {response.status_code}", []

    def text_or_na(element):
        """Return the stripped text of a found element, or 'N/A' when missing."""
        return element.text.strip() if element is not None else 'N/A'

    soup = BeautifulSoup(response.text, 'html.parser')
    jobs = []
    for card in soup.find_all('div', class_='base-card'):
        # Easy Apply postings are deliberately left out of the results.
        if card.find('span', class_='easy-apply-label') is not None:
            continue
        link_elem = card.find('a', class_='base-card__full-link')
        jobs.append({
            'Title': text_or_na(card.find('span', class_='sr-only')),
            'Company': text_or_na(card.find('h4', class_='base-search-card__subtitle')),
            'Location': text_or_na(card.find('span', class_='job-search-card__location')),
            # .get() tolerates an anchor that has no href attribute.
            'Job Link': link_elem.get('href', '#') if link_elem is not None else '#',
        })
    return f"Found {len(jobs)} jobs", jobs
def _markdown_cell(text):
    """Make scraped text safe to place inside a single Markdown table cell."""
    # Collapse newlines/runs of whitespace: a table row must stay on one line.
    flattened = " ".join(str(text).split())
    # Neutralise raw HTML and keep "|" from being read as a column separator.
    return html.escape(flattened, quote=False).replace("|", "\\|")


def format_results(job_title, location, date_posted, experience_level):
    """Run the LinkedIn search and render the jobs as a Markdown table.

    Args:
        job_title: Search keywords passed to ``linkedin_job_search_engine``.
        location: Location text passed through unchanged.
        date_posted: "Posted Within" dropdown label passed through unchanged.
        experience_level: Dropdown value; a digit string (or an int) is used
            as the experience index, anything else falls back to 0.

    Returns:
        ``(message, markdown)``: the status message from the search and either
        a Markdown table with one row per job or a "no jobs" notice.
    """
    # Accept ints as well as the dropdown's digit strings; everything else -> 0.
    level_text = "" if experience_level is None else str(experience_level)
    try:
        exp_level_int = int(level_text) if level_text.isdigit() else 0
    except ValueError:
        # isdigit() is true for some characters int() rejects (e.g. superscripts).
        exp_level_int = 0

    message, jobs = linkedin_job_search_engine(job_title, location, date_posted, exp_level_int)
    if not jobs:
        return message, "No jobs found 😢"

    header = "\n| 📌 Title | 🏢 Company | 📍 Location | 🔗 Apply |\n|---|---|---|---|\n"
    rows = []
    for job in jobs:
        # The link is scraped too: escape it for the attribute and keep "|" out of the cell.
        href = html.escape(str(job['Job Link']), quote=True).replace("|", "%7C")
        apply_button = f'<a href="{href}" target="_blank" rel="noopener noreferrer">👉 Apply Now</a>'
        rows.append(
            f"| 💼 {_markdown_cell(job['Title'])} "
            f"| 🏢 {_markdown_cell(job['Company'])} "
            f"| 📍 {_markdown_cell(job['Location'])} "
            f"| {apply_button} |\n"
        )
    return message, header + "".join(rows)
# --- All Jobs Scraper Functions ---
# Executables to probe, in order. shutil.which() also accepts an absolute
# path and returns it only when that file exists and is executable.
_CHROME_CANDIDATES = ("google-chrome", "chromium-browser", "chromium", "/usr/bin/google-chrome")

# Substrings of the lower-cased page source that hint at a block/CAPTCHA page.
_BLOCK_MARKERS = ("blocked", "captcha", "unusual traffic", "sorry")

# CSS selectors tried in order; Google changes its result markup frequently.
_RESULT_SELECTORS = (
    "h3 a",                # Direct link within h3
    ".LC20lb.DKV0Md",      # More specific Google result title class
    ".g a[href^='http']",  # Link within result div starting with http
    ".yuRUbf a",           # Another common Google class
    ".tF2Cxc a",           # Another common structure
)


def _find_chrome_binary():
    """Return the path of the first Chrome/Chromium executable found, else None."""
    for candidate in _CHROME_CANDIDATES:
        path = shutil.which(candidate)
        if path:
            return path
    return None


def _clean_result_url(url):
    """Unwrap a Google redirect link and return the target URL, or None to skip it."""
    if not url or not url.startswith("http"):
        return None
    parsed = urlparse(url)
    # Unwrap *before* filtering: a redirect link lives on a google host itself,
    # so filtering first would throw away the result it points to.
    if parsed.path == "/url" and "google." in parsed.netloc:
        params = parse_qs(parsed.query)
        targets = params.get("q") or params.get("url")
        if not targets:
            return None
        url = targets[0]
    if not url.startswith("http") or "google.com" in url or "youtube.com" in url:
        return None
    return url


def _collect_result_urls(elements, limit):
    """Read hrefs from result elements and return up to ``limit`` unique cleaned URLs."""
    urls = []
    for index, element in enumerate(elements):
        try:
            url = _clean_result_url(element.get_attribute("href"))
        except Exception as e:
            # Best effort: one stale/broken element must not abort the whole page.
            print(f"❌ Error processing result {index}: {str(e)[:100]}...")
            continue
        if url and url not in urls:
            urls.append(url)
            print(f"{len(urls)}. {url}")
            if len(urls) >= limit:
                break
    return urls


def get_search_urls(search_url, num_results=20, is_query=True, headless=True):
    """Open a page in undetected Chrome and collect result links from it.

    Args:
        search_url: Either a search query (``is_query=True``), which is typed
            into the Google home page, or a full URL to open directly.
        num_results: Maximum number of unique URLs to return.
        is_query: Whether ``search_url`` is a query string rather than a URL.
        headless: Run Chrome with ``--headless=new`` when true.

    Returns:
        A list of unique result URLs with Google redirect wrappers removed and
        google.com / youtube.com links dropped. Empty when nothing matched or
        when the driver raised; errors are printed, not propagated.
    """
    options = Options()
    if headless:
        # --headless=new is the headless mode of newer Chrome versions.
        options.add_argument("--headless=new")
    for argument in (
        "--no-sandbox",
        "--disable-dev-shm-usage",  # Important for containerized envs
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--disable-plugins-discovery",
        "--disable-web-security",
        "--allow-running-insecure-content",
        "--window-size=1920,1080",
    ):
        options.add_argument(argument)

    # Only pin the binary when one actually exists; otherwise let
    # undetected_chromedriver fall back to its own lookup.
    chrome_executable = _find_chrome_binary()
    if chrome_executable:
        print(f"Setting Chrome binary location to: {chrome_executable}")
        options.binary_location = chrome_executable
    else:
        print("Warning: Could not find Chrome/Chromium executable. Proceeding with default (might fail).")

    # NOTE(review): this UA claims Chrome 119 on Windows - confirm it matches the deployed browser.
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")
    options.add_argument("--disable-automation")
    options.add_argument("--disable-infobars")
    # NOTE(review): some undetected_chromedriver releases reject these experimental
    # options - confirm against the pinned version.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = None
    try:
        driver = uc.Chrome(options=options)
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        if is_query:
            print("Navigating to Google...")
            driver.get("https://www.google.com")
            # Random delay to mimic human behavior
            time.sleep(random.uniform(2, 4))
            print(f"Searching for: {search_url}")
            search_box = driver.find_element(By.NAME, "q")
            search_box.clear()
            search_box.send_keys(search_url)
            search_box.submit()
        else:
            print(f"Navigating to: {search_url}")
            driver.get(search_url)
        # Wait for page to load
        time.sleep(random.uniform(3, 5))

        page_source = driver.page_source.lower()
        if any(marker in page_source for marker in _BLOCK_MARKERS):
            # Warning only: extraction is still attempted below.
            print("⚠️ Detected potential blocking (CAPTCHA, 'Sorry' page, etc.). Page might not have loaded correctly.")

        urls = []
        matched_any = False
        for selector in _RESULT_SELECTORS:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
            except Exception as e:
                print(f"✗ Selector '{selector}' failed during find_elements: {str(e)[:100]}...")
                continue
            if not elements:
                continue
            matched_any = True
            print(f"✓ Found {len(elements)} potential results with selector: '{selector}'")
            urls = _collect_result_urls(elements, num_results)
            # A selector may match elements that carry no usable href (e.g. bare
            # headings); keep trying the remaining selectors in that case.
            if urls:
                break

        if not matched_any:
            print("❌ No results found with any selector. Printing page info for debugging...")
            print(f"Page title: '{driver.title}'")
            print(f"Current URL: {driver.current_url}")
            return []

        print(f"✓ Successfully extracted {len(urls)} URLs")
        return urls
    except Exception as e:
        # Top-level boundary for the browser session: log and report "no results".
        print(f"❌ Critical error during driver execution: {e}")
        traceback.print_exc()
        return []
    finally:
        # Always release the browser process, even after an error.
        if driver:
            try:
                driver.quit()
                print("Driver quit successfully.")
            except Exception as e:
                print(f"Error quitting driver: {e}")
        else:
            print("Driver was not initialized, nothing to quit.")
# portal -> (pre-encoded site expression, add location?, add experience terms?)
_PORTAL_QUERIES = {
    'indeed': ('site:indeed.com', True, True),
    'greenhouse': ('site:greenhouse.io', False, False),
    'lever': ('site:lever.co', False, False),
    'ashby': ('site:ashbyhq.com', False, False),
    'pinpoint': ('site:pinpointhq.com', True, True),
    'job_subdomain': ('site:jobs.*', False, False),
    'careers_page': ('(site%3Acareers.*%20OR%20site%3A*%2Fcareers%2F*%20OR%20site%3A*%2Fcareer%2F*)', True, True),
    'talent_subdomain': ('site:talent.*', False, False),
    'paylocity': ('site:recruiting.paylocity.com', False, False),
    'keka': ('site:keka.com', True, True),
    'workable': ('site:jobs.workable.com', False, False),
    'breezyHR': ('site:breezy.hr', False, False),
    'wellfound': ('site:wellfound.com', True, False),
    'y_combinator': ('site:workatastartup.com', False, False),
    'oracle_cloud': ('site:oraclecloud.com', False, False),
    'workday': ('site:myworkdayjobs.com', False, False),
    'recruitee': ('site:recruitee.com', False, False),
    'rippling': ('(site%3Arippling.com%20OR%20site%3Arippling-ats.com)', False, False),
    'gusto': ('site:jobs.gusto.com', False, False),
    'teamtailor': ('site:teamtailor.com', False, False),
    'smartrecruiters': ('site:jobs.smartrecruiters.com', False, False),
    'builtin': ('site:builtin.com/job/', False, False),
    'glassdoor': ('site:glassdoor.com/job-listing/', False, False),
    'all_jobs': ('(site%3A*%2Femployment%2F*%20OR%20site%3A*%2Fopportunities%2F*%20OR%20site%3A*%2Fopenings%2F*%20OR%20site%3A*%2Fjoin-us%2F*%20OR%20site%3A*%2Fwork-with-us%2F*)', False, False),
}

# Experience label -> already URL-encoded keywords ("+" separates words, so a
# literal plus sign has to be written as %2B).
_EXPERIENCE_TERMS = {
    "Entry Level": "entry+level+junior+fresher",
    "Mid Level": "mid+level+2-5+years",
    "Senior Level": "senior+lead+5%2B+years",
    "Executive": "director+manager+executive+head",
}


def search_job(portal, job_title, job_type, location, posting, experience_level=""):
    """Build the Google search URL that finds jobs on one portal.

    Args:
        portal: Key of the portal (e.g. ``'indeed'``, ``'lever'``).
        job_title: Job title keywords.
        job_type: Extra keyword such as ``remote``; empty or ``any`` adds nothing.
        location: Location keyword; only used by portals whose query includes it.
        posting: Google ``qdr`` recency code (e.g. ``d``, ``w``, ``h4``); an
            empty value means no time filter.
        experience_level: ``"Any"``/empty, or one of ``Entry Level``,
            ``Mid Level``, ``Senior Level``, ``Executive``; only used by
            portals whose query includes experience terms.

    Returns:
        The search URL, or ``""`` when ``portal`` is not known.
    """
    portal_entry = _PORTAL_QUERIES.get(portal)
    if portal_entry is None:
        return ""
    site_expression, with_location, with_experience = portal_entry

    terms = []
    if job_title:
        terms.append(quote_plus(job_title))
    terms.append(site_expression)
    # "any" means "no restriction", so it must not become a literal keyword.
    if job_type and job_type.lower() != "any":
        terms.append(quote_plus(job_type))
    if with_location and location:
        terms.append(quote_plus(location))
    if with_experience:
        experience_terms = _EXPERIENCE_TERMS.get(experience_level)
        if experience_terms:
            terms.append(experience_terms)

    url = "https://www.google.com/search?q=" + "+".join(terms)
    if posting:
        url += f"&tbs=qdr:{quote_plus(posting)}"
    return url
def _render_results_table(records):
    """Render result dicts as an HTML table whose URL column is a clickable link."""
    header_style = "padding: 8px; text-align: left; border: 1px solid #ddd;"
    cell_style = "padding: 8px; border: 1px solid #ddd;"
    columns = list(records[0])

    parts = [
        "<table border='1' class='dataframe' style='width:100%; border-collapse: collapse;'>",
        "<thead><tr style='background-color: #f2f2f2;'>",
    ]
    parts.extend(f"<th style='{header_style}'>{html.escape(str(col))}</th>" for col in columns)
    parts.append("</tr></thead><tbody>")
    for record in records:
        parts.append("<tr>")
        for col in columns:
            # Values come from user input and scraped pages: escape them, including
            # quotes, because the href attribute below is single-quoted.
            value = html.escape(str(record[col]), quote=True)
            if col == 'URL':
                parts.append(
                    f"<td style='{cell_style}'><a href='{value}' target='_blank' "
                    "style='color: #1f77b4;' rel='noopener noreferrer'>Apply Now</a></td>"
                )
            else:
                parts.append(f"<td style='{cell_style}'>{value}</td>")
        parts.append("</tr>")
    parts.append("</tbody></table>")
    return "".join(parts)


def search_jobs_interface(job_title, job_type, location, posting, experience_level, selected_portals, num_results):
    """Gradio callback: search every selected portal and collect the job links.

    Args:
        job_title: Job title keywords; must not be blank.
        job_type: Job type keyword forwarded to ``search_job``.
        location: Location keyword forwarded to ``search_job``.
        posting: Recency code forwarded to ``search_job``.
        experience_level: Experience label forwarded to ``search_job``.
        selected_portals: Portal keys to search; must not be empty.
        num_results: Maximum number of URLs to collect per portal.

    Returns:
        ``(progress_text, dataframe_or_None, html)``. The DataFrame holds one
        row per found URL; it is ``None`` on validation errors or when nothing
        was found. Per-portal failures are reported in ``progress_text``.
    """
    if not job_title or not job_title.strip():
        return "❌ Please enter a job title", None, "<p style='color:red;'>❌ Please enter a job title</p>"
    if not selected_portals:
        return "❌ Please select at least one job portal", None, "<p style='color:red;'>❌ Please select at least one job portal</p>"

    all_results = []
    progress_messages = []
    for portal in selected_portals:
        try:
            progress_messages.append(f"🔍 Searching {portal}...")
            search_url = search_job(portal, job_title, job_type, location, posting, experience_level)
            if not search_url:
                progress_messages.append(f"❌ Invalid portal: {portal}")
                continue
            urls = get_search_urls(search_url, num_results=num_results, is_query=False, headless=True)
            all_results.extend(
                {
                    'Portal': portal.title(),
                    'Job Title': job_title,
                    'Location': location,
                    'Job Type': job_type,
                    'Experience Level': experience_level,
                    'URL': url,
                }
                for url in urls
            )
            progress_messages.append(f"✅ Found {len(urls)} jobs on {portal}")
        except Exception as e:
            # Per-portal boundary: report the failure and carry on with the next portal.
            progress_messages.append(f"❌ Error searching {portal}: {str(e)}")

    progress_summary = "\n".join(progress_messages)
    progress_summary += f"\n\n📊 Total Results: {len(all_results)} jobs found"

    if not all_results:
        no_results_html = "<p style='color:red;'>❌ No jobs found. Try different search parameters.</p>"
        return progress_summary + "\n\n❌ No jobs found. Try different search parameters.", None, no_results_html

    df = pd.DataFrame(all_results)
    return progress_summary, df, _render_results_table(all_results)
# --- Define the Gradio interface with Tabs ---
def create_gradio_interface():
    """Build the two-tab Gradio Blocks UI and return it without launching it.

    Tab 1 wires the LinkedIn form to ``format_results``; tab 2 wires the
    Google-based multi-portal form to ``search_jobs_interface``.
    """
    # Choice lists are plain data, so they are prepared before any component exists.
    portal_choices = [
        'indeed', 'greenhouse', 'lever', 'ashby', 'pinpoint',
        'job_subdomain', 'careers_page', 'talent_subdomain',
        'paylocity', 'keka', 'workable', 'breezyHR', 'wellfound',
        'y_combinator', 'oracle_cloud', 'workday', 'recruitee',
        'rippling', 'gusto', 'teamtailor', 'smartrecruiters',
        'builtin', 'glassdoor', 'all_jobs',
    ]
    portals_checked_by_default = [
        'indeed', 'greenhouse', 'lever', 'builtin', 'glassdoor',
        'job_subdomain', 'careers_page', 'talent_subdomain',
        'paylocity', 'keka', 'workable', 'breezyHR', 'wellfound',
        'y_combinator', 'oracle_cloud', 'workday', 'recruitee',
        'rippling', 'gusto', 'teamtailor', 'smartrecruiters', 'all_jobs',
    ]
    linkedin_date_choices = [
        "Any Time", "Recent (Last 24 hours)", "7 Hour ago",
        "12 hour ago", "Past Week", "Past Month",
    ]
    # (label shown to the user, value handed to the callback)
    google_posting_choices = [
        ('4 hour ago', 'h4'), ('8 hour ago', 'h8'), ('12 hour ago', 'h12'),
        ("Last 24 hours", "d"), ('2 days ago', 'h48'), ('3 days ago', 'h72'),
        ("Last week", "w"), ("Last month", "m"), ("Any time", ""),
    ]
    linkedin_level_choices = list(map(str, range(10)))  # "0" .. "9"

    with gr.Blocks(title="AI Job Search Engine", theme=gr.themes.Soft()) as ui:
        gr.Markdown("# 🚀 AI-Powered Job Search Engine")
        with gr.Tabs():
            # ---------------- Tab 1: LinkedIn ----------------
            with gr.TabItem("LinkedIn Jobs"):
                gr.Markdown("## 🔍 Search Jobs on LinkedIn")
                with gr.Row():
                    # Left column: the search form.
                    with gr.Column(scale=2):
                        li_title = gr.Textbox(
                            label="💼 Job Title",
                            placeholder="e.g., AI ML Engineer, Data Scientist",
                            value="AI ML Engineer",
                        )
                        with gr.Row():
                            li_location = gr.Textbox(
                                label="📍 Location",
                                placeholder="e.g., Pune, Mumbai, Bangalore",
                                value="Pune",
                            )
                            li_posted = gr.Dropdown(
                                label="📅 Posted Within",
                                choices=linkedin_date_choices,
                                value="Past Week",
                            )
                        with gr.Row():
                            li_level = gr.Dropdown(
                                label="⭐ Experience Level (Years)",
                                choices=linkedin_level_choices,
                                value="0",
                            )
                            # Hidden placeholder kept so the row keeps its second slot.
                            gr.Textbox(visible=False)
                        li_search = gr.Button("🔍 Search LinkedIn Jobs", variant="primary")
                    # Right column: status message and rendered listings.
                    with gr.Column(scale=3):
                        li_message = gr.Textbox(
                            label="📈 Message",
                            lines=2,
                            max_lines=5,
                            interactive=False,
                        )
                        li_listings = gr.Markdown(label="📋 Job Listings")
                li_search.click(
                    fn=format_results,
                    inputs=[li_title, li_location, li_posted, li_level],
                    outputs=[li_message, li_listings],
                )

            # ---------------- Tab 2: Google-based search ----------------
            with gr.TabItem("All Jobs (Google Search)"):
                gr.Markdown("## 🌐 Search Jobs across the Web (via Google)")
                with gr.Row():
                    # Left column: the search form.
                    with gr.Column(scale=2):
                        title_box = gr.Textbox(
                            label="💼 Job Title",
                            placeholder="e.g., AI ML Engineer, Data Scientist, Software Developer",
                            value="AI ML",
                        )
                        with gr.Row():
                            type_dropdown = gr.Dropdown(
                                label="🏢 Job Type",
                                choices=["remote", "on-site", "hybrid", "any"],
                                value="remote",
                            )
                            location_box = gr.Textbox(
                                label="📍 Location",
                                placeholder="e.g., Pune, Mumbai, Bangalore",
                                value="pune",
                            )
                        with gr.Row():
                            posting_dropdown = gr.Dropdown(
                                label="📅 Posted Within",
                                choices=google_posting_choices,
                                value="d",
                            )
                            level_dropdown = gr.Dropdown(
                                label="⭐ Experience Level",
                                choices=["Any", "Entry Level", "Mid Level", "Senior Level", "Executive"],
                                value="Any",
                            )
                        portal_boxes = gr.CheckboxGroup(
                            label="🌐 Select Job Portals",
                            choices=portal_choices,
                            value=portals_checked_by_default,
                        )
                        per_portal_slider = gr.Slider(
                            label="📊 Results per Portal",
                            minimum=1,
                            maximum=30,
                            value=20,
                            step=1,
                        )
                        run_search = gr.Button("🔍 Search Jobs", variant="primary", size="lg")
                    # Right column: progress log and clickable results.
                    with gr.Column(scale=3):
                        progress_box = gr.Textbox(
                            label="📈 Search Progress",
                            lines=10,
                            max_lines=15,
                            interactive=False,
                        )
                        results_html = gr.HTML(label="📋 Clickable Job Results")
                # The callback also returns a DataFrame; it is routed to a hidden
                # component created right here, as the second output.
                hidden_frame = gr.Dataframe(visible=False)
                run_search.click(
                    fn=search_jobs_interface,
                    inputs=[
                        title_box, type_dropdown, location_box, posting_dropdown,
                        level_dropdown, portal_boxes, per_portal_slider,
                    ],
                    outputs=[progress_box, hidden_frame, results_html],
                )
    return ui
# Launch the application
if __name__ == "__main__":
    # Only build and launch the UI when this file is executed as a script,
    # not when it is imported as a module.
    app = create_gradio_interface()
    app.launch()