Spaces:

Rohit-Katkar2003
/

JOBScraper

Sleeping

App Files Files Community

JOBScraper / app.py

Rohit-Katkar2003

Update app.py

5ec0cca verified 5 months ago

raw

history blame contribute delete

27.8 kB

	import gradio as gr
	import undetected_chromedriver as uc
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.options import Options
	import time
	import random
	from urllib.parse import quote_plus
	import pandas as pd
	import requests
	from bs4 import BeautifulSoup
	import shutil # Needed to find the binary
	# --- LinkedIn scraper functions ---

	def linkedin_job_search_engine(field, location=None, date_posted=None, experience_level=None):
	    """Fetch LinkedIn's job-search page and parse the job cards it contains.

	    Args:
	        field: Search keywords (URL-encoded into the ``keywords`` parameter).
	        location: Optional location text; omitted from the URL when falsy.
	        date_posted: Optional label from the "Posted Within" dropdown. Unknown
	            labels and "Any Time" add no date filter.
	        experience_level: Optional integer 0-9; level ``n`` is sent as
	            ``f_E=n+1``. Values outside that range add no filter.

	    Returns:
	        A ``(message, jobs)`` tuple. ``jobs`` is a list of dicts with the keys
	        ``'Title'``, ``'Company'``, ``'Location'`` and ``'Job Link'``. On a
	        network error or a non-200 response, ``jobs`` is empty and ``message``
	        describes the failure.
	    """
	    headers = {
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
	    }

	    # Values are "r<seconds>" look-back windows (86400 s = 24 h, 604800 s = 7 days).
	    date_filters = {
	        "Any Time": "",
	        "Recent (Last 24 hours)": "r86400",
	        "7 Hour ago": "r25200",  # 7 * 3600; the previous value 28800 was 8 hours
	        "12 hour ago": "r43200",
	        "Past Week": "r604800",
	        "Past Month": "r2592000",
	    }
	    exp_level_map = {level: f"f_E={level + 1}" for level in range(10)}

	    base_url = f"https://www.linkedin.com/jobs/search/?keywords={quote_plus(field)}"

	    if location:
	        base_url += f"&location={quote_plus(location)}"

	    filter_value = date_filters.get(date_posted, "") if date_posted else ""
	    if filter_value:
	        base_url += f"&f_TPR={filter_value}"

	    if experience_level is not None:
	        exp_filter = exp_level_map.get(experience_level)
	        # Only append a real filter; an unknown level used to leave a bare "&".
	        if exp_filter:
	            base_url += f"&{exp_filter}"

	    try:
	        # A timeout keeps a stalled connection from hanging the UI callback.
	        response = requests.get(base_url, headers=headers, timeout=15)
	    except requests.RequestException as exc:
	        return f"Failed to fetch jobs. Error: {exc}", []

	    if response.status_code != 200:
	        return f"Failed to fetch jobs. Status code: {response.status_code}", []

	    soup = BeautifulSoup(response.text, 'html.parser')

	    def _text_of(element):
	        # Missing elements are reported as 'N/A' rather than raising.
	        return element.text.strip() if element else 'N/A'

	    jobs = []
	    for card in soup.find_all('div', class_='base-card'):
	        # Cards carrying the Easy Apply label are deliberately left out.
	        if card.find('span', class_='easy-apply-label'):
	            continue

	        link_elem = card.find('a', class_='base-card__full-link')
	        jobs.append({
	            'Title': _text_of(card.find('span', class_='sr-only')),
	            'Company': _text_of(card.find('h4', class_='base-search-card__subtitle')),
	            'Location': _text_of(card.find('span', class_='job-search-card__location')),
	            # .get() tolerates an anchor that has no href attribute.
	            'Job Link': link_elem.get('href', '#') if link_elem else '#',
	        })

	    return f"Found {len(jobs)} jobs", jobs

	def format_results(job_title, location, date_posted, experience_level):
	    """Run the LinkedIn search and render the results as a Markdown table.

	    Args:
	        job_title: Search keywords passed to ``linkedin_job_search_engine``.
	        location: Location text passed through unchanged.
	        date_posted: "Posted Within" label passed through unchanged.
	        experience_level: Experience level as a digit string (what the
	            dropdown supplies) or an int. Anything that does not parse as a
	            non-negative integer falls back to 0.

	    Returns:
	        A ``(message, table_markdown)`` tuple. When no jobs are found the
	        second element is a short "no jobs" notice instead of a table.
	    """
	    import html

	    try:
	        exp_level_int = int(str(experience_level).strip())
	    except (TypeError, ValueError):
	        exp_level_int = 0
	    if exp_level_int < 0:
	        exp_level_int = 0

	    message, jobs = linkedin_job_search_engine(job_title, location, date_posted, exp_level_int)
	    if not jobs:
	        return message, "No jobs found 😢"

	    def _cell(value):
	        # Scraped text is untrusted: escape markup, and neutralise "|" and
	        # newlines, either of which would split the Markdown table row.
	        text = html.escape(str(value))
	        return text.replace("|", "&#124;").replace("\r", " ").replace("\n", " ")

	    # Plain "|" separators are required; backslash-escaped pipes are rendered
	    # as literal characters and the table never forms.
	    lines = [
	        "",
	        "| 📌 Title | 🏢 Company | 📍 Location | 🔗 Apply |",
	        "|---|---|---|---|",
	    ]
	    for job in jobs:
	        href = html.escape(str(job['Job Link']), quote=True).replace("|", "%7C")
	        # target="_blank" opens the posting in a new tab.
	        apply_button = f'<a href="{href}" target="_blank" rel="noopener noreferrer">👉 Apply Now</a>'
	        lines.append(
	            f"| 💼 {_cell(job['Title'])} | 🏢 {_cell(job['Company'])} "
	            f"| 📍 {_cell(job['Location'])} | {apply_button} |"
	        )

	    return message, "\n".join(lines) + "\n"

	# --- All-jobs (Google search) scraper functions ---

	def _unwrap_google_redirect(url):
	    """Return the target of a Google ``/url?q=...`` redirect, else ``url`` unchanged."""
	    from urllib.parse import parse_qs, urlparse

	    parsed = urlparse(url)
	    host = parsed.netloc.lower()
	    is_google_host = host == "google.com" or host.endswith(".google.com")
	    if is_google_host and parsed.path == "/url":
	        params = parse_qs(parsed.query)
	        target = params.get("q") or params.get("url")
	        if target:
	            return target[0]
	    return url


	def get_search_urls(search_url, num_results=20, is_query=True, headless=True):
	    """Open a results page in headless Chrome and collect the outbound result links.

	    Args:
	        search_url: Either the text to type into Google's search box
	            (``is_query=True``) or a complete URL to open (``is_query=False``).
	        num_results: Stop once this many distinct URLs have been collected.
	        is_query: Selects how ``search_url`` is interpreted (see above).
	        headless: Add ``--headless=new`` to the Chrome arguments when true.

	    Returns:
	        A list of distinct ``http(s)`` URLs, excluding any that contain
	        ``google.com`` or ``youtube.com``. Any failure is printed and results
	        in an empty list; this function does not raise.
	    """
	    import os
	    import traceback

	    options = Options()

	    if headless:
	        # --headless=new is the mode used for newer Chrome versions.
	        options.add_argument("--headless=new")

	    # --- Standard options ---
	    options.add_argument("--no-sandbox")
	    options.add_argument("--disable-dev-shm-usage")  # Important for containerized envs
	    options.add_argument("--disable-blink-features=AutomationControlled")
	    options.add_argument("--disable-extensions")
	    options.add_argument("--disable-plugins-discovery")
	    options.add_argument("--disable-web-security")
	    options.add_argument("--allow-running-insecure-content")
	    options.add_argument("--window-size=1920,1080")

	    # --- Locate the browser binary explicitly ---
	    chrome_executable = (
	        shutil.which("google-chrome")
	        or shutil.which("chromium-browser")
	        or shutil.which("chromium")
	    )
	    # Use the conventional path only if it really exists; an unconditional
	    # fallback string made the warning branch below unreachable.
	    if not chrome_executable and os.path.isfile("/usr/bin/google-chrome"):
	        chrome_executable = "/usr/bin/google-chrome"

	    if chrome_executable:
	        print(f"Setting Chrome binary location to: {chrome_executable}")
	        options.binary_location = chrome_executable
	    else:
	        print("Warning: Could not find Chrome/Chromium executable. Proceeding with default (might fail).")

	    # --- User agent ---
	    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

	    # --- Disable automation indicators ---
	    options.add_argument("--disable-automation")
	    options.add_argument("--disable-infobars")
	    options.add_experimental_option("excludeSwitches", ["enable-automation"])
	    options.add_experimental_option('useAutomationExtension', False)

	    # Tried in order; the first selector that yields at least one usable URL wins.
	    selectors_to_try = [
	        "h3 a",                 # Direct link within h3
	        ".LC20lb.DKV0Md",       # Result title class (may match non-anchor elements)
	        ".g a[href^='http']",   # Link within result div starting with http
	        ".yuRUbf a",
	        ".tF2Cxc a",
	    ]

	    driver = None
	    try:
	        driver = uc.Chrome(options=options)

	        # Hide the navigator.webdriver flag from page scripts.
	        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

	        if is_query:
	            print("Navigating to Google...")
	            driver.get("https://www.google.com")

	            # Random delay before typing the query.
	            time.sleep(random.uniform(2, 4))

	            print(f"Searching for: {search_url}")
	            search_box = driver.find_element(By.NAME, "q")
	            search_box.clear()
	            search_box.send_keys(search_url)
	            search_box.submit()
	        else:
	            print(f"Navigating to: {search_url}")
	            driver.get(search_url)

	        # Give the results page time to load.
	        time.sleep(random.uniform(3, 5))

	        # --- Check for blocking (warning only; extraction is still attempted) ---
	        page_source = driver.page_source.lower()
	        block_markers = ("blocked", "captcha", "unusual traffic", "sorry")
	        if any(marker in page_source for marker in block_markers):
	            print("⚠️ Detected potential blocking (CAPTCHA, 'Sorry' page, etc.). Page might not have loaded correctly.")

	        urls = []
	        for selector in selectors_to_try:
	            try:
	                elements = driver.find_elements(By.CSS_SELECTOR, selector)
	            except Exception as e:
	                print(f"✗ Selector '{selector}' failed during find_elements: {str(e)[:100]}...")
	                continue

	            if not elements:
	                continue
	            print(f"✓ Found {len(elements)} potential results with selector: '{selector}'")

	            for i, element in enumerate(elements):
	                try:
	                    url = element.get_attribute("href")
	                    if not url or not url.startswith("http"):
	                        continue

	                    # Unwrap Google's redirect BEFORE filtering, otherwise the
	                    # google.com filter discards every redirect link.
	                    url = _unwrap_google_redirect(url)

	                    if not url.startswith("http") or "google.com" in url or "youtube.com" in url:
	                        continue

	                    if url not in urls:
	                        urls.append(url)
	                        print(f"{len(urls)}. {url}")

	                    if len(urls) >= num_results:
	                        break
	                except Exception as e:
	                    print(f"❌ Error processing result {i}: {str(e)[:100]}...")
	                    continue

	            # A selector that matched elements but produced no links (for
	            # example, title elements without an href) must not end the search.
	            if urls:
	                break

	        if not urls:
	            print("❌ No results found with any selector. Printing page info for debugging...")
	            print(f"Page title: '{driver.title}'")
	            print(f"Current URL: {driver.current_url}")
	            return []

	        print(f"✓ Successfully extracted {len(urls)} URLs")
	        return urls

	    except Exception as e:
	        # Top-level boundary: report the failure and return an empty result.
	        print(f"❌ Critical error during driver execution: {e}")
	        traceback.print_exc()
	        return []

	    finally:
	        # --- Ensure the driver quits ---
	        if driver:
	            try:
	                driver.quit()
	                print("Driver quit successfully.")
	            except Exception as e:
	                print(f"Error quitting driver: {e}")  # Log error but don't crash
	        else:
	            print("Driver was not initialized, nothing to quit.")

	def search_job(portal, job_title, job_type, location, posting, experience_level=""):
	    """Build the Google search URL that restricts a job query to one portal.

	    Args:
	        portal: Portal key such as ``'indeed'`` or ``'lever'``.
	        job_title: Job title keywords (URL-encoded).
	        job_type: Job type keyword such as ``'remote'`` (URL-encoded).
	        location: Location text; only used by portals whose rule includes it.
	        posting: Value for Google's ``tbs=qdr:`` recency filter (e.g. ``'d'``,
	            ``'w'``, ``'h12'``). When empty, no recency filter is added.
	        experience_level: One of "Entry Level", "Mid Level", "Senior Level",
	            "Executive". Any other value (including "Any" and "") adds nothing.

	    Returns:
	        The search URL, or ``""`` when ``portal`` is not a known key.
	    """
	    # Extra query terms per experience level, already URL-encoded.
	    experience_terms = {
	        "Entry Level": "entry+level+junior+fresher",
	        "Mid Level": "mid+level+2-5+years",
	        # %2B is a literal "+"; a bare "+" in a query string decodes to a space.
	        "Senior Level": "senior+lead+5%2B+years",
	        "Executive": "director+manager+executive+head",
	    }

	    # portal -> (site restriction, include location?, include experience terms?)
	    portal_rules = {
	        'indeed': ('site:indeed.com', True, True),
	        'greenhouse': ('site:greenhouse.io', False, False),
	        'lever': ('site:lever.co', False, False),
	        'ashby': ('site:ashbyhq.com', False, False),
	        'pinpoint': ('site:pinpointhq.com', True, True),
	        'job_subdomain': ('site:jobs.*', False, False),
	        'careers_page': ('(site%3Acareers.%20OR%20site%3A%2Fcareers%2F%20OR%20site%3A%2Fcareer%2F*)', True, True),
	        'talent_subdomain': ('site:talent.*', False, False),
	        'paylocity': ('site:recruiting.paylocity.com', False, False),
	        'keka': ('site:keka.com', True, True),
	        'workable': ('site:jobs.workable.com', False, False),
	        'breezyHR': ('site:breezy.hr', False, False),
	        'wellfound': ('site:wellfound.com', True, False),
	        'y_combinator': ('site:workatastartup.com', False, False),
	        'oracle_cloud': ('site:oraclecloud.com', False, False),
	        'workday': ('site:myworkdayjobs.com', False, False),
	        'recruitee': ('site:recruitee.com', False, False),
	        'rippling': ('(site%3Arippling.com%20OR%20site%3Arippling-ats.com)', False, False),
	        'gusto': ('site:jobs.gusto.com', False, False),
	        'teamtailor': ('site:teamtailor.com', False, False),
	        'smartrecruiters': ('site:jobs.smartrecruiters.com', False, False),
	        'builtin': ('site:builtin.com/job/', False, False),
	        'glassdoor': ('site:glassdoor.com/job-listing/', False, False),
	        'all_jobs': ('(site%3A%2Femployment%2F%20OR%20site%3A%2Fopportunities%2F%20OR%20site%3A%2Fopenings%2F%20OR%20site%3A%2Fjoin-us%2F%20OR%20site%3A%2Fwork-with-us%2F)', False, False),
	    }

	    rule = portal_rules.get(portal)
	    if rule is None:
	        return ""
	    site_filter, with_location, with_experience = rule

	    # Every portal leads with the job title (the 'lever' entry used to repeat
	    # job_type here and never searched for the title at all).
	    terms = [quote_plus(job_title), site_filter, quote_plus(job_type)]
	    if with_location:
	        terms.append(quote_plus(location))
	    if with_experience:
	        # Inserted as-is: the terms are pre-encoded, so encoding them again
	        # would turn their "+" separators into literal plus signs.
	        terms.append(experience_terms.get(experience_level, ""))

	    # Empty pieces are dropped so the query has no dangling "+".
	    query = "+".join(term for term in terms if term)
	    url = f"https://www.google.com/search?q={query}"
	    if posting:
	        url += f"&tbs=qdr:{quote_plus(posting)}"
	    return url

	def search_jobs_interface(job_title, job_type, location, posting, experience_level, selected_portals, num_results):
	    """Gradio callback: search every selected portal and render the results.

	    Args:
	        job_title: Job title keywords; must be non-blank.
	        job_type: Job type keyword forwarded to ``search_job``.
	        location: Location text forwarded to ``search_job``.
	        posting: Recency filter value forwarded to ``search_job``.
	        experience_level: Experience label forwarded to ``search_job``.
	        selected_portals: Portal keys to search; must be non-empty.
	        num_results: Maximum number of URLs to collect per portal.

	    Returns:
	        A ``(progress_text, dataframe_or_None, html)`` tuple. The DataFrame
	        holds one row per URL found; it is ``None`` when validation fails or
	        nothing was found.
	    """
	    import html

	    if not job_title or not job_title.strip():
	        return "❌ Please enter a job title", None, "<p style='color:red;'>❌ Please enter a job title</p>"

	    if not selected_portals:
	        return "❌ Please select at least one job portal", None, "<p style='color:red;'>❌ Please select at least one job portal</p>"

	    all_results = []
	    progress_messages = []

	    for portal in selected_portals:
	        # One failing portal must not abort the others, so each is isolated.
	        try:
	            progress_messages.append(f"🔍 Searching {portal}...")

	            search_url = search_job(portal, job_title, job_type, location, posting, experience_level)
	            if not search_url:
	                progress_messages.append(f"❌ Invalid portal: {portal}")
	                continue

	            # search_url is already a full Google URL, hence is_query=False.
	            urls = get_search_urls(search_url, num_results=num_results, is_query=False, headless=True)

	            all_results.extend(
	                {
	                    'Portal': portal.title(),
	                    'Job Title': job_title,
	                    'Location': location,
	                    'Job Type': job_type,
	                    'Experience Level': experience_level,
	                    'URL': url,  # Raw URL is kept for the DataFrame
	                }
	                for url in urls
	            )
	            progress_messages.append(f"✅ Found {len(urls)} jobs on {portal}")

	        except Exception as e:
	            progress_messages.append(f"❌ Error searching {portal}: {str(e)}")

	    progress_summary = "\n".join(progress_messages)
	    progress_summary += f"\n\n📊 Total Results: {len(all_results)} jobs found"

	    if not all_results:
	        no_results_html = "<p style='color:red;'>❌ No jobs found. Try different search parameters.</p>"
	        return progress_summary + "\n\n❌ No jobs found. Try different search parameters.", None, no_results_html

	    df = pd.DataFrame(all_results)
	    columns = list(df.columns)

	    # User input and scraped URLs are untrusted, so every value is escaped
	    # before it is placed in the markup. Fragments are joined once at the end.
	    parts = [
	        "<table border='1' class='dataframe' style='width:100%; border-collapse: collapse;'>",
	        "<thead><tr style='background-color: #f2f2f2;'>",
	    ]
	    parts.extend(
	        f"<th style='padding: 8px; text-align: left; border: 1px solid #ddd;'>{html.escape(str(col))}</th>"
	        for col in columns
	    )
	    parts.append("</tr></thead><tbody>")

	    for record in all_results:
	        parts.append("<tr>")
	        for col in columns:
	            cell_value = record[col]
	            if col == 'URL':
	                # quote=True also escapes quotes, which would otherwise end
	                # the single-quoted href attribute early.
	                href = html.escape(str(cell_value), quote=True)
	                parts.append(
	                    f"<td style='padding: 8px; border: 1px solid #ddd;'><a href='{href}' target='_blank' style='color: #1f77b4;' rel='noopener noreferrer'>Apply Now</a></td>"
	                )
	            else:
	                parts.append(
	                    f"<td style='padding: 8px; border: 1px solid #ddd;'>{html.escape(str(cell_value))}</td>"
	                )
	        parts.append("</tr>")
	    parts.append("</tbody></table>")

	    return progress_summary, df, "".join(parts)


	# --- Define the Gradio interface with Tabs ---

	def create_gradio_interface():
	    """Assemble the two-tab Gradio Blocks UI and return it without launching.

	    The first tab wires its inputs to ``format_results``; the second tab
	    wires its inputs to ``search_jobs_interface``.
	    """
	    # Portal keys offered in the second tab's checkbox group.
	    job_portals = [
	        'indeed', 'greenhouse', 'lever', 'ashby', 'pinpoint',
	        'job_subdomain', 'careers_page', 'talent_subdomain',
	        'paylocity', 'keka', 'workable', 'breezyHR', 'wellfound',
	        'y_combinator', 'oracle_cloud', 'workday', 'recruitee',
	        'rippling', 'gusto', 'teamtailor', 'smartrecruiters',
	        'builtin', 'glassdoor', 'all_jobs'
	    ]
	    # Portals ticked by default.
	    preselected_portals = [
	        'indeed', 'greenhouse', 'lever', 'builtin', 'glassdoor',
	        'job_subdomain', 'careers_page', 'talent_subdomain',
	        'paylocity', 'keka', 'workable', 'breezyHR', 'wellfound',
	        'y_combinator', 'oracle_cloud', 'workday', 'recruitee',
	        'rippling', 'gusto', 'teamtailor', 'smartrecruiters', 'all_jobs'
	    ]
	    linkedin_date_choices = [
	        "Any Time", "Recent (Last 24 hours)", "7 Hour ago", "12 hour ago",
	        "Past Week", "Past Month"
	    ]
	    # (label, value) pairs; the value is what the callback receives.
	    google_posting_choices = [
	        ('4 hour ago', 'h4'), ('8 hour ago', 'h8'), ('12 hour ago', 'h12'),
	        ("Last 24 hours", "d"), ('2 days ago', 'h48'), ('3 days ago', 'h72'),
	        ("Last week", "w"), ("Last month", "m"), ("Any time", "")
	    ]

	    with gr.Blocks(title="AI Job Search Engine", theme=gr.themes.Soft()) as app:
	        gr.Markdown("# 🚀 AI-Powered Job Search Engine")

	        with gr.Tabs():
	            # --- Tab 1: LinkedIn jobs ---
	            with gr.TabItem("LinkedIn Jobs"):
	                gr.Markdown("## 🔍 Search Jobs on LinkedIn")
	                with gr.Row():
	                    with gr.Column(scale=2):
	                        li_title_box = gr.Textbox(
	                            label="💼 Job Title",
	                            placeholder="e.g., AI ML Engineer, Data Scientist",
	                            value="AI ML Engineer",
	                        )
	                        with gr.Row():
	                            li_location_box = gr.Textbox(
	                                label="📍 Location",
	                                placeholder="e.g., Pune, Mumbai, Bangalore",
	                                value="Pune",
	                            )
	                            li_date_dropdown = gr.Dropdown(
	                                label="📅 Posted Within",
	                                choices=linkedin_date_choices,
	                                value="Past Week",
	                            )
	                        with gr.Row():
	                            # Choices are the strings "0" through "9".
	                            li_experience_dropdown = gr.Dropdown(
	                                label="⭐ Experience Level (Years)",
	                                choices=list(map(str, range(10))),
	                                value="0",
	                            )
	                            # Hidden placeholder component; not wired to anything.
	                            gr.Textbox(visible=False)

	                        li_search_button = gr.Button("🔍 Search LinkedIn Jobs", variant="primary")

	                    with gr.Column(scale=3):
	                        li_message_box = gr.Textbox(
	                            label="📈 Message",
	                            lines=2,
	                            max_lines=5,
	                            interactive=False,
	                        )
	                        li_results_markdown = gr.Markdown(
	                            label="📋 Job Listings"
	                        )

	                li_inputs = [li_title_box, li_location_box, li_date_dropdown, li_experience_dropdown]
	                li_outputs = [li_message_box, li_results_markdown]
	                li_search_button.click(fn=format_results, inputs=li_inputs, outputs=li_outputs)

	            # --- Tab 2: all jobs (Google-based) ---
	            with gr.TabItem("All Jobs (Google Search)"):
	                gr.Markdown("## 🌐 Search Jobs across the Web (via Google)")
	                with gr.Row():
	                    with gr.Column(scale=2):
	                        title_box = gr.Textbox(
	                            label="💼 Job Title",
	                            placeholder="e.g., AI ML Engineer, Data Scientist, Software Developer",
	                            value="AI ML",
	                        )

	                        with gr.Row():
	                            type_dropdown = gr.Dropdown(
	                                label="🏢 Job Type",
	                                choices=["remote", "on-site", "hybrid", "any"],
	                                value="remote",
	                            )
	                            location_box = gr.Textbox(
	                                label="📍 Location",
	                                placeholder="e.g., Pune, Mumbai, Bangalore",
	                                value="pune",
	                            )

	                        with gr.Row():
	                            posting_dropdown = gr.Dropdown(
	                                label="📅 Posted Within",
	                                choices=google_posting_choices,
	                                value="d",
	                            )
	                            experience_dropdown = gr.Dropdown(
	                                label="⭐ Experience Level",
	                                choices=["Any", "Entry Level", "Mid Level", "Senior Level", "Executive"],
	                                value="Any",
	                            )

	                        portal_checkboxes = gr.CheckboxGroup(
	                            label="🌐 Select Job Portals",
	                            choices=job_portals,
	                            value=preselected_portals,
	                        )

	                        results_slider = gr.Slider(
	                            label="📊 Results per Portal",
	                            minimum=1,
	                            maximum=30,
	                            value=20,
	                            step=1,
	                        )

	                        search_button = gr.Button("🔍 Search Jobs", variant="primary", size="lg")

	                    with gr.Column(scale=3):
	                        progress_box = gr.Textbox(
	                            label="📈 Search Progress",
	                            lines=10,
	                            max_lines=15,
	                            interactive=False,
	                        )
	                        # HTML component so the result links are clickable.
	                        results_html = gr.HTML(
	                            label="📋 Clickable Job Results"
	                        )

	                # The callback returns three values; the DataFrame goes to an
	                # invisible component created here.
	                hidden_dataframe = gr.Dataframe(visible=False)
	                search_button.click(
	                    fn=search_jobs_interface,
	                    inputs=[
	                        title_box, type_dropdown, location_box, posting_dropdown,
	                        experience_dropdown, portal_checkboxes, results_slider,
	                    ],
	                    outputs=[progress_box, hidden_dataframe, results_html],
	                )

	    return app

	# Launch the application
	if __name__ == "__main__":
	    # Build the UI and start serving it only when run as a script.
	    app = create_gradio_interface()
	    app.launch()