# Standard library
import random
import shutil  # Used to locate the Chrome/Chromium binary on the host
import time
from urllib.parse import quote_plus, urlparse, parse_qs

# Third party
import gradio as gr
import pandas as pd
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# --- LinkedIn Scraper Functions ---

def linkedin_job_search_engine(field, location=None, date_posted=None, experience_level=None):
    """Scrape LinkedIn's public (no-login) job-search results page.

    Args:
        field: Job title / keywords to search for.
        location: Optional location filter string.
        date_posted: Optional UI label (e.g. "Past Week") mapped to
            LinkedIn's ``f_TPR`` time-posted filter.
        experience_level: Optional int index 0-9 mapped to LinkedIn's
            ``f_E`` experience filter.

    Returns:
        ``(message, jobs)`` where ``message`` is a status string and ``jobs``
        is a list of dicts with Title / Company / Location / Job Link.
        Easy-Apply listings are skipped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }
    base_url = f"https://www.linkedin.com/jobs/search/?keywords={quote_plus(field)}"
    if location:
        base_url += f"&location={quote_plus(location)}"

    if date_posted:
        # UI labels -> LinkedIn f_TPR values (seconds of look-back).
        date_filters = {
            "Any Time": "",
            "Recent (Last 24 hours)": "r86400",
            "7 Hour ago": "r28800",
            "12 hour ago": "r43200",
            "Past Week": "r604800",
            "Past Month": "r2592000",
        }
        filter_value = date_filters.get(date_posted, "")
        if filter_value:
            base_url += f"&f_TPR={filter_value}"

    if experience_level is not None:
        # Dropdown index i (0-9) -> "f_E=i+1".
        # NOTE(review): LinkedIn documents f_E values 1-6; 7-10 are kept from
        # the original code but may be ignored by LinkedIn — verify.
        exp_level_map = {i: f"f_E={i + 1}" for i in range(10)}
        base_url += f"&{exp_level_map.get(experience_level, '')}"

    response = requests.get(base_url, headers=headers)
    if response.status_code != 200:
        return f"Failed to fetch jobs. Status code: {response.status_code}", []

    soup = BeautifulSoup(response.text, "html.parser")
    job_cards = soup.find_all("div", class_="base-card")

    jobs = []
    for job in job_cards:
        title_elem = job.find("span", class_="sr-only")
        job_title = title_elem.text.strip() if title_elem else "N/A"
        company_elem = job.find("h4", class_="base-search-card__subtitle")
        company_name = company_elem.text.strip() if company_elem else "N/A"
        location_elem = job.find("span", class_="job-search-card__location")
        job_location = location_elem.text.strip() if location_elem else "N/A"
        job_link_elem = job.find("a", class_="base-card__full-link")
        job_link = job_link_elem["href"] if job_link_elem else "#"

        # Skip Easy Apply jobs — they cannot be opened as external applications.
        if job.find("span", class_="easy-apply-label"):
            continue

        jobs.append({
            "Title": job_title,
            "Company": company_name,
            "Location": job_location,
            "Job Link": job_link,
        })

    return f"Found {len(jobs)} jobs", jobs


def format_results(job_title, location, date_posted, experience_level):
    """Run the LinkedIn search and render results as a Markdown table.

    Returns:
        ``(message, markdown)`` suitable for the Gradio Textbox/Markdown outputs.
    """
    # The dropdown supplies experience level as a string ("0".."9").
    try:
        exp_level_int = int(experience_level) if experience_level and experience_level.isdigit() else 0
    except ValueError:
        exp_level_int = 0  # Fall back to entry level on malformed input

    message, jobs = linkedin_job_search_engine(job_title, location, date_posted, exp_level_int)
    if not jobs:
        return message, "No jobs found 😢"

    # Table header.
    table_md = """
| šŸ“Œ Title | šŸ¢ Company | šŸ“ Location | šŸ”— Apply |
|---|---|---|---|
"""
    # Rows with links that open in a new tab.
    for job in jobs:
        title = job["Title"]
        company = job["Company"]
        loc = job["Location"]
        link = job["Job Link"]
        # BUG FIX: the original f-string contained no placeholder, so the job
        # link was silently dropped. Render an HTML anchor (gr.Markdown allows
        # inline HTML) so the link opens in a new tab.
        apply_button = f'<a href="{link}" target="_blank">šŸ‘‰ Apply Now</a>'
        table_md += f"| šŸ’¼ {title} | šŸ¢ {company} | šŸ“ {loc} | {apply_button} |\n"
    return message, table_md


# --- All Jobs Scraper Functions ---

def get_search_urls(search_url, num_results=20, is_query=True, headless=True):
    """Drive undetected-chromedriver to collect result URLs from Google.

    Args:
        search_url: Either a search query string (``is_query=True``) or a
            full URL to navigate to directly.
        num_results: Maximum number of URLs to return.
        is_query: When True, type ``search_url`` into Google's search box.
        headless: Run Chrome headless (required in containerized hosts).

    Returns:
        List of de-duplicated external result URLs (Google/YouTube excluded).
        Returns ``[]`` on failure; the driver is always quit in ``finally``.
    """
    options = Options()
    if headless:
        # "--headless=new" is the preferred mode for recent Chrome versions.
        options.add_argument("--headless=new")

    # Standard hardening / container options.
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")  # Important in containers
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-plugins-discovery")
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--window-size=1920,1080")

    # Crucial for Hugging Face Spaces: set the browser binary explicitly.
    chrome_executable = (
        shutil.which("google-chrome")
        or shutil.which("chromium-browser")
        or shutil.which("chromium")
        or "/usr/bin/google-chrome"  # Fallback common path
    )
    if chrome_executable:
        print(f"Setting Chrome binary location to: {chrome_executable}")
        options.binary_location = chrome_executable
    else:
        print("Warning: Could not find Chrome/Chromium executable. Proceeding with default (might fail).")

    # NOTE(review): this UA should match the Chrome version actually installed
    # on the host — adjust if the Space's Chrome differs.
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

    # Disable automation indicators.
    options.add_argument("--disable-automation")
    options.add_argument("--disable-infobars")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        # Let undetected-chromedriver auto-detect the Chrome major version.
        driver = uc.Chrome(options=options)

        # Remove the navigator.webdriver flag that betrays automation.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        if is_query:
            print("Navigating to Google...")
            driver.get("https://www.google.com")
            # Random delay to mimic human behavior.
            time.sleep(random.uniform(2, 4))
            print(f"Searching for: {search_url}")
            search_box = driver.find_element(By.NAME, "q")
            search_box.clear()
            search_box.send_keys(search_url)
            search_box.submit()
        else:
            print(f"Navigating to: {search_url}")
            driver.get(search_url)

        # Wait for the results page to load.
        time.sleep(random.uniform(3, 5))

        # Heuristic block detection (CAPTCHA / "sorry" interstitial).
        page_source = driver.page_source.lower()
        if ("blocked" in page_source or "captcha" in page_source
                or "unusual traffic" in page_source or "sorry" in page_source):
            print("āš ļø Detected potential blocking (CAPTCHA, 'Sorry' page, etc.). Page might not have loaded correctly.")

        urls = []

        # Google rotates its result markup; try several selectors in order.
        selectors_to_try = [
            "h3 a",               # Direct link within h3
            ".LC20lb.DKV0Md",     # Result title class
            ".g a[href^='http']", # Link within result div starting with http
            ".yuRUbf a",          # Another common Google class
            ".tF2Cxc a",          # Another common structure
        ]
        results = []
        for selector in selectors_to_try:
            try:
                temp_results = driver.find_elements(By.CSS_SELECTOR, selector)
                if temp_results:
                    results = temp_results
                    print(f"āœ“ Found {len(results)} potential results with selector: '{selector}'")
                    break
            except Exception as e:
                print(f"āœ— Selector '{selector}' failed during find_elements: {str(e)[:100]}...")
                continue

        if not results:
            print("āŒ No results found with any selector. Printing page info for debugging...")
            print(f"Page title: '{driver.title}'")
            print(f"Current URL: {driver.current_url}")
            return []

        # Extract, clean, and de-duplicate result hrefs.
        for i, result in enumerate(results):
            try:
                url = result.get_attribute("href")
                # Keep only absolute external links; drop Google/YouTube hosts.
                if url and url.startswith("http") and "google.com" not in url and "youtube.com" not in url:
                    # Unwrap Google's "/url?q=..." redirect when present.
                    parsed_url = urlparse(url)
                    if "url" in parsed_url.path:
                        query_params = parse_qs(parsed_url.query)
                        if "q" in query_params:
                            url = query_params["q"][0]
                    if url not in urls:
                        urls.append(url)
                        print(f"{len(urls)}. {url}")
                    if len(urls) >= num_results:
                        break
            except Exception as e:
                print(f"āŒ Error processing result {i}: {str(e)[:100]}...")
                continue

        print(f"āœ“ Successfully extracted {len(urls)} URLs")
        return urls

    except Exception as e:
        print(f"āŒ Critical error during driver execution: {e}")
        import traceback
        traceback.print_exc()
        return []
    finally:
        # Always release the browser, even on failure.
        if driver:
            try:
                driver.quit()
                print("Driver quit successfully.")
            except Exception as e:
                print(f"Error quitting driver: {e}")  # Log but don't crash
        else:
            print("Driver was not initialized, nothing to quit.")


def search_job(portal, job_title, job_type, location, posting, experience_level=""):
    """Build the Google dork URL for one job portal.

    Args:
        portal: Key into the portal table below (e.g. "indeed", "lever").
        job_title: Search keywords.
        job_type: "remote" / "on-site" / "hybrid" / "any".
        location: Location string (only some portals use it).
        posting: Google ``tbs=qdr:`` recency code (e.g. "d", "w", "h4").
        experience_level: UI label; expanded into extra query keywords.

    Returns:
        The search URL string, or "" for an unknown portal.
    """
    # Expand the experience level into extra search keywords.
    experience_query = ""
    if experience_level and experience_level != "Any":
        if experience_level == "Entry Level":
            experience_query = "+entry+level+junior+fresher"
        elif experience_level == "Mid Level":
            experience_query = "+mid+level+2-5+years"
        elif experience_level == "Senior Level":
            experience_query = "+senior+lead+5++years"
        elif experience_level == "Executive":
            experience_query = "+director+manager+executive+head"

    job_portal_with_link = {
        'indeed': f'https://www.google.com/search?q={quote_plus(job_title)}+site:indeed.com+{quote_plus(job_type)}+{quote_plus(location)}{experience_query}&tbs=qdr:{quote_plus(posting)}',
        'greenhouse': f'https://www.google.com/search?q={quote_plus(job_title)}+site:greenhouse.io+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        # BUG FIX: the original passed job_type where the keywords belong, so
        # the job title was never searched on lever.co.
        'lever': f'https://www.google.com/search?q={quote_plus(job_title)}+site:lever.co+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'ashby': f'https://www.google.com/search?q={quote_plus(job_title)}+site:ashbyhq.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'pinpoint': f'https://www.google.com/search?q={quote_plus(job_title)}+site:pinpointhq.com+{quote_plus(job_type)}+{quote_plus(location)}{experience_query}&tbs=qdr:{quote_plus(posting)}',
        'job_subdomain': f'https://www.google.com/search?q={quote_plus(job_title)}+site:jobs.*+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        # BUG FIX: experience_query is already "+"-joined; quote_plus() was
        # double-encoding it ("%2B"), unlike every other portal entry.
        'careers_page': f'https://www.google.com/search?q={quote_plus(job_title)}+(site%3Acareers.*%20OR%20site%3A*%2Fcareers%2F*%20OR%20site%3A*%2Fcareer%2F*)+{quote_plus(job_type)}+{quote_plus(location)}{experience_query}&tbs=qdr:{quote_plus(posting)}',
        'talent_subdomain': f'https://www.google.com/search?q={quote_plus(job_title)}+site:talent.*+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'paylocity': f'https://www.google.com/search?q={quote_plus(job_title)}+site:recruiting.paylocity.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'keka': f'https://www.google.com/search?q={quote_plus(job_title)}+site:keka.com+{quote_plus(job_type)}+{quote_plus(location)}{experience_query}&tbs=qdr:{quote_plus(posting)}',
        'workable': f'https://www.google.com/search?q={quote_plus(job_title)}+site:jobs.workable.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'breezyHR': f'https://www.google.com/search?q={quote_plus(job_title)}+site:breezy.hr+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'wellfound': f'https://www.google.com/search?q={quote_plus(job_title)}+site:wellfound.com+{quote_plus(job_type)}+{quote_plus(location)}&tbs=qdr:{quote_plus(posting)}',
        'y_combinator': f'https://www.google.com/search?q={quote_plus(job_title)}+site:workatastartup.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'oracle_cloud': f'https://www.google.com/search?q={quote_plus(job_title)}+site:oraclecloud.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'workday': f'https://www.google.com/search?q={quote_plus(job_title)}+site:myworkdayjobs.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'recruitee': f'https://www.google.com/search?q={quote_plus(job_title)}+site:recruitee.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'rippling': f'https://www.google.com/search?q={quote_plus(job_title)}+(site%3Arippling.com%20OR%20site%3Arippling-ats.com)+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'gusto': f'https://www.google.com/search?q={quote_plus(job_title)}+site:jobs.gusto.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'teamtailor': f'https://www.google.com/search?q={quote_plus(job_title)}+site:teamtailor.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'smartrecruiters': f'https://www.google.com/search?q={quote_plus(job_title)}+site:jobs.smartrecruiters.com+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'builtin': f'https://www.google.com/search?q={quote_plus(job_title)}+site:builtin.com/job/+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'glassdoor': f'https://www.google.com/search?q={quote_plus(job_title)}+site:glassdoor.com/job-listing/+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
        'all_jobs': f'https://www.google.com/search?q={quote_plus(job_title)}+(site%3A*%2Femployment%2F*%20OR%20site%3A*%2Fopportunities%2F*%20OR%20site%3A*%2Fopenings%2F*%20OR%20site%3A*%2Fjoin-us%2F*%20OR%20site%3A*%2Fwork-with-us%2F*)+{quote_plus(job_type)}&tbs=qdr:{quote_plus(posting)}',
    }
    return job_portal_with_link.get(portal, "")


def search_jobs_interface(job_title, job_type, location, posting, experience_level, selected_portals, num_results):
    """Main handler for the "All Jobs" Gradio tab.

    Returns:
        (progress_text, dataframe_or_None, html_results) — the three outputs
        wired to the search button.
    """
    # NOTE(review): the HTML fragments below were reconstructed — the original
    # file's markup tags were stripped in transit; verify styling against the
    # deployed app.
    if not job_title.strip():
        return (
            "āŒ Please enter a job title",
            None,
            "<div style='color: red; text-align: center; padding: 20px;'>āŒ Please enter a job title</div>",
        )
    if not selected_portals:
        return (
            "āŒ Please select at least one job portal",
            None,
            "<div style='color: red; text-align: center; padding: 20px;'>āŒ Please select at least one job portal</div>",
        )

    all_results = []
    progress_messages = []

    for portal in selected_portals:
        try:
            progress_messages.append(f"šŸ” Searching {portal}...")
            search_url = search_job(portal, job_title, job_type, location, posting, experience_level)
            if search_url:
                # Selenium fetch of the Google results page for this portal.
                urls = get_search_urls(search_url, num_results=num_results, is_query=False, headless=True)
                for url in urls:
                    all_results.append({
                        'Portal': portal.title(),
                        'Job Title': job_title,
                        'Location': location,
                        'Job Type': job_type,
                        'Experience Level': experience_level,
                        'URL': url,  # Raw URL kept for the DataFrame
                    })
                progress_messages.append(f"āœ… Found {len(urls)} jobs on {portal}")
            else:
                progress_messages.append(f"āŒ Invalid portal: {portal}")
        except Exception as e:
            progress_messages.append(f"āŒ Error searching {portal}: {str(e)}")

    progress_summary = "\n".join(progress_messages)
    progress_summary += f"\n\nšŸ“Š Total Results: {len(all_results)} jobs found"

    if all_results:
        df = pd.DataFrame(all_results)
        # Build an HTML table so the URL column is clickable in gr.HTML.
        html_table = "<table border='1' style='width:100%; border-collapse: collapse;'>"
        html_table += "<tr>"
        for col in df.columns:
            html_table += f"<th style='padding: 8px; text-align: left;'>{col}</th>"
        html_table += "</tr>"
        for _, row in df.iterrows():
            html_table += "<tr>"
            for col in df.columns:
                cell_value = row[col]
                if col == 'URL':
                    # Clickable apply link, opening in a new tab.
                    html_table += f"<td style='padding: 8px;'><a href='{cell_value}' target='_blank'>Apply Now</a></td>"
                else:
                    html_table += f"<td style='padding: 8px;'>{cell_value}</td>"
            html_table += "</tr>"
        html_table += "</table>"
        return progress_summary, df, html_table

    no_results_html = "<div style='text-align: center; padding: 20px;'>āŒ No jobs found. Try different search parameters.</div>"
    return progress_summary + "\n\nāŒ No jobs found. Try different search parameters.", None, no_results_html


# --- Define the Gradio interface with Tabs ---

def create_gradio_interface():
    """Assemble the two-tab Gradio Blocks app and return it (not launched)."""
    # Available job portals for the All Jobs scraper.
    job_portals = [
        'indeed', 'greenhouse', 'lever', 'ashby', 'pinpoint', 'job_subdomain',
        'careers_page', 'talent_subdomain', 'paylocity', 'keka', 'workable',
        'breezyHR', 'wellfound', 'y_combinator', 'oracle_cloud', 'workday',
        'recruitee', 'rippling', 'gusto', 'teamtailor', 'smartrecruiters',
        'builtin', 'glassdoor', 'all_jobs',
    ]

    with gr.Blocks(title="AI Job Search Engine", theme=gr.themes.Soft()) as app:
        gr.Markdown("# šŸš€ AI-Powered Job Search Engine")

        with gr.Tabs():
            # --- Tab 1: LinkedIn Jobs Scraper ---
            with gr.TabItem("LinkedIn Jobs"):
                gr.Markdown("## šŸ” Search Jobs on LinkedIn")
                with gr.Row():
                    with gr.Column(scale=2):
                        linkedin_job_title = gr.Textbox(
                            label="šŸ’¼ Job Title",
                            placeholder="e.g., AI ML Engineer, Data Scientist",
                            value="AI ML Engineer"
                        )
                        with gr.Row():
                            linkedin_location = gr.Textbox(
                                label="šŸ“ Location",
                                placeholder="e.g., Pune, Mumbai, Bangalore",
                                value="Pune"
                            )
                            linkedin_date_posted = gr.Dropdown(
                                label="šŸ“… Posted Within",
                                choices=["Any Time", "Recent (Last 24 hours)", "7 Hour ago", "12 hour ago", "Past Week", "Past Month"],
                                value="Past Week"
                            )
                        with gr.Row():
                            # LinkedIn uses 0-9 for experience levels.
                            linkedin_experience_level = gr.Dropdown(
                                label="⭐ Experience Level (Years)",
                                choices=[str(i) for i in range(10)],  # 0 to 9
                                value="0"  # Default to Entry Level
                            )
                            # Placeholder for future inputs if needed.
                            dummy = gr.Textbox(visible=False)
                        linkedin_search_btn = gr.Button("šŸ” Search LinkedIn Jobs", variant="primary")
                    with gr.Column(scale=3):
                        linkedin_result_msg = gr.Textbox(
                            label="šŸ“ˆ Message",
                            lines=2,
                            max_lines=5,
                            interactive=False
                        )
                        linkedin_result_display = gr.Markdown(
                            label="šŸ“‹ Job Listings"
                        )
                # Connect LinkedIn search function.
                linkedin_search_btn.click(
                    fn=format_results,
                    inputs=[linkedin_job_title, linkedin_location, linkedin_date_posted, linkedin_experience_level],
                    outputs=[linkedin_result_msg, linkedin_result_display]
                )

            # --- Tab 2: All Jobs Scraper (Google-based) ---
            with gr.TabItem("All Jobs (Google Search)"):
                gr.Markdown("## 🌐 Search Jobs across the Web (via Google)")
                with gr.Row():
                    with gr.Column(scale=2):
                        job_title = gr.Textbox(
                            label="šŸ’¼ Job Title",
                            placeholder="e.g., AI ML Engineer, Data Scientist, Software Developer",
                            value="AI ML"
                        )
                        with gr.Row():
                            job_type = gr.Dropdown(
                                label="šŸ¢ Job Type",
                                choices=["remote", "on-site", "hybrid", "any"],
                                value="remote"
                            )
                            location = gr.Textbox(
                                label="šŸ“ Location",
                                placeholder="e.g., Pune, Mumbai, Bangalore",
                                value="pune"
                            )
                        with gr.Row():
                            posting = gr.Dropdown(
                                label="šŸ“… Posted Within",
                                choices=[('4 hour ago', 'h4'), ('8 hour ago', 'h8'), ('12 hour ago', 'h12'), ("Last 24 hours", "d"), ('2 days ago', 'h48'), ('3 days ago', 'h72'), ("Last week", "w"), ("Last month", "m"), ("Any time", "")],
                                value="d"
                            )
                            experience_level = gr.Dropdown(
                                label="⭐ Experience Level",
                                choices=["Any", "Entry Level", "Mid Level", "Senior Level", "Executive"],
                                value="Any"
                            )
                        selected_portals = gr.CheckboxGroup(
                            label="🌐 Select Job Portals",
                            choices=job_portals,
                            value=['indeed', 'greenhouse', 'lever', 'builtin', 'glassdoor', 'job_subdomain',
                                   'careers_page', 'talent_subdomain', 'paylocity', 'keka', 'workable',
                                   'breezyHR', 'wellfound', 'y_combinator', 'oracle_cloud', 'workday',
                                   'recruitee', 'rippling', 'gusto', 'teamtailor', 'smartrecruiters', 'all_jobs']
                        )
                        num_results = gr.Slider(
                            label="šŸ“Š Results per Portal",
                            minimum=1,
                            maximum=30,
                            value=20,
                            step=1
                        )
                        search_btn = gr.Button("šŸ” Search Jobs", variant="primary", size="lg")
                    with gr.Column(scale=3):
                        progress_output = gr.Textbox(
                            label="šŸ“ˆ Search Progress",
                            lines=10,
                            max_lines=15,
                            interactive=False
                        )
                        # HTML component for clickable links.
                        html_output = gr.HTML(
                            label="šŸ“‹ Clickable Job Results"
                        )
                # Connect the search function; DataFrame output is kept hidden.
                search_btn.click(
                    fn=search_jobs_interface,
                    inputs=[job_title, job_type, location, posting, experience_level, selected_portals, num_results],
                    outputs=[progress_output, gr.Dataframe(visible=False), html_output]
                )

    return app


# Launch the application
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()