Spaces:
Running
Running
| import os | |
| import gradio as gr | |
| import asyncio | |
| from urllib.parse import urlencode | |
| from fastmcp import FastMCP | |
| from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode | |
# Instantiate FastMCP under the agent's display name.
# NOTE(review): nothing in the visible source registers a function on `mcp`;
# MCP exposure appears to come from `app.launch(mcp_server=True)` -- confirm.
mcp = FastMCP(name="Indeed Web Scraper Agent")
_INDEED_BASE_URL = "https://ca.indeed.com"


def _parse_job_cards(html, limit=None):
    """Extract job postings from the HTML of an Indeed results page.

    Args:
        html (str): Raw page HTML; ``None`` or empty is treated as no content.
        limit (int, optional): Maximum number of cards to read; ``None`` reads all.

    Returns:
        list: Dicts with ``title``, ``company``, ``location`` and ``url`` keys.
    """
    # Local imports keep this helper self-contained, mirroring the original
    # function-scope import of BeautifulSoup.
    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    def text_or(elem, fallback):
        # Missing elements fall back to a placeholder instead of raising.
        return elem.get_text(strip=True) if elem else fallback

    soup = BeautifulSoup(html or "", "html.parser")
    jobs = []
    for card in soup.find_all("a", class_="tapItem")[:limit]:
        href = card.get("href")
        jobs.append(
            {
                "title": text_or(card.find("h2", class_="jobTitle"), "No Title"),
                "company": text_or(card.find("span", class_="companyName"), "Unknown Company"),
                "location": text_or(card.find("div", class_="companyLocation"), "Unknown Location"),
                # urljoin leaves absolute links alone and resolves relative ones
                # correctly whether or not they start with a slash.
                "url": urljoin(_INDEED_BASE_URL, href) if href else "#",
            }
        )
    return jobs


def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
    """
    Scrape jobs from Indeed website using crawl4ai based on query and location.

    Args:
        query (str): Job title or keywords to search for.
        location (str): Location (city, region) to search jobs in.
        limit (int): Number of job results to return (default 10). Values are
            coerced with ``int()``; negatives are clamped to 0; ``None`` means
            no cap.
        salary (str, optional): Not used in scraper, reserved for future.
        job_type (str, optional): Not used in scraper, reserved for future.

    Returns:
        dict: ``{"jobs": [...]}`` where each job has title, company, location
        and url, or ``{"error": message}`` when the limit is invalid, the crawl
        reports failure, or an exception is raised while crawling/parsing.
    """
    if limit is not None:
        try:
            # A negative slice bound would silently drop trailing cards, and a
            # float/str bound would raise inside the slice, so normalise first.
            limit = max(int(limit), 0)
        except (TypeError, ValueError, OverflowError):
            return {"error": f"Invalid limit: {limit!r} (expected a non-negative integer)"}
        if limit == 0:
            # Nothing requested: skip launching a browser altogether.
            return {"jobs": []}

    # Build query parameters
    params = {
        "q": query,
        "l": location,
        "sort": "date",  # sort by most recent
    }
    url = f"{_INDEED_BASE_URL}/jobs?{urlencode(params)}"

    async def crawl_indeed():
        browser_cfg = BrowserConfig(headless=True, text_mode=True)
        crawler_cfg = CrawlerRunConfig(
            scan_full_page=True,
            delay_before_return_html=2.0,
            cache_mode=CacheMode.BYPASS,
            remove_overlay_elements=True,
            exclude_external_links=True,
            exclude_social_media_links=True,
        )
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            result = await crawler.arun(url, config=crawler_cfg)
        if not result.success:
            return {"error": result.error_message or "Crawl failed without an error message."}
        return {"jobs": _parse_job_cards(result.html, limit)}

    # Run async crawl in sync context (FastMCP expects sync)
    try:
        return asyncio.run(crawl_indeed())
    except Exception as exc:
        # Tool boundary: callers branch on the "error" key, so report failures
        # through the same dict contract instead of letting them propagate.
        return {"error": f"{type(exc).__name__}: {exc}"}
def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
    """
    Gradio UI handler that runs the Indeed scrape and formats the result.

    Args:
        query (str): Job title or keyword.
        location (str): Job location.
        limit (int, optional): Number of jobs to retrieve (default 10).
        salary (str, optional): Not used.
        job_type (str, optional): Not used.

    Returns:
        str: Markdown-formatted list of jobs, a "no jobs" notice, or an error message.
    """
    result = search_jobs_tool(query, location, limit, salary, job_type)
    if "error" in result:
        # The leading symbol was mojibake in the source; restored as an emoji.
        return f"❌ Error: {result['error']}"

    jobs = result.get("jobs", [])
    if not jobs:
        return "No jobs found for your search."

    # One Markdown entry per job, joined in a single pass rather than
    # repeated string concatenation.
    return "".join(
        f"**{job['title']}** at *{job['company']}*\n"
        f"📍 {job['location']}\n"
        f"[Apply Here]({job['url']})\n\n"
        for job in jobs
    )
# Gradio front end: the five inputs below are passed, in order, to
# search_jobs_ui, and its returned string is rendered as Markdown.
app = gr.Interface(
    title="Indeed Job Search (Web Scraping with Crawl4AI) + FastMCP",
    description="Search jobs by scraping Indeed.ca with crawl4ai. Results sorted by most recent.",
    fn=search_jobs_ui,
    inputs=[
        gr.Textbox(label="Job Title / Keyword"),
        gr.Textbox(label="Location"),
        gr.Number(value=10, label="Number of Results (limit)", precision=0),
        gr.Textbox(label="Salary (optional, ignored)"),
        gr.Textbox(label="Job Type (optional, ignored)"),
    ],
    outputs="markdown",
)

if __name__ == "__main__":
    # Only start the app when executed as a script, with mcp_server enabled.
    app.launch(mcp_server=True)