# NOTE: The following lines are Hugging Face web-UI residue captured when this
# file was saved from the Spaces "raw" view; kept as comments so the file parses.
# OppaAI's picture
# Update app.py
# 5638525 verified
# raw
# history blame
# 4.56 kB
import asyncio
import os
from typing import Optional
from urllib.parse import urlencode

import gradio as gr
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from fastmcp import FastMCP
# Initialize FastMCP agent; tools registered via @mcp.tool become callable by MCP clients
mcp = FastMCP("Indeed Web Scraper Agent")
@mcp.tool(name="search_jobs")
def search_jobs_tool(query: str, location: str, limit: int = 10,
                     salary: Optional[str] = None, job_type: Optional[str] = None):
    """
    Scrape job postings from Indeed.ca with crawl4ai.

    Args:
        query: Job title or keywords to search for.
        location: Location (city, region) to search jobs in.
        limit: Maximum number of job results to return (default 10).
        salary: Not used by the scraper; reserved for future filtering.
        job_type: Not used by the scraper; reserved for future filtering.

    Returns:
        dict: ``{"jobs": [{title, company, location, url}, ...]}`` on success,
        or ``{"error": message}`` when the crawl fails.
    """
    # Gradio's Number component can deliver a float; slicing below needs an int.
    try:
        limit = max(0, int(limit))
    except (TypeError, ValueError):
        limit = 10

    base_url = "https://ca.indeed.com/jobs?"
    params = {
        "q": query,
        "l": location,
        "sort": "date",  # sort by most recent
    }
    url = base_url + urlencode(params)

    async def crawl_indeed():
        browser_cfg = BrowserConfig(headless=True, text_mode=True)
        crawler_cfg = CrawlerRunConfig(
            scan_full_page=True,
            delay_before_return_html=2.0,  # give JS-rendered cards time to appear
            cache_mode=CacheMode.BYPASS,   # always fetch fresh results
            remove_overlay_elements=True,
            exclude_external_links=True,
            exclude_social_media_links=True
        )
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            result = await crawler.arun(url, config=crawler_cfg)
            if not result.success:
                return {"error": result.error_message}
            # result.html can be None on some failures; guard before parsing.
            html = result.html or ""
            # Parse job cards manually with BeautifulSoup from crawl4ai html.
            # NOTE(review): the "tapItem" / "jobTitle" class names mirror Indeed's
            # markup at time of writing — verify against the live page if results
            # come back empty.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            jobs = []
            cards = soup.find_all("a", class_="tapItem")
            for card in cards[:limit]:
                title_elem = card.find("h2", class_="jobTitle")
                company_elem = card.find("span", class_="companyName")
                location_elem = card.find("div", class_="companyLocation")
                link = card.get("href")
                # Card hrefs are usually relative; make them absolute.
                if link and not link.startswith("http"):
                    link = "https://ca.indeed.com" + link
                job = {
                    "title": title_elem.get_text(strip=True) if title_elem else "No Title",
                    "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
                    "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
                    "url": link or "#",
                }
                jobs.append(job)
            return {"jobs": jobs}

    # Run async crawl in sync context (FastMCP expects sync)
    return asyncio.run(crawl_indeed())
def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
    """
    Gradio handler: run the Indeed scraper tool and render results as Markdown.

    Args:
        query (str): Job title or keyword.
        location (str): Job location.
        limit (int, optional): Number of jobs to retrieve (default 10).
        salary (str, optional): Ignored by the scraper.
        job_type (str, optional): Ignored by the scraper.

    Returns:
        str: Markdown-formatted job list, or an error / empty-result message.
    """
    result = search_jobs_tool(query, location, limit, salary, job_type)

    # Surface crawl failures directly to the user.
    if "error" in result:
        return f"โŒ Error: {result['error']}"

    jobs = result.get("jobs", [])
    if not jobs:
        return "No jobs found for your search."

    # Build one Markdown entry per job and join them in a single pass.
    entries = [
        f"**{job['title']}** at *{job['company']}*\n๐Ÿ“ {job['location']}\n[Apply Here]({job['url']})\n\n"
        for job in jobs
    ]
    return "".join(entries)
# Gradio Interface: simple form UI over search_jobs_ui.
app = gr.Interface(
    fn=search_jobs_ui,
    inputs=[
        gr.Textbox(label="Job Title / Keyword"),
        gr.Textbox(label="Location"),
        # precision=0 asks Gradio for an integer count of results
        gr.Number(value=10, label="Number of Results (limit)", precision=0),
        # The next two inputs are collected but ignored by the scraper (reserved).
        gr.Textbox(label="Salary (optional, ignored)"),
        gr.Textbox(label="Job Type (optional, ignored)")
    ],
    outputs="markdown",  # search_jobs_ui returns Markdown text
    title="Indeed Job Search (Web Scraping with Crawl4AI) + FastMCP",
    description="Search jobs by scraping Indeed.ca with crawl4ai. Results sorted by most recent."
)
if __name__ == "__main__":
    # mcp_server=True additionally exposes the app's tools over Gradio's MCP endpoint
    app.launch(mcp_server=True)