OppaAI committed on
Commit
5638525
·
verified ·
1 Parent(s): bbb7a7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -38
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import os
2
  import gradio as gr
3
- import requests
4
- import FastMCP
5
- from bs4 import BeautifulSoup
6
- from datetime import datetime
7
  from urllib.parse import urlencode
 
 
8
 
9
  # Initialize FastMCP agent
10
  mcp = FastMCP("Indeed Web Scraper Agent")
@@ -12,7 +11,7 @@ mcp = FastMCP("Indeed Web Scraper Agent")
12
  @mcp.tool(name="search_jobs")
13
  def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
14
  """
15
- Scrape jobs from Indeed website based on query, location, and optional filters.
16
 
17
  Args:
18
  query (str): Job title or keywords to search for.
@@ -34,38 +33,48 @@ def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = N
34
  }
35
  url = base_url + urlencode(params)
36
 
37
- headers = {
38
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
39
- }
40
-
41
- try:
42
- response = requests.get(url, headers=headers)
43
- response.raise_for_status()
44
- soup = BeautifulSoup(response.text, "html.parser")
45
-
46
- jobs = []
47
- cards = soup.find_all("a", class_="tapItem", limit=limit) # job cards
48
-
49
- for card in cards:
50
- title_elem = card.find("h2", class_="jobTitle")
51
- company_elem = card.find("span", class_="companyName")
52
- location_elem = card.find("div", class_="companyLocation")
53
- link = card.get("href")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- if not link.startswith("http"):
56
- link = "https://ca.indeed.com" + link
57
-
58
- job = {
59
- "title": title_elem.get_text(strip=True) if title_elem else "No Title",
60
- "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
61
- "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
62
- "url": link,
63
- }
64
- jobs.append(job)
65
-
66
- return {"jobs": jobs}
67
- except Exception as e:
68
- return {"error": str(e)}
69
 
70
  def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
71
  """
@@ -96,6 +105,7 @@ def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
96
 
97
  return output
98
 
 
99
  # Gradio Interface
100
  app = gr.Interface(
101
  fn=search_jobs_ui,
@@ -107,8 +117,8 @@ app = gr.Interface(
107
  gr.Textbox(label="Job Type (optional, ignored)")
108
  ],
109
  outputs="markdown",
110
- title="Indeed Job Search (Web Scraping) with FastMCP",
111
- description="Search jobs by scraping Indeed.ca. Results sorted by most recent."
112
  )
113
 
114
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
+ import asyncio
 
 
 
4
  from urllib.parse import urlencode
5
+ from fastmcp import FastMCP
6
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
7
 
8
  # Initialize FastMCP agent
9
  mcp = FastMCP("Indeed Web Scraper Agent")
 
11
  @mcp.tool(name="search_jobs")
12
  def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
13
  """
14
+ Scrape jobs from Indeed website using crawl4ai based on query, location, and optional filters.
15
 
16
  Args:
17
  query (str): Job title or keywords to search for.
 
33
  }
34
  url = base_url + urlencode(params)
35
 
36
+ async def crawl_indeed():
37
+ browser_cfg = BrowserConfig(headless=True, text_mode=True)
38
+ crawler_cfg = CrawlerRunConfig(
39
+ scan_full_page=True,
40
+ delay_before_return_html=2.0,
41
+ cache_mode=CacheMode.BYPASS,
42
+ remove_overlay_elements=True,
43
+ exclude_external_links=True,
44
+ exclude_social_media_links=True
45
+ )
46
+
47
+ async with AsyncWebCrawler(config=browser_cfg) as crawler:
48
+ result = await crawler.arun(url, config=crawler_cfg)
49
+ if not result.success:
50
+ return {"error": result.error_message}
51
+ html = result.html
52
+
53
+ # Parse job cards manually with BeautifulSoup from crawl4ai html
54
+ from bs4 import BeautifulSoup
55
+ soup = BeautifulSoup(html, "html.parser")
56
+
57
+ jobs = []
58
+ cards = soup.find_all("a", class_="tapItem")
59
+ for card in cards[:limit]:
60
+ title_elem = card.find("h2", class_="jobTitle")
61
+ company_elem = card.find("span", class_="companyName")
62
+ location_elem = card.find("div", class_="companyLocation")
63
+ link = card.get("href")
64
+ if link and not link.startswith("http"):
65
+ link = "https://ca.indeed.com" + link
66
+ job = {
67
+ "title": title_elem.get_text(strip=True) if title_elem else "No Title",
68
+ "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
69
+ "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
70
+ "url": link or "#",
71
+ }
72
+ jobs.append(job)
73
+ return {"jobs": jobs}
74
+
75
+ # Run async crawl in sync context (FastMCP expects sync)
76
+ return asyncio.run(crawl_indeed())
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
80
  """
 
105
 
106
  return output
107
 
108
+
109
  # Gradio Interface
110
  app = gr.Interface(
111
  fn=search_jobs_ui,
 
117
  gr.Textbox(label="Job Type (optional, ignored)")
118
  ],
119
  outputs="markdown",
120
+ title="Indeed Job Search (Web Scraping with Crawl4AI) + FastMCP",
121
+ description="Search jobs by scraping Indeed.ca with crawl4ai. Results sorted by most recent."
122
  )
123
 
124
  if __name__ == "__main__":