OppaAI committed on
Commit
5638525
·
verified ·
1 Parent(s): bbb7a7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -38
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import os
2
  import gradio as gr
3
- import requests
4
- import FastMCP
5
- from bs4 import BeautifulSoup
6
- from datetime import datetime
7
  from urllib.parse import urlencode
 
 
8
 
9
  # Initialize FastMCP agent
10
  mcp = FastMCP("Indeed Web Scraper Agent")
@@ -12,7 +11,7 @@ mcp = FastMCP("Indeed Web Scraper Agent")
12
  @mcp.tool(name="search_jobs")
13
  def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
14
  """
15
- Scrape jobs from Indeed website based on query, location, and optional filters.
16
 
17
  Args:
18
  query (str): Job title or keywords to search for.
@@ -34,38 +33,48 @@ def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = N
34
  }
35
  url = base_url + urlencode(params)
36
 
37
- headers = {
38
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
39
- }
40
-
41
- try:
42
- response = requests.get(url, headers=headers)
43
- response.raise_for_status()
44
- soup = BeautifulSoup(response.text, "html.parser")
45
-
46
- jobs = []
47
- cards = soup.find_all("a", class_="tapItem", limit=limit) # job cards
48
-
49
- for card in cards:
50
- title_elem = card.find("h2", class_="jobTitle")
51
- company_elem = card.find("span", class_="companyName")
52
- location_elem = card.find("div", class_="companyLocation")
53
- link = card.get("href")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- if not link.startswith("http"):
56
- link = "https://ca.indeed.com" + link
57
-
58
- job = {
59
- "title": title_elem.get_text(strip=True) if title_elem else "No Title",
60
- "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
61
- "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
62
- "url": link,
63
- }
64
- jobs.append(job)
65
-
66
- return {"jobs": jobs}
67
- except Exception as e:
68
- return {"error": str(e)}
69
 
70
  def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
71
  """
@@ -96,6 +105,7 @@ def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
96
 
97
  return output
98
 
 
99
  # Gradio Interface
100
  app = gr.Interface(
101
  fn=search_jobs_ui,
@@ -107,8 +117,8 @@ app = gr.Interface(
107
  gr.Textbox(label="Job Type (optional, ignored)")
108
  ],
109
  outputs="markdown",
110
- title="Indeed Job Search (Web Scraping) with FastMCP",
111
- description="Search jobs by scraping Indeed.ca. Results sorted by most recent."
112
  )
113
 
114
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
+ import asyncio
 
 
 
4
  from urllib.parse import urlencode
5
+ from fastmcp import FastMCP
6
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
7
 
8
  # Initialize FastMCP agent
9
  mcp = FastMCP("Indeed Web Scraper Agent")
 
11
  @mcp.tool(name="search_jobs")
12
  def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
13
  """
14
+ Scrape jobs from Indeed website using crawl4ai based on query, location, and optional filters.
15
 
16
  Args:
17
  query (str): Job title or keywords to search for.
 
33
  }
34
  url = base_url + urlencode(params)
35
 
36
+ async def crawl_indeed():
37
+ browser_cfg = BrowserConfig(headless=True, text_mode=True)
38
+ crawler_cfg = CrawlerRunConfig(
39
+ scan_full_page=True,
40
+ delay_before_return_html=2.0,
41
+ cache_mode=CacheMode.BYPASS,
42
+ remove_overlay_elements=True,
43
+ exclude_external_links=True,
44
+ exclude_social_media_links=True
45
+ )
46
+
47
+ async with AsyncWebCrawler(config=browser_cfg) as crawler:
48
+ result = await crawler.arun(url, config=crawler_cfg)
49
+ if not result.success:
50
+ return {"error": result.error_message}
51
+ html = result.html
52
+
53
+ # Parse job cards manually with BeautifulSoup from crawl4ai html
54
+ from bs4 import BeautifulSoup
55
+ soup = BeautifulSoup(html, "html.parser")
56
+
57
+ jobs = []
58
+ cards = soup.find_all("a", class_="tapItem")
59
+ for card in cards[:limit]:
60
+ title_elem = card.find("h2", class_="jobTitle")
61
+ company_elem = card.find("span", class_="companyName")
62
+ location_elem = card.find("div", class_="companyLocation")
63
+ link = card.get("href")
64
+ if link and not link.startswith("http"):
65
+ link = "https://ca.indeed.com" + link
66
+ job = {
67
+ "title": title_elem.get_text(strip=True) if title_elem else "No Title",
68
+ "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
69
+ "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
70
+ "url": link or "#",
71
+ }
72
+ jobs.append(job)
73
+ return {"jobs": jobs}
74
+
75
+ # Run async crawl in sync context (FastMCP expects sync)
76
+ return asyncio.run(crawl_indeed())
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
80
  """
 
105
 
106
  return output
107
 
108
+
109
  # Gradio Interface
110
  app = gr.Interface(
111
  fn=search_jobs_ui,
 
117
  gr.Textbox(label="Job Type (optional, ignored)")
118
  ],
119
  outputs="markdown",
120
+ title="Indeed Job Search (Web Scraping with Crawl4AI) + FastMCP",
121
+ description="Search jobs by scraping Indeed.ca with crawl4ai. Results sorted by most recent."
122
  )
123
 
124
  if __name__ == "__main__":