OppaAI committed on
Commit
064e5aa
·
verified ·
1 Parent(s): 55479d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -60
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import os
2
  import gradio as gr
3
- import asyncio
 
4
  from urllib.parse import urlencode
5
  from fastmcp import FastMCP
6
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
7
 
8
  # Initialize FastMCP agent
9
  mcp = FastMCP("Indeed Web Scraper Agent")
@@ -11,7 +11,7 @@ mcp = FastMCP("Indeed Web Scraper Agent")
11
  @mcp.tool(name="search_jobs")
12
  def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
13
  """
14
- Scrape jobs from Indeed website using crawl4ai based on query, location, and optional filters.
15
 
16
  Args:
17
  query (str): Job title or keywords to search for.
@@ -25,71 +25,48 @@ def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = N
25
  """
26
  base_url = "https://ca.indeed.com/jobs?"
27
 
28
- # Build query parameters
29
  params = {
30
  "q": query,
31
  "l": location,
32
- "sort": "date", # sort by most recent
33
  }
 
34
  url = base_url + urlencode(params)
 
 
 
35
 
36
- async def crawl_indeed():
37
- browser_cfg = BrowserConfig(headless=True, text_mode=True)
38
- crawler_cfg = CrawlerRunConfig(
39
- scan_full_page=True,
40
- delay_before_return_html=2.0,
41
- cache_mode=CacheMode.BYPASS,
42
- remove_overlay_elements=True,
43
- exclude_external_links=True,
44
- exclude_social_media_links=True
45
- )
46
-
47
- async with AsyncWebCrawler(config=browser_cfg) as crawler:
48
- result = await crawler.arun(url, config=crawler_cfg)
49
- if not result.success:
50
- return {"error": result.error_message}
51
- html = result.html
52
-
53
- # Parse job cards manually with BeautifulSoup from crawl4ai html
54
- from bs4 import BeautifulSoup
55
- soup = BeautifulSoup(html, "html.parser")
56
-
57
- jobs = []
58
- cards = soup.find_all("a", class_="tapItem")
59
- for card in cards[:limit]:
60
- title_elem = card.find("h2", class_="jobTitle")
61
- company_elem = card.find("span", class_="companyName")
62
- location_elem = card.find("div", class_="companyLocation")
63
- link = card.get("href")
64
- if link and not link.startswith("http"):
65
- link = "https://ca.indeed.com" + link
66
- job = {
67
- "title": title_elem.get_text(strip=True) if title_elem else "No Title",
68
- "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
69
- "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
70
- "url": link or "#",
71
- }
72
- jobs.append(job)
73
- return {"jobs": jobs}
74
-
75
- # Run async crawl in sync context (FastMCP expects sync)
76
- return asyncio.run(crawl_indeed())
77
 
 
78
 
79
- def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
80
- """
81
- Gradio UI handler for scraping Indeed jobs using FastMCP.
 
 
 
 
 
 
82
 
83
- Args:
84
- query (str): Job title or keyword.
85
- location (str): Job location.
86
- limit (int, optional): Number of jobs to retrieve (default 10).
87
- salary (str, optional): Not used.
88
- job_type (str, optional): Not used.
 
89
 
90
- Returns:
91
- str: Markdown-formatted list of jobs or error message.
92
- """
 
 
 
 
93
  result = search_jobs_tool(query, location, limit, salary, job_type)
94
 
95
  if "error" in result:
@@ -117,8 +94,8 @@ app = gr.Interface(
117
  gr.Textbox(label="Job Type (optional, ignored)")
118
  ],
119
  outputs="markdown",
120
- title="Indeed Job Search (Web Scraping with Crawl4AI) + FastMCP",
121
- description="Search jobs by scraping Indeed.ca with crawl4ai. Results sorted by most recent."
122
  )
123
 
124
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
  from urllib.parse import urlencode
6
  from fastmcp import FastMCP
 
7
 
8
  # Initialize FastMCP agent
9
  mcp = FastMCP("Indeed Web Scraper Agent")
 
11
@mcp.tool(name="search_jobs")
def search_jobs_tool(query: str, location: str, limit: int = 10, salary: str = None, job_type: str = None):
    """
    Scrape job postings from Indeed.ca using requests + BeautifulSoup.

    Args:
        query (str): Job title or keywords to search for.
        location (str): Location to search in.
        limit (int, optional): Maximum number of jobs to return. Defaults to 10.
        salary (str, optional): Accepted for interface compatibility; currently unused.
        job_type (str, optional): Accepted for interface compatibility; currently unused.

    Returns:
        dict: {"jobs": [...]} on success, where each job has "title", "company",
        "location" and "url" keys; {"error": "<message>"} on any failure.
    """
    base_url = "https://ca.indeed.com/jobs?"

    # Build the search URL; "sort=date" asks Indeed for most-recent-first results.
    params = {
        "q": query,
        "l": location,
        "sort": "date",
    }

    url = base_url + urlencode(params)

    # A browser-like User-Agent reduces the chance of being served a block page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    try:
        # BUG FIX: the original call had no timeout, so a stalled connection
        # would hang this tool (and the MCP server) indefinitely.
        # (connect, read) timeouts bound both phases of the request.
        response = requests.get(url, headers=headers, timeout=(5, 15))
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        jobs = []
        cards = soup.find_all("a", class_="tapItem")
        # max(0, limit) keeps a negative limit from silently dropping items
        # from the END of the list via negative slicing.
        for card in cards[:max(0, limit)]:
            title_elem = card.find("h2", class_="jobTitle")
            company_elem = card.find("span", class_="companyName")
            location_elem = card.find("div", class_="companyLocation")
            link = card.get("href")
            # Card hrefs are typically site-relative; make them absolute.
            if link and not link.startswith("http"):
                link = "https://ca.indeed.com" + link

            job = {
                "title": title_elem.get_text(strip=True) if title_elem else "No Title",
                "company": company_elem.get_text(strip=True) if company_elem else "Unknown Company",
                "location": location_elem.get_text(strip=True) if location_elem else "Unknown Location",
                "url": link or "#"
            }
            jobs.append(job)

        return {"jobs": jobs}

    except Exception as e:
        # Tool boundary: report failures as data so the MCP client / Gradio UI
        # can render the message instead of crashing on an exception.
        return {"error": str(e)}
67
+
68
+
69
+ def search_jobs_ui(query, location, limit=10, salary=None, job_type=None):
70
  result = search_jobs_tool(query, location, limit, salary, job_type)
71
 
72
  if "error" in result:
 
94
  gr.Textbox(label="Job Type (optional, ignored)")
95
  ],
96
  outputs="markdown",
97
+ title="Indeed Job Search (with BeautifulSoup) + FastMCP",
98
+ description="Search jobs by scraping Indeed.ca using requests and BeautifulSoup."
99
  )
100
 
101
  if __name__ == "__main__":