Spaces:

Chirag20
/

cvmatcher

Running

App Files Files Community

cvmatcher / src /tools /web_scraper.py

Chirag20

Initial deployment: job application intelligence agent

ec55f11 2 days ago

raw

history blame contribute delete

6.92 kB

	import logging
	import re
	from typing import Type

	import requests
	from bs4 import BeautifulSoup
	from crewai.tools import BaseTool
	from pydantic import BaseModel, Field


	class WebScraperInput(BaseModel):
	    # Argument schema for CleanWebScraperTool: a single required URL string.
	    website_url: str = Field(
	        ...,
	        description="The URL of the website to scrape",
	    )


	class CleanWebScraperTool(BaseTool):
	    """Scrape a web page and return cleaned, length-limited text.

	    The page is first rendered in a headless Chromium browser via Playwright
	    so JavaScript runs and collapsed 'See more' sections can be expanded
	    before extraction. If that path raises for any reason (Playwright not
	    installed, browser launch failure, navigation timeout, ...), the tool
	    falls back to a plain ``requests`` fetch parsed with BeautifulSoup.
	    """

	    name: str = "Scrape and Clean Website"
	    description: str = (
	        "Scrapes a website URL using a headless browser, expands collapsed "
	        "sections (e.g. LinkedIn 'See more'), and returns cleaned text content."
	    )
	    args_schema: Type[BaseModel] = WebScraperInput

	    def _run(self, website_url: str) -> str:
	        """Scrape ``website_url`` and return its cleaned text.

	        Tries the Playwright path first and the ``requests`` path second.
	        Never raises: if both paths fail, a string starting with
	        ``"Error scraping website: "`` describing the fallback failure is
	        returned instead.
	        """
	        try:
	            return self._scrape_with_playwright(website_url)
	        except Exception as playwright_error:
	            # Deliberately broad: a missing Playwright install, a browser
	            # launch failure and a navigation timeout all mean "use the
	            # fallback". Record the reason instead of discarding it.
	            logging.getLogger(__name__).warning(
	                "Playwright scrape of %s failed (%s); falling back to requests",
	                website_url,
	                playwright_error,
	            )

	        try:
	            return self._scrape_with_requests(website_url)
	        except Exception as e:
	            return f"Error scraping website: {e}"

	    # ------------------------------------------------------------------
	    # Playwright path
	    # ------------------------------------------------------------------

	    def _scrape_with_playwright(self, url: str) -> str:
	        """Render ``url`` in headless Chromium and return its cleaned text.

	        For URLs containing ``linkedin.com`` the job-description block is
	        preferred; if it cannot be found (or for any other site) the text of
	        the whole ``<body>`` is used. Any import, launch, navigation or
	        extraction error propagates to the caller.
	        """
	        # Imported lazily so the module still loads when Playwright is not
	        # installed; the resulting ImportError triggers the fallback in _run.
	        from playwright.sync_api import sync_playwright

	        with sync_playwright() as p:
	            browser = p.chromium.launch(headless=True)
	            # Everything after launch is inside the try so the browser is
	            # closed exactly once, on every exit path.
	            try:
	                context = browser.new_context(
	                    user_agent=(
	                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	                        "AppleWebKit/537.36 (KHTML, like Gecko) "
	                        "Chrome/120.0.0.0 Safari/537.36"
	                    ),
	                    viewport={"width": 1280, "height": 800},
	                )
	                page = context.new_page()
	                page.goto(url, wait_until="domcontentloaded", timeout=30000)
	                # Fixed pause to give client-side rendering time to run.
	                page.wait_for_timeout(2000)

	                self._expand_see_more(page)

	                text = ""
	                if "linkedin.com" in url:
	                    text = self._extract_linkedin_jd(page)
	                if not text:
	                    text = page.locator("body").inner_text(timeout=10000)
	            finally:
	                browser.close()

	        return self._clean_content(text)

	    def _expand_see_more(self, page) -> None:
	        """Click any visible 'See more' / 'Show more' expand buttons.

	        Best effort: a selector that cannot be queried or a button that
	        cannot be clicked is skipped (and logged at debug level) without
	        affecting the remaining buttons.
	        """
	        log = logging.getLogger(__name__)
	        selectors = [
	            "button.show-more-less-button",
	            "button[aria-label*='See more']",
	            "button[aria-label*='Show more']",
	            "button:has-text('See more')",
	            "button:has-text('Show more')",
	        ]
	        for selector in selectors:
	            try:
	                buttons = page.locator(selector).all()
	            except Exception as exc:
	                log.debug("Could not query %r: %s", selector, exc)
	                continue

	            for btn in buttons:
	                # Guard each button separately so one stale or covered
	                # button does not prevent expanding the others.
	                try:
	                    if btn.is_visible():
	                        btn.click(timeout=2000)
	                        # Short pause so the expanded content can render.
	                        page.wait_for_timeout(500)
	                except Exception as exc:
	                    log.debug("Could not click %r button: %s", selector, exc)

	    def _extract_linkedin_jd(self, page) -> str:
	        """Extract only the job description block from a LinkedIn page.

	        Returns the inner text of the first selector whose first match is
	        visible, or an empty string when none of the selectors yields text.
	        """
	        selectors = [
	            ".show-more-less-html__markup",
	            ".description__text",
	            "[data-test-id='job-description']",
	            ".job-details-jobs-unified-top-card__job-description",
	        ]
	        for selector in selectors:
	            try:
	                el = page.locator(selector).first
	                if el.is_visible(timeout=2000):
	                    return el.inner_text(timeout=3000)
	            except Exception as exc:
	                # Best effort: try the next candidate selector.
	                logging.getLogger(__name__).debug(
	                    "LinkedIn selector %r failed: %s", selector, exc
	                )
	        return ""

	    # ------------------------------------------------------------------
	    # Requests fallback
	    # ------------------------------------------------------------------

	    def _scrape_with_requests(self, url: str) -> str:
	        """Fetch ``url`` with ``requests`` and return its cleaned text.

	        Raises:
	            requests.RequestException: on connection errors, timeouts, or a
	                4xx/5xx response, so an error page is never returned as if
	                it were the scraped content.
	        """
	        headers = {
	            "User-Agent": (
	                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	                "AppleWebKit/537.36 (KHTML, like Gecko) "
	                "Chrome/120.0.0.0 Safari/537.36"
	            ),
	            # "*/*" is the catch-all media range; the previous "/;q=0.8"
	            # was not a valid Accept entry.
	            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	            "Accept-Language": "en-US,en;q=0.9",
	        }
	        response = requests.get(url, headers=headers, timeout=15)
	        response.raise_for_status()
	        soup = BeautifulSoup(response.text, "html.parser")
	        return self._clean_content(soup.get_text(" "))

	    # ------------------------------------------------------------------
	    # Shared cleaning
	    # ------------------------------------------------------------------

	    def _clean_content(self, raw_text: str, max_chars: int = 5000) -> str:
	        """Remove navigation noise and truncate to control token usage.

	        The text is split into lines. A line is dropped when, after
	        stripping, it is shorter than 3 characters, contains one of the known
	        navigation/footer fragments, or is exactly the language-picker label
	        ``"Language"``. The surviving lines are joined with newlines and, if
	        longer than ``max_chars``, cut there with a truncation marker
	        appended.

	        Args:
	            raw_text: Text extracted from the page; empty/None yields "".
	            max_chars: Maximum number of characters kept before the marker.

	        Returns:
	            The cleaned (and possibly truncated) text.
	        """
	        if not raw_text:
	            return ""

	        # Fragments matched case-sensitively anywhere in a line.
	        skip_patterns = (
	            "Sign in", "Join now", "Clear text", "Show more", "Show less",
	            "Show fewer", "Similar jobs", "People also viewed", "Similar Searches",
	            "More searches", "Agree & Join", "Set alert", "Get notified",
	            "Forgot password?", "Cookie Policy", "Privacy Policy",
	            "User Agreement", "Brand Policy", "Guest Controls",
	            "Community Guidelines", "Get the app", "Referrals increase",
	            "See who you know", "Sign in with Email", "New to LinkedIn",
	            "open jobs", "LinkedIn is better on the app",
	            "By clicking Continue", "Email or phone", "Password",
	            "Expand search", "Skip to main content",
	        )
	        # Entries of the language picker, e.g. "Deutsch (German)".
	        language_patterns = (
	            "(Arabic)", "(Bangla)", "(Czech)", "(Danish)", "(German)",
	            "(Greek)", "(English)", "(Spanish)", "(Persian)", "(Finnish)",
	            "(French)", "(Hindi)", "(Hungarian)", "(Indonesian)", "(Italian)",
	            "(Hebrew)", "(Japanese)", "(Korean)", "(Marathi)", "(Malay)",
	            "(Dutch)", "(Norwegian)", "(Punjabi)", "(Polish)", "(Portuguese)",
	            "(Romanian)", "(Russian)", "(Swedish)", "(Telugu)", "(Thai)",
	            "(Tagalog)", "(Turkish)", "(Ukrainian)", "(Vietnamese)",
	            "(Chinese",
	        )
	        # "Language" is matched only as a whole line: as a substring it also
	        # removed real content such as "Natural Language Processing" or
	        # "Languages: Python, Go".
	        exact_line_patterns = {"Language"}
	        noise_fragments = skip_patterns + language_patterns

	        cleaned = []
	        for line in raw_text.split("\n"):
	            stripped = line.strip()
	            if len(stripped) < 3:  # also covers empty lines
	                continue
	            if stripped in exact_line_patterns:
	                continue
	            if any(fragment in stripped for fragment in noise_fragments):
	                continue
	            cleaned.append(stripped)

	        # Every kept line is non-empty and contains no newline, so the joined
	        # text cannot contain blank-line runs that would need collapsing.
	        result = "\n".join(cleaned)

	        if len(result) > max_chars:
	            result = result[:max_chars] + "\n\n[Content truncated to save tokens]"

	        return result