"""CrewAI tool that scrapes a web page and returns cleaned, size-bounded text."""

import logging
import re
from typing import Type

import requests
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

# Desktop Chrome user agent shared by the Playwright and requests paths.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Selectors for buttons that expand collapsed text.
_EXPAND_BUTTON_SELECTORS = (
    "button.show-more-less-button",
    "button[aria-label*='See more']",
    "button[aria-label*='Show more']",
    "button:has-text('See more')",
    "button:has-text('Show more')",
)

# Selectors tried, in order, to locate the job description on LinkedIn pages.
_LINKEDIN_JD_SELECTORS = (
    ".show-more-less-html__markup",
    ".description__text",
    "[data-test-id='job-description']",
    ".job-details-jobs-unified-top-card__job-description",
)

# A line containing any of these substrings (case-sensitive) is dropped.
_SKIP_PATTERNS = (
    "Sign in", "Join now", "Clear text", "Show more", "Show less",
    "Show fewer", "Similar jobs", "People also viewed", "Similar Searches",
    "More searches", "Agree & Join", "Set alert", "Get notified",
    "Forgot password?", "Cookie Policy", "Privacy Policy", "User Agreement",
    "Brand Policy", "Guest Controls", "Community Guidelines", "Get the app",
    "Referrals increase", "See who you know", "Sign in with Email",
    "New to LinkedIn", "open jobs", "LinkedIn is better on the app",
    "By clicking Continue", "Email or phone", "Password", "Expand search",
    "Skip to main content",
)

# Language-picker entries; "(Chinese" is intentionally left unclosed so it
# matches every "(Chinese ...)" variant.
_LANGUAGE_PATTERNS = (
    "(Arabic)", "(Bangla)", "(Czech)", "(Danish)", "(German)", "(Greek)",
    "(English)", "(Spanish)", "(Persian)", "(Finnish)", "(French)", "(Hindi)",
    "(Hungarian)", "(Indonesian)", "(Italian)", "(Hebrew)", "(Japanese)",
    "(Korean)", "(Marathi)", "(Malay)", "(Dutch)", "(Norwegian)", "(Punjabi)",
    "(Polish)", "(Portuguese)", "(Romanian)", "(Russian)", "(Swedish)",
    "(Telugu)", "(Thai)", "(Tagalog)", "(Turkish)", "(Ukrainian)",
    "(Vietnamese)", "(Chinese", "Language",
)

_BLANK_RUN_RE = re.compile(r"\n{3,}")


class WebScraperInput(BaseModel):
    """Argument schema for :class:`CleanWebScraperTool`."""

    website_url: str = Field(description="The URL of the website to scrape")


class CleanWebScraperTool(BaseTool):
    """Scrapes a website using a Playwright headless browser so JavaScript
    renders and 'See more' buttons get clicked before extraction.

    Falls back to a plain requests fetch if the Playwright path raises for
    any reason (Playwright not installed, navigation failure, ...).
    """

    name: str = "Scrape and Clean Website"
    description: str = (
        "Scrapes a website URL using a headless browser, expands collapsed "
        "sections (e.g. LinkedIn 'See more'), and returns cleaned text content."
    )
    args_schema: Type[BaseModel] = WebScraperInput

    def _run(self, website_url: str) -> str:
        """Scrape ``website_url`` and return its cleaned text.

        The Playwright path is tried first; on any exception the requests
        fallback is used. If that fails too, an error string is returned
        instead of raising.
        """
        try:
            return self._scrape_with_playwright(website_url)
        except Exception as playwright_error:
            # Deliberate best-effort: record why the browser path failed
            # instead of discarding the cause, then try the plain fetch.
            logger.warning(
                "Playwright scrape failed for %s (%s); falling back to requests",
                website_url,
                playwright_error,
            )
        try:
            return self._scrape_with_requests(website_url)
        except Exception as e:
            return f"Error scraping website: {e}"

    # ------------------------------------------------------------------
    # Playwright path
    # ------------------------------------------------------------------

    def _scrape_with_playwright(self, url: str) -> str:
        """Render ``url`` in headless Chromium and return the cleaned text.

        For URLs containing ``linkedin.com`` the job-description block is
        preferred; otherwise (or when it is not found) the whole ``<body>``
        text is used.

        Raises:
            Exception: any import, launch, navigation or extraction error;
                the caller treats these as a signal to use the fallback.
        """
        # Imported lazily so the module still loads when Playwright is absent.
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # Everything after launch sits inside the try so the browser is
            # closed exactly once, whatever step fails.
            try:
                context = browser.new_context(
                    user_agent=_USER_AGENT,
                    viewport={"width": 1280, "height": 800},
                )
                page = context.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                # Give client-side rendering a moment to settle.
                page.wait_for_timeout(2000)
                self._expand_see_more(page)

                text = ""
                if "linkedin.com" in url:
                    text = self._extract_linkedin_jd(page)
                if not text:
                    text = page.locator("body").inner_text(timeout=10000)
            finally:
                browser.close()
        return self._clean_content(text)

    def _expand_see_more(self, page) -> None:
        """Click any visible 'See more' / 'Show more' expand buttons.

        Best-effort: a failure while handling one selector is logged and the
        remaining selectors are still tried.
        """
        for selector in _EXPAND_BUTTON_SELECTORS:
            try:
                for btn in page.locator(selector).all():
                    if btn.is_visible():
                        btn.click(timeout=2000)
                        # Short pause so the expanded content can render.
                        page.wait_for_timeout(500)
            except Exception as exc:
                logger.debug("Expanding %r failed: %s", selector, exc)

    def _extract_linkedin_jd(self, page) -> str:
        """Extract only the job description block from a LinkedIn page.

        Returns:
            The inner text of the first selector whose element becomes
            visible, or ``""`` when none does.
        """
        for selector in _LINKEDIN_JD_SELECTORS:
            try:
                el = page.locator(selector).first
                # is_visible() returns immediately and ignores its timeout
                # argument, so wait explicitly for the element instead.
                el.wait_for(state="visible", timeout=2000)
                return el.inner_text(timeout=3000)
            except Exception as exc:
                logger.debug("LinkedIn selector %r not usable: %s", selector, exc)
        return ""

    # ------------------------------------------------------------------
    # Requests fallback
    # ------------------------------------------------------------------

    def _scrape_with_requests(self, url: str) -> str:
        """Fetch ``url`` without a browser and return the cleaned text.

        Raises:
            requests.RequestException: on connection problems, timeouts, or
                an HTTP error status.
        """
        headers = {
            "User-Agent": _USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        }
        response = requests.get(url, headers=headers, timeout=15)
        # Do not present a 4xx/5xx error page as scraped content.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Newline separator keeps one text node per line, which the
        # line-based noise filter in _clean_content relies on.
        return self._clean_content(soup.get_text("\n"))

    # ------------------------------------------------------------------
    # Shared cleaning
    # ------------------------------------------------------------------

    def _clean_content(self, raw_text: str, max_chars: int = 5000) -> str:
        """Remove navigation noise and truncate to control token usage.

        Args:
            raw_text: Text extracted from the page; may be empty.
            max_chars: Maximum number of characters kept before the
                truncation marker is appended.

        Returns:
            The surviving lines joined by newlines. Lines shorter than three
            characters, or containing any skip/language pattern, are dropped.
        """
        if not raw_text:
            return ""

        kept = []
        for line in raw_text.split("\n"):
            stripped = line.strip()
            if len(stripped) < 3:
                continue
            if any(p in stripped for p in _SKIP_PATTERNS):
                continue
            if any(p in stripped for p in _LANGUAGE_PATTERNS):
                continue
            kept.append(stripped)

        result = "\n".join(kept)
        # No-op with the current filter (blank lines are already dropped);
        # kept as a guard in case the filter is relaxed later.
        result = _BLANK_RUN_RE.sub("\n\n", result)

        if len(result) > max_chars:
            result = result[:max_chars] + "\n\n[Content truncated to save tokens]"
        return result