| import re |
| import requests |
| from bs4 import BeautifulSoup |
| from crewai.tools import BaseTool |
| from pydantic import BaseModel, Field |
| from typing import Type |
|
|
|
|
class WebScraperInput(BaseModel):
    # Argument schema for the scraper tool: exactly one required field.
    # (Kept as a comment rather than a docstring so the generated JSON
    # schema stays the same as before.)
    website_url: str = Field(
        ...,  # Ellipsis marks the field as required (no default value).
        description="The URL of the website to scrape",
    )
|
|
|
|
class CleanWebScraperTool(BaseTool):
    """Scrape a web page and return cleaned, length-limited plain text.

    The page is first loaded in a headless Chromium browser via Playwright so
    that JavaScript renders and 'See more' / 'Show more' buttons can be
    clicked before text extraction.  If that path raises for any reason
    (Playwright missing, launch failure, navigation timeout, ...), the tool
    falls back to a plain ``requests`` fetch parsed with BeautifulSoup.
    """

    name: str = "Scrape and Clean Website"
    description: str = (
        "Scrapes a website URL using a headless browser, expands collapsed "
        "sections (e.g. LinkedIn 'See more'), and returns cleaned text content."
    )
    args_schema: Type[BaseModel] = WebScraperInput

    def _run(self, website_url: str) -> str:
        """Scrape ``website_url`` and return its cleaned text.

        Tries the Playwright path first and the ``requests`` path second.
        Never raises: if both paths fail, a string starting with
        ``"Error scraping website: "`` is returned instead.
        """
        # Function-scope import, mirroring how this class imports Playwright.
        import logging

        try:
            return self._scrape_with_playwright(website_url)
        except Exception as browser_err:
            # The fallback is intentional, but record why the browser path
            # failed instead of discarding the error silently.
            logging.getLogger(__name__).warning(
                "Playwright scrape of %s failed (%s); falling back to requests",
                website_url,
                browser_err,
            )

        try:
            return self._scrape_with_requests(website_url)
        except Exception as e:
            return f"Error scraping website: {e}"

    def _scrape_with_playwright(self, url: str) -> str:
        """Render ``url`` in headless Chromium and return its cleaned text.

        For URLs containing ``linkedin.com`` the job-description block is
        preferred; otherwise (or if that block is not found) the text of the
        whole ``<body>`` is used.  Any Playwright error propagates to the
        caller.
        """
        # Imported lazily so the module still loads when Playwright is absent.
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # Everything after launch sits inside the try so the browser is
            # closed exactly once, whichever way this block is left.
            try:
                context = browser.new_context(
                    user_agent=(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/120.0.0.0 Safari/537.36"
                    ),
                    viewport={"width": 1280, "height": 800},
                )
                page = context.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                # Fixed pause to give client-side scripts time to render.
                page.wait_for_timeout(2000)

                self._expand_see_more(page)

                text = ""
                if "linkedin.com" in url:
                    text = self._extract_linkedin_jd(page)
                if not text:
                    text = page.locator("body").inner_text(timeout=10000)
            finally:
                browser.close()

        return self._clean_content(text)

    def _expand_see_more(self, page) -> None:
        """Click any visible 'See more' / 'Show more' expand buttons.

        Best effort only: errors are swallowed so a page without such buttons
        (or with unclickable ones) never aborts the scrape.
        """
        selectors = [
            "button.show-more-less-button",
            "button[aria-label*='See more']",
            "button[aria-label*='Show more']",
            "button:has-text('See more')",
            "button:has-text('Show more')",
        ]
        for selector in selectors:
            # The try deliberately wraps the whole inner loop: after the first
            # failed click the remaining matches for this selector are
            # skipped, so at most one click timeout is paid per selector.
            try:
                for btn in page.locator(selector).all():
                    if btn.is_visible():
                        btn.click(timeout=2000)
                        page.wait_for_timeout(500)
            except Exception:
                continue

    def _extract_linkedin_jd(self, page) -> str:
        """Extract only the job description block from a LinkedIn page.

        Returns the inner text of the first selector whose first match is
        visible, or ``""`` when none of the known selectors matches.
        """
        selectors = [
            ".show-more-less-html__markup",
            ".description__text",
            "[data-test-id='job-description']",
            ".job-details-jobs-unified-top-card__job-description",
        ]
        for selector in selectors:
            try:
                el = page.locator(selector).first
                if el.is_visible(timeout=2000):
                    return el.inner_text(timeout=3000)
            except Exception:
                # A failing selector just means "try the next one".
                continue
        return ""

    def _scrape_with_requests(self, url: str) -> str:
        """Fetch ``url`` with ``requests`` and return its cleaned text.

        Raises whatever ``requests`` raises, including ``HTTPError`` for a
        4xx/5xx response, so that an error page is reported as a failure
        rather than returned as if it were the scraped content.
        """
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return self._clean_content(soup.get_text(" "))

    def _clean_content(self, raw_text: str, max_chars: int = 5000) -> str:
        """Remove navigation noise and truncate to control token usage.

        Args:
            raw_text: Text extracted from the page; may be empty or ``None``.
            max_chars: Maximum number of characters kept before a truncation
                marker is appended.

        Returns:
            The remaining lines, stripped and joined with ``"\\n"``.  Lines
            shorter than 3 characters, lines containing a known boilerplate
            phrase and lines that consist solely of a generic UI label are
            dropped.  Returns ``""`` for empty input.
        """
        if not raw_text:
            return ""

        # Boilerplate phrases: a line containing any of these is dropped.
        skip_patterns = (
            "Sign in", "Join now", "Clear text", "Show more", "Show less",
            "Show fewer", "Similar jobs", "People also viewed", "Similar Searches",
            "More searches", "Agree & Join", "Set alert", "Get notified",
            "Forgot password?", "Cookie Policy", "Privacy Policy",
            "User Agreement", "Brand Policy", "Guest Controls",
            "Community Guidelines", "Get the app", "Referrals increase",
            "See who you know", "Sign in with Email", "New to LinkedIn",
            "open jobs", "LinkedIn is better on the app",
            "By clicking Continue", "Email or phone",
            "Expand search", "Skip to main content",
        )
        # Entries of a language picker, e.g. "Deutsch (German)".
        language_patterns = (
            "(Arabic)", "(Bangla)", "(Czech)", "(Danish)", "(German)",
            "(Greek)", "(English)", "(Spanish)", "(Persian)", "(Finnish)",
            "(French)", "(Hindi)", "(Hungarian)", "(Indonesian)", "(Italian)",
            "(Hebrew)", "(Japanese)", "(Korean)", "(Marathi)", "(Malay)",
            "(Dutch)", "(Norwegian)", "(Punjabi)", "(Polish)", "(Portuguese)",
            "(Romanian)", "(Russian)", "(Swedish)", "(Telugu)", "(Thai)",
            "(Tagalog)", "(Turkish)", "(Ukrainian)", "(Vietnamese)",
            "(Chinese",
        )
        # Generic single-word labels are only noise when they are the whole
        # line.  Matching them as substrings would also delete real content
        # such as "Languages: Python, Go" or "Natural Language Processing".
        exact_labels = {"Language", "Password"}
        noise_fragments = skip_patterns + language_patterns

        cleaned = []
        for line in raw_text.split("\n"):
            stripped = line.strip()
            if len(stripped) < 3:  # also covers empty lines
                continue
            if stripped in exact_labels:
                continue
            if any(fragment in stripped for fragment in noise_fragments):
                continue
            cleaned.append(stripped)

        # Blank lines were filtered out above, so the joined text can never
        # contain runs of newlines that would need collapsing.
        result = "\n".join(cleaned)

        if len(result) > max_chars:
            result = result[:max_chars] + "\n\n[Content truncated to save tokens]"

        return result
|
|