# Spaces: Chirag20 / cvmatcher (Sleeping) - File size: 6,918 Bytes - ec55f11

import re
import requests
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Type


# Argument schema for CleanWebScraperTool: a single string field holding the
# page address. Documented with comments rather than a docstring because
# pydantic surfaces a model's docstring in its generated JSON schema.
class WebScraperInput(BaseModel):
    website_url: str = Field(
        description="The URL of the website to scrape",
    )


class CleanWebScraperTool(BaseTool):
    """Scrapes a website using a Playwright headless browser so JavaScript
    renders and 'See more' buttons get clicked before extraction.
    Falls back to a plain requests fetch if the Playwright path raises for
    any reason (Playwright unavailable, launch or navigation failure, ...).

    Both paths pass their raw text through ``_clean_content``, which drops
    navigation/login boilerplate lines and caps the output length.
    """

    name: str = "Scrape and Clean Website"
    description: str = (
        "Scrapes a website URL using a headless browser, expands collapsed "
        "sections (e.g. LinkedIn 'See more'), and returns cleaned text content."
    )
    args_schema: Type[BaseModel] = WebScraperInput

    def _run(self, website_url: str) -> str:
        """Scrape ``website_url`` and return its cleaned text.

        Tries the headless-browser path first and the plain HTTP path second.
        Failures are reported in the returned string (prefixed with
        ``"Error scraping website:"``) instead of being raised.
        """
        try:
            return self._scrape_with_playwright(website_url)
        except Exception as browser_err:
            try:
                return self._scrape_with_requests(website_url)
            except Exception as fallback_err:
                # Report why the browser path failed too, but only its first
                # line: Playwright errors can span many lines.
                reason_lines = str(browser_err).strip().splitlines()
                browser_reason = (
                    reason_lines[0] if reason_lines else type(browser_err).__name__
                )
                return (
                    f"Error scraping website: {fallback_err} "
                    f"(headless browser attempt also failed: {browser_reason})"
                )

    # ------------------------------------------------------------------
    # Playwright path
    # ------------------------------------------------------------------

    def _scrape_with_playwright(self, url: str) -> str:
        """Render ``url`` in headless Chromium and return its cleaned text.

        For LinkedIn hosts only the job-description block is returned when it
        can be located; otherwise the text of the whole ``<body>`` is used.
        Any exception propagates to the caller, which falls back to requests.
        """
        # Imported lazily so the module still loads without Playwright.
        from urllib.parse import urlparse

        from playwright.sync_api import sync_playwright

        # Match on the parsed hostname so a URL that merely mentions
        # "linkedin.com" in its path or query is not treated as LinkedIn.
        host = urlparse(url).hostname or ""
        is_linkedin = host == "linkedin.com" or host.endswith(".linkedin.com")

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/120.0.0.0 Safari/537.36"
                    ),
                    viewport={"width": 1280, "height": 800},
                )
                page = context.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                # Fixed pause after DOMContentLoaded before interacting.
                page.wait_for_timeout(2000)

                self._expand_see_more(page)

                text = self._extract_linkedin_jd(page) if is_linkedin else ""
                if not text:
                    text = page.locator("body").inner_text(timeout=10000)
            finally:
                # Single cleanup point for every exit path.
                browser.close()

        return self._clean_content(text)

    def _expand_see_more(self, page) -> None:
        """Click any visible 'See more' / 'Show more' expand buttons.

        ``page`` is the Playwright page being scraped. Expansion is
        best-effort: an error while handling one selector abandons that
        selector and moves on to the next one.
        """
        selectors = [
            "button.show-more-less-button",
            "button[aria-label*='See more']",
            "button[aria-label*='Show more']",
            "button:has-text('See more')",
            "button:has-text('Show more')",
        ]
        for selector in selectors:
            try:
                for btn in page.locator(selector).all():
                    if btn.is_visible():
                        btn.click(timeout=2000)
                        # Short pause after each click before continuing.
                        page.wait_for_timeout(500)
            except Exception:
                # Deliberately ignored: a failed click must not abort the
                # scrape, the page text is still extracted afterwards.
                continue

    def _extract_linkedin_jd(self, page) -> str:
        """Extract only the job description block from a LinkedIn page.

        Returns the inner text of the first candidate selector whose first
        match is visible, or ``""`` when none is, so the caller can fall
        back to the full page body.
        """
        selectors = [
            ".show-more-less-html__markup",
            ".description__text",
            "[data-test-id='job-description']",
            ".job-details-jobs-unified-top-card__job-description",
        ]
        for selector in selectors:
            try:
                el = page.locator(selector).first
                if el.is_visible(timeout=2000):
                    return el.inner_text(timeout=3000)
            except Exception:
                # Best-effort probe: try the next candidate selector.
                continue
        return ""

    # ------------------------------------------------------------------
    # Requests fallback
    # ------------------------------------------------------------------

    def _scrape_with_requests(self, url: str) -> str:
        """Fetch ``url`` over plain HTTP and return its cleaned text.

        Raises ``requests.HTTPError`` on a 4xx/5xx response so an error page
        is reported as a failure instead of being returned as page content.
        Other ``requests`` exceptions (timeout, connection error) propagate.
        """
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return self._clean_content(soup.get_text(" "))

    # ------------------------------------------------------------------
    # Shared cleaning
    # ------------------------------------------------------------------

    def _clean_content(self, raw_text: str, max_chars: int = 5000) -> str:
        """Remove navigation noise and truncate to control token usage.

        The text is processed line by line. A line is dropped when it is
        shorter than 3 characters after stripping, when it contains one of
        the boilerplate phrases below, or when it is exactly one of the
        generic one-word labels. Kept lines are stripped and joined with
        newlines.

        Args:
            raw_text: Text extracted from the page; may be empty.
            max_chars: Length cap applied to the cleaned text. The truncation
                marker is appended after the cap, so a truncated result is
                slightly longer than ``max_chars``.

        Returns:
            The cleaned text, or ``""`` when ``raw_text`` is empty.
        """
        if not raw_text:
            return ""

        # Phrases that mark a line as boilerplate wherever they appear in it.
        noise_phrases = (
            "Sign in", "Join now", "Clear text", "Show more", "Show less",
            "Show fewer", "Similar jobs", "People also viewed", "Similar Searches",
            "More searches", "Agree & Join", "Set alert", "Get notified",
            "Forgot password?", "Cookie Policy", "Privacy Policy",
            "User Agreement", "Brand Policy", "Guest Controls",
            "Community Guidelines", "Get the app", "Referrals increase",
            "See who you know", "Sign in with Email", "New to LinkedIn",
            "open jobs", "LinkedIn is better on the app",
            "By clicking Continue", "Email or phone",
            "Expand search", "Skip to main content",
            # Entries of the language picker, e.g. "... (German)".
            "(Arabic)", "(Bangla)", "(Czech)", "(Danish)", "(German)",
            "(Greek)", "(English)", "(Spanish)", "(Persian)", "(Finnish)",
            "(French)", "(Hindi)", "(Hungarian)", "(Indonesian)", "(Italian)",
            "(Hebrew)", "(Japanese)", "(Korean)", "(Marathi)", "(Malay)",
            "(Dutch)", "(Norwegian)", "(Punjabi)", "(Polish)", "(Portuguese)",
            "(Romanian)", "(Russian)", "(Swedish)", "(Telugu)", "(Thai)",
            "(Tagalog)", "(Turkish)", "(Ukrainian)", "(Vietnamese)",
            "(Chinese",
        )
        # Generic words are only noise when they are the entire line (a form
        # label or menu button). Matching them as substrings would discard
        # real content such as "Languages: English, German".
        noise_labels = {"Password", "Language"}

        kept = []
        for line in raw_text.split("\n"):
            stripped = line.strip()
            if len(stripped) < 3:
                continue
            if stripped in noise_labels:
                continue
            if any(phrase in stripped for phrase in noise_phrases):
                continue
            kept.append(stripped)

        # Blank lines were already dropped above, so no further collapsing of
        # newline runs is needed after the join.
        result = "\n".join(kept)

        if len(result) > max_chars:
            result = result[:max_chars] + "\n\n[Content truncated to save tokens]"

        return result