# Source: api-web-crawler / app / util / browser_agent.py
# Author: mrfirdauss — commit 5705468 ("fix update content")
import asyncio
import os
import tempfile
from enum import StrEnum
from typing import Any, List, Optional
from urllib.parse import urljoin, urlparse, urldefrag

import pymupdf
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError
from playwright_stealth import Stealth
from pydantic import BaseModel, Field
class Status(StrEnum):
RELEVANT = "RELEVANT"
IRRELEVANT = "IRRELEVANT"
FAILED = "FAILED"
class LinkOverview(BaseModel):
summary: str = Field(..., description="A brief summary of the link's content, maximum 1 paragraph.")
SLA: Optional[str] = Field(None, description="Service Level Agreement in days of visa, if mentioned.")
required_docs: Optional[str] = Field(None, description="List of required documents as text, if mentioned.")
price: Optional[str] = Field(None, description="Price or fee information, if mentioned.")
details: Optional[str] = Field(None, description="Additional details about the link's content. 3-5 paragraphs max.")
status: Status = Field(Status.FAILED, description="Overall status of the analysis.")
class LinkNode(BaseModel):
href: str = Field(..., description="The URL of the link")
overview: LinkOverview = Field(..., description="Summary and details about the link's content")
parent: Optional[str] = Field(None, description="The parent link, where this link was found (source).")
child: List[str] = Field(..., description="List of links found on this page")
depth: int = Field(..., description="Depth level in the link hierarchy (0=root, 1=child of root, etc.)")
raw_text: Optional[str] = None # Field to store scraped text before analysis
class BrowserAgent:
def __init__(self, model: any, max_depth: int = 2):
self.model = model
self.max_depth = max_depth
self.link_map = {}
self.browser = None
self.context = None
self.stealth_manager = None
async def __aenter__(self):
"""Initializes the browser using the new Stealth context manager pattern."""
self.stealth_manager = Stealth().use_async(async_playwright())
self.p = await self.stealth_manager.__aenter__()
self.browser = await self.p.chromium.launch(
headless=True,
args=[
'--disable-blink-features=AutomationControlled',
'--disable-dev-shm-usage',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'
]
)
self.context = await self.browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080},
locale='en-US',
timezone_id='America/New_York'
)
print("🚀 Browser agent initialized with Stealth API.")
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Closes the browser and cleans up all resources in reverse order."""
if self.context: await self.context.close()
if self.browser: await self.browser.close()
if self.stealth_manager: await self.stealth_manager.__aexit__(exc_type, exc_val, exc_tb)
print("✅ Browser agent shut down gracefully.")
async def run(self, start_url: str):
"""Public method to start the full process: crawl then analyze."""
if not start_url.startswith("http"):
start_url = "http://" + start_url
print(f"Starting crawl from: {start_url}")
await self._explore(url=start_url, depth=0, parent_url=None)
print("\n--- Crawl Complete. Starting AI Analysis ---")
await self.analyze_map()
return self.link_map
async def _explore(self, url: str, depth: int, parent_url: Optional[str]):
"""Recursively scrapes text and finds links, without calling the LLM."""
url = urldefrag(url).url
if url in self.link_map or depth > self.max_depth:
return
print(f"Scraping URL: {url} (Depth: {depth})")
# Initialize with a default 'FAILED' status, which will be updated upon success
overview = LinkOverview(summary="Pending analysis...", status=Status.FAILED)
self.link_map[url] = LinkNode(href=url, overview=overview, parent=parent_url, child=[], depth=depth)
page = await self.context.new_page()
await page.add_init_script("""
// Override the navigator.webdriver property
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// Override chrome property
window.chrome = {
runtime: {}
};
// Override permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
""")
try:
content, soup, is_pdf = await self._get_page_content(page, url)
if content is not None:
self.link_map[url].raw_text = content
# The status is still pending; the AI will determine RELEVANT/IRRELEVANT later.
# We can leave the default status as FAILED for now, as it indicates a failure in the *overall* process until the AI succeeds.
is_relevant_for_crawl = "visa" in content.lower()
if not is_pdf and is_relevant_for_crawl:
child_links = self._find_child_links(soup, url)
self.link_map[url].child = child_links
tasks = [self._explore(link, depth + 1, url) for link in child_links if link not in self.link_map]
if tasks:
await asyncio.gather(*tasks)
else:
# Content retrieval failed, so we finalize the status as FAILED.
self.link_map[url].overview.summary = "Failed to retrieve or process page content."
self.link_map[url].overview.status = Status.FAILED # Explicitly confirm failure
finally:
await page.close()
async def analyze_map(self):
"""Iterates through the completed map and sends content to the LLM for analysis."""
tasks = []
for url, node in self.link_map.items():
if node.raw_text and node.overview.summary == "Pending analysis...":
tasks.append(self.analyze_node(url))
if tasks:
print(f"Found {len(tasks)} pages to analyze with the LLM...")
await asyncio.gather(*tasks)
async def analyze_node(self, url: str):
"""Helper function to analyze a single node."""
print(f" Analyzing content for: {url}")
node = self.link_map[url]
overview = await self._analyze_content(node.raw_text)
node.overview = overview
node.raw_text = None # Optional: clear text after analysis to save memory
async def _get_page_content(self, page, url: str):
"""Navigates to a URL and extracts its text content, handling various scenarios."""
NAVIGATION_TIMEOUT_MS = 60000 # Increased timeout
try:
page_text = ""
soup = None
is_pdf = False
# Navigate with a longer timeout and wait for domcontentloaded first
response = await page.goto(url, wait_until="domcontentloaded", timeout=NAVIGATION_TIMEOUT_MS)
if not response:
raise Exception("No response from server.")
content_type = response.headers.get("content-type", "").lower()
if "application/pdf" in content_type:
is_pdf = True
print("-> PDF detected (in-browser viewer)...")
pdf_bytes = await response.body()
doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
for p in doc:
page_text += p.get_text() + "\n"
doc.close()
else:
print("-> HTML detected...")
try:
# Wait for network to be idle
await page.wait_for_load_state("networkidle", timeout=15000)
except TimeoutError:
print(f" Warning: Timed out waiting for 'networkidle' on {url}.")
# Additional wait for JavaScript to execute
try:
await page.wait_for_load_state("load", timeout=10000)
except TimeoutError:
print(f" Warning: Timed out waiting for 'load' state on {url}.")
# Give extra time for dynamic content to render
await asyncio.sleep(2)
# Try to wait for body content to be present
try:
await page.wait_for_selector('body', timeout=5000)
except TimeoutError:
print(f" Warning: Body selector not found on {url}.")
# Check if there's a "JavaScript required" message
content_check = await page.content()
if "javascript" in content_check.lower() and "enable" in content_check.lower():
print(f" Warning: Page may require JavaScript to be enabled: {url}")
# Try scrolling to trigger lazy-loaded content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await asyncio.sleep(1)
await page.evaluate("window.scrollTo(0, 0)")
await asyncio.sleep(1)
page_content = await page.content()
soup = BeautifulSoup(page_content, 'html.parser')
# Remove unwanted elements
for el in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
el.decompose()
page_text = soup.get_text(separator=' ', strip=True)
# If page text is suspiciously short, it might be a JS-heavy page
if len(page_text.strip()) < 100:
print(f" Warning: Very little text content found ({len(page_text)} chars). Page may be JS-dependent.")
# Try getting inner text via JavaScript
try:
js_text = await page.evaluate("document.body.innerText")
if len(js_text) > len(page_text):
print(f" -> Using JavaScript-extracted text instead ({len(js_text)} chars)")
page_text = js_text
except Exception as e:
print(f" -> Could not extract text via JavaScript: {e}")
return page_text, soup, is_pdf
except Exception as e:
if "Download is starting" in str(e):
print(f"-> Page forced a download for {url}. Capturing the file...")
is_pdf = True
download = await page.wait_for_event("download", timeout=NAVIGATION_TIMEOUT_MS)
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
await download.save_as(tmp_file.name)
tmp_file_path = tmp_file.name
with open(tmp_file_path, "rb") as f:
pdf_bytes = f.read()
os.unlink(tmp_file_path)
doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
for p in doc:
page_text += p.get_text() + "\n"
doc.close()
return page_text, None, True
else:
print(f" Error: Failed to get content for {url}. Reason: {e}")
return None, None, None
def _find_child_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
"""Finds, filters, and resolves all valid child links on a page across any domain."""
links = []
for link in soup.find_all('a', href=True):
href = link['href'].strip()
if href.lower().startswith(('mailto:', 'tel:')):
continue
absolute_url = urljoin(base_url, href)
absolute_url = urldefrag(absolute_url).url
if absolute_url.startswith(('http://', 'https://')):
links.append(absolute_url)
return list(set(links))
async def _analyze_content(self, page_text: str) -> LinkOverview:
"""Sends page text to the LLM for structured analysis."""
llm_prompt = f"""
You are an expert visa information analyst. Your task is to meticulously analyze the following web page content (`page_text`) and extract all visa-related information into a structured format based on the provided `LinkOverview` schema.
**Core Directives:**
1. **Comprehensive Analysis:** If the text describes multiple visa types (e.g., Tourist, Business, Student), you must identify and describe each one completely. Do not merge or generalize information.
2. **Self-Contained Output:** The generated response must be exhaustive and self-contained. Extract all relevant details so the user does not need to visit the original webpage. Never instruct the user to "visit the link for more details."
3. **Complete Data Extraction:** For *every* visa type mentioned, you must extract its specific Service Level Agreement (SLA), price, and a full list of required documents. If information for a specific field is not mentioned for a visa type, explicitly state that.
**Field-Specific Formatting and Instructions:**
* **`summary`**: Provide a brief overview of all the visa services described on the page.
* **`details`**: If there are multiple visa types, use this field to provide a detailed breakdown for each one. Use clear headings for each visa type (e.g., "**Tourist Visa (Subclass 600)**").
* **`SLA`**: Clearly list the processing time for each visa type mentioned. For example: "Tourist Visa: 15-20 business days; Business Visa: 10-15 business days."
* **`price`**: Clearly list the fees for each visa type mentioned. For example: "Tourist Visa: $150; Business Visa: $180."
* **`required_docs`**: This is a critical field. The information must be concise, clear, and follow the exact format below. Synthesize conditions and exceptions into the bullet points.
* Do not just copy-paste text. Summarize requirements intelligently (e.g., specify if documents need translation, if physical copies are needed, or note exceptions for minors).
* Use this strict format:
```text
**[Visa Type Name 1]**
Required:
- Passport with at least 6 months validity
- Physical bank statement from the last 3 months (must be translated if not in English)
- Signed consent letter from both parents (for applicants under 18 traveling alone)
Optional:
- Hotel booking confirmation
- Travel insurance
**[Visa Type Name 2]**
Required:
- [Document 1 with specific conditions]
- [Document 2 with specific conditions]
Optional:
- [Optional Document 1]
```
* **`status`**: This is crucial. If unable to load the complete info from the page, including JS enabled or timeout issue, set to `'FAILED'`.
Set to `'RELEVANT'` if the page contains any visa-related information.
If the page is loaded completely and unrelated to visas (e.g., a privacy policy, a different product), set this to `'IRRELEVANT'`. If `'IRRELEVANT'`, you can leave other fields empty.
**Analyze the following web page content and generate the structured data:**
{page_text}"""
try:
print(f" Sending {len(page_text)} chars to GenAI for analysis...")
# This is where you call your actual AI model client
llm_result = await self.model.formated_prompt(
prompt=llm_prompt,
response_schema=LinkOverview
)
if llm_result and llm_result.get("parsed"):
print(" GenAI analysis successful.")
# Pydantic automatically validates and converts the string 'RELEVANT' to Status.RELEVANT
overview = LinkOverview.model_validate(llm_result["parsed"])
else:
print(" Warning: GenAI call succeeded but returned no structured data.")
overview = LinkOverview(
summary="Content analysis failed: The AI model returned an empty or unparsable response.",
status=Status.FAILED
)
except Exception as e:
print(f"Error in GenAI structured extraction: {e}")
overview = LinkOverview(
summary=f"Content analysis failed with an error: {e}",
status=Status.FAILED
)
return overview