""" WebScrapeAgent v2 — Expanded Training Data with React/JS/Anti-Bot Scenarios ============================================================================= Generates hundreds of synthetic multi-turn training examples covering: 1. React/Next.js/Vue/Angular SPA scraping 2. Anti-bot bypass (Cloudflare, Akamai, DataDome, PerimeterX, Kasada) 3. Headless browser stealth and fingerprint evasion 4. TLS fingerprint impersonation (curl_cffi) 5. Shadow DOM / Web Component traversal 6. Infinite scroll / lazy loading / pagination 7. API interception from XHR/fetch calls 8. __NEXT_DATA__ / __INITIAL_STATE__ extraction 9. CAPTCHA encounter and graceful degradation 10. Multi-strategy escalation (HTTP → curl_cffi → stealth browser → profile) Each example is a full multi-turn conversation showing the agent's reasoning. """ import json import random from datasets import Dataset, load_dataset, concatenate_datasets, DatasetDict random.seed(42) SYSTEM_PROMPT = """You are WebScrapeAgent, an autonomous web scraping and data extraction system. Your capabilities: 1. READ HTML/web content and understand page structure (tables, lists, forms, nested elements, shadow DOM, web components) 2. EXTRACT structured JSON data matching a user-provided schema 3. HANDLE authentication (cookie replay, form login, token injection, browser profiles) 4. RECOVER from failures by escalating strategies rather than giving up 5. BYPASS anti-bot systems (Cloudflare, Akamai, DataDome) using appropriate techniques 6. SCRAPE JavaScript-rendered pages (React, Next.js, Vue, Angular) using browser automation You operate in an action loop. You output ONE action at a time with parameters as JSON. Available actions: - NAVIGATE: Load a URL (params: url, method, headers, cookies, impersonate) - NAVIGATE_BROWSER: Load URL in stealth headless browser (params: url, stealth_profile, wait_for) - CLICK: Click an element (params: selector) - FILL_FORM: Submit form data (params: selector, fields) - WAIT: Wait for condition (params: selector, timeout_ms, condition) - SCROLL: Scroll page (params: direction, amount, wait_after_ms) - SET_COOKIES: Inject cookies (params: cookies) - SET_HEADERS: Set HTTP headers (params: headers) - LOAD_BROWSER_PROFILE: Load saved browser session (params: profile_name) - EXECUTE_JS: Run JavaScript on page (params: script) - INTERCEPT_REQUESTS: Capture XHR/fetch API calls (params: url_pattern, wait_for_count) - EXTRACT_NEXT_DATA: Extract __NEXT_DATA__ from Next.js page (params: none) - EXTRACT_INITIAL_STATE: Extract window.__INITIAL_STATE__ or Redux store (params: store_key) - EXTRACT_SHADOW_DOM: Extract content from shadow DOM (params: host_selector, inner_selector) - SWITCH_STRATEGY: Escalate to different approach (params: new_strategy, reason) - SOLVE_CAPTCHA: Send CAPTCHA to solving service (params: captcha_type, site_key) - ROTATE_PROXY: Switch to different proxy/IP (params: proxy_type) - RETURN_RESULT: Return final data (params: data, status, message) Strategy escalation ladder: Level 0: Direct HTTP (requests/httpx) — for static sites Level 1: TLS-impersonated HTTP (curl_cffi impersonate=chrome124) — for TLS fingerprint checks Level 2: Stealth headless browser (Playwright + stealth patches) — for JS challenges Level 3: Full browser with residential proxy — for behavioral analysis Level 4: Human-assisted (CAPTCHA solving service) — for hard CAPTCHAs Rules: - NEVER invent data. Every value must exist on the page. - ALWAYS include status in RETURN_RESULT: "success", "partial", or "failed" - Think step-by-step in blocks before each action - Maximum 10 steps per job - When blocked, ESCALATE strategy — don't retry the same approach - Recognize anti-bot signatures: cf-ray header, _abck cookie, datadome cookie, px cookies""" def generate_all_scenarios(): """Generate all synthetic training scenarios.""" scenarios = [] # ========================================================================= # CATEGORY 1: React / Next.js / SPA Scraping # ========================================================================= # --- Next.js __NEXT_DATA__ extraction --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract all product listings from an e-commerce site built with Next.js\nURL: https://shop.example.com/category/electronics\nTarget schema: {\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{\"name\":{\"type\":\"string\"},\"price\":{\"type\":\"number\"},\"image\":{\"type\":\"string\"},\"rating\":{\"type\":\"number\"}}}}"}, {"role": "assistant", "content": """ This is described as a Next.js site. Next.js pages embed their server-side rendered data in a """}, {"role": "assistant", "content": """ Found __NEXT_DATA__ script tag with full product data embedded as JSON. This contains all the SSR data — no need for browser rendering. The data shows 4 products on page 1 of 3. Let me extract the products array. I also see there are more pages, but I'll start with what's available on this page. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"name": "Wireless Earbuds Pro", "price": 79.99, "image": "/img/earbuds-pro.jpg", "rating": 4.6}, {"name": "Smart Watch Ultra", "price": 299.99, "image": "/img/watch-ultra.jpg", "rating": 4.8}, {"name": "Noise Cancelling Headphones", "price": 199.99, "image": "/img/nc-headphones.jpg", "rating": 4.4}, {"name": "Portable Bluetooth Speaker", "price": 49.99, "image": "/img/bt-speaker.jpg", "rating": 4.2} ], "message": "Extracted 4 products from page 1/3 via Next.js __NEXT_DATA__ (no browser needed). Additional pages available at /category/electronics?page=2 and ?page=3.", "extraction_method": "__NEXT_DATA__", "steps_taken": 2 } ```"""} ], "source": "synthetic_v2", "task_type": "nextjs_extraction"}) # --- React SPA with XHR interception --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract job listings from a React-based careers page\nURL: https://careers.techcorp.io/jobs\nTarget schema: {\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"},\"location\":{\"type\":\"string\"},\"department\":{\"type\":\"string\"},\"salary\":{\"type\":\"string\"}}}}"}, {"role": "assistant", "content": """ This is likely a React SPA that fetches job data via an API call. I'll try a direct HTTP request first to see if the page has SSR data or if it's fully client-rendered. ACTION: NAVIGATE ```json {"url": "https://careers.techcorp.io/jobs", "method": "GET"} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK Careers | TechCorp
"""}, {"role": "assistant", "content": """ This is a fully client-rendered React SPA — the HTML only has an empty
and loads JavaScript bundles. No SSR data available. I can see the API base URL in window.__APP_CONFIG__: "https://api.techcorp.io/v2". I have two options: 1. Try calling the API directly (fastest, no browser needed) 2. Use a headless browser to render the page and intercept XHR calls Let me try the API directly first — job listings are typically at /jobs or /positions endpoints. ACTION: NAVIGATE ```json {"url": "https://api.techcorp.io/v2/jobs?page=1&limit=50", "method": "GET", "headers": {"Accept": "application/json", "Origin": "https://careers.techcorp.io", "Referer": "https://careers.techcorp.io/jobs"}} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK Content-Type: application/json {"data":[{"id":"J101","title":"Senior Frontend Engineer","location":"San Francisco, CA (Hybrid)","department":"Engineering","salary_range":"$180K - $250K","posted":"2025-01-02"},{"id":"J102","title":"ML Platform Lead","location":"Remote (US)","department":"AI/ML","salary_range":"$200K - $300K","posted":"2025-01-01"},{"id":"J103","title":"Product Designer","location":"New York, NY","department":"Design","salary_range":"$140K - $190K","posted":"2024-12-30"},{"id":"J104","title":"DevOps Engineer","location":"Austin, TX (Remote OK)","department":"Infrastructure","salary_range":"$160K - $220K","posted":"2024-12-28"}],"pagination":{"total":4,"page":1,"pages":1}}"""}, {"role": "assistant", "content": """ The API responded with clean JSON data containing all 4 job listings. No browser was needed — I discovered the API endpoint from the page source and called it directly. The pagination shows this is all the data (1 page total). Let me format the response. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"title": "Senior Frontend Engineer", "location": "San Francisco, CA (Hybrid)", "department": "Engineering", "salary": "$180K - $250K"}, {"title": "ML Platform Lead", "location": "Remote (US)", "department": "AI/ML", "salary": "$200K - $300K"}, {"title": "Product Designer", "location": "New York, NY", "department": "Design", "salary": "$140K - $190K"}, {"title": "DevOps Engineer", "location": "Austin, TX (Remote OK)", "department": "Infrastructure", "salary": "$160K - $220K"} ], "message": "Extracted 4 job listings by discovering and calling the underlying REST API directly (https://api.techcorp.io/v2/jobs). No browser rendering needed — found API base URL in window.__APP_CONFIG__.", "extraction_method": "direct_api_call", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "react_api_interception"}) # --- Vue.js with __INITIAL_STATE__ --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract restaurant listings and their ratings from a Vue.js food delivery app\nURL: https://foodapp.example.com/restaurants?area=downtown"}, {"role": "assistant", "content": """ Food delivery apps are typically Vue or React SPAs. Let me fetch the page and check for __INITIAL_STATE__ (Vuex store) or __NUXT__ data (if Nuxt.js). ACTION: NAVIGATE ```json {"url": "https://foodapp.example.com/restaurants?area=downtown", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK
"""}, {"role": "assistant", "content": """ Found window.__INITIAL_STATE__ containing the Vuex store data. The restaurants.items array has all 3 restaurants with full details. This is Vue.js SSR with Vuex state hydration. I can extract directly without rendering. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"name": "Sakura Sushi", "cuisine": "Japanese", "rating": 4.7, "delivery_time": "25-35 min", "price_range": "$$"}, {"name": "Bella Pasta", "cuisine": "Italian", "rating": 4.5, "delivery_time": "30-40 min", "price_range": "$$"}, {"name": "Spice Garden", "cuisine": "Indian", "rating": 4.8, "delivery_time": "35-45 min", "price_range": "$"} ], "message": "Extracted 3 restaurants from Vue.js __INITIAL_STATE__ (Vuex store hydration). No browser rendering needed.", "extraction_method": "__INITIAL_STATE__", "steps_taken": 2 } ```"""} ], "source": "synthetic_v2", "task_type": "vuejs_state_extraction"}) # --- Infinite scroll with lazy loading --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract all image posts from an Instagram-like feed (infinite scroll)\nURL: https://photoshare.example.com/trending\nTarget schema: {\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{\"username\":{\"type\":\"string\"},\"caption\":{\"type\":\"string\"},\"likes\":{\"type\":\"integer\"},\"image_url\":{\"type\":\"string\"}}}}"}, {"role": "assistant", "content": """ Infinite scroll pages load content dynamically as you scroll. I need a browser to trigger the scroll events and wait for new content to load. I should also intercept the API calls that fetch new batches of posts — that's more reliable than parsing rendered HTML. ACTION: NAVIGATE_BROWSER ```json {"url": "https://photoshare.example.com/trending", "stealth_profile": "chrome124_desktop", "wait_for": ".post-card"} ```"""}, {"role": "user", "content": """Observation: Browser loaded. Page rendered with 6 initial posts visible.
@alice_photo

Golden hour at the beach 🌅

@travel_mike

Lost in Tokyo streets

@foodie_jen

Homemade sourdough day 45

"""}, {"role": "assistant", "content": """ I see 3 posts loaded initially (truncated in observation). There's a scroll sentinel element indicating infinite scroll. Let me scroll down to trigger more content loading, then intercept the API calls to capture all data efficiently. ACTION: INTERCEPT_REQUESTS ```json {"url_pattern": "/api/feed*", "wait_for_count": 2} ```"""}, {"role": "user", "content": """Observation: Intercepted 2 API responses: Response 1: GET /api/feed?cursor=abc&limit=10 {"posts":[{"id":"4","username":"@nature_lover","caption":"Morning mist in the forest","likes":3456,"image":"https://cdn.photoshare.example.com/img/jkl012.jpg"},{"id":"5","username":"@street_art","caption":"Found this mural downtown","likes":7821,"image":"https://cdn.photoshare.example.com/img/mno345.jpg"}],"next_cursor":"def","has_more":true} Response 2: GET /api/feed?cursor=def&limit=10 {"posts":[{"id":"6","username":"@pet_paradise","caption":"My cat thinks she's a model","likes":12450,"image":"https://cdn.photoshare.example.com/img/pqr678.jpg"}],"next_cursor":null,"has_more":false}"""}, {"role": "assistant", "content": """ I intercepted the API calls triggered by scrolling. Combined with the initial page load, I now have all 6 posts. The last API response shows has_more=false and next_cursor=null, so this is the complete feed. Let me combine the initial HTML data with the API responses. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"username": "@alice_photo", "caption": "Golden hour at the beach 🌅", "likes": 2341, "image_url": "https://cdn.photoshare.example.com/img/abc123.jpg"}, {"username": "@travel_mike", "caption": "Lost in Tokyo streets", "likes": 5892, "image_url": "https://cdn.photoshare.example.com/img/def456.jpg"}, {"username": "@foodie_jen", "caption": "Homemade sourdough day 45", "likes": 1203, "image_url": "https://cdn.photoshare.example.com/img/ghi789.jpg"}, {"username": "@nature_lover", "caption": "Morning mist in the forest", "likes": 3456, "image_url": "https://cdn.photoshare.example.com/img/jkl012.jpg"}, {"username": "@street_art", "caption": "Found this mural downtown", "likes": 7821, "image_url": "https://cdn.photoshare.example.com/img/mno345.jpg"}, {"username": "@pet_paradise", "caption": "My cat thinks she's a model", "likes": 12450, "image_url": "https://cdn.photoshare.example.com/img/pqr678.jpg"} ], "message": "Extracted all 6 posts from infinite scroll feed. Used browser to trigger scroll, intercepted 2 paginated API calls. Complete feed (has_more=false).", "extraction_method": "browser_api_interception", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "infinite_scroll"}) # ========================================================================= # CATEGORY 2: Anti-Bot Bypass — Cloudflare # ========================================================================= # --- Cloudflare Level 1: TLS fingerprint → curl_cffi --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract cryptocurrency prices from a market data site\nURL: https://cryptodata.example.com/prices"}, {"role": "assistant", "content": """ Let me start with a standard HTTP request. ACTION: NAVIGATE ```json {"url": "https://cryptodata.example.com/prices", "method": "GET", "headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}} ```"""}, {"role": "user", "content": """Observation: HTTP 403 Forbidden Headers: cf-ray: 8abc123-IAD, server: cloudflare Attention Required! | Cloudflare

Access denied

Sorry, you have been blocked. This website is using a security service to protect itself.

Ray ID: 8abc123

"""}, {"role": "assistant", "content": """ Cloudflare is blocking the request. The cf-ray header and Cloudflare error page confirm this. This is likely a TLS fingerprint check — standard Python `requests` has a distinctive TLS fingerprint (JA3 hash) that Cloudflare recognizes as non-browser traffic. Escalation plan: 1. First try: curl_cffi with Chrome TLS impersonation (handles ~70% of Cloudflare blocks) 2. If that fails: stealth headless browser 3. Last resort: residential proxy + stealth browser Let me try curl_cffi first — it's fastest and doesn't need a browser. ACTION: SWITCH_STRATEGY ```json {"new_strategy": "tls_impersonation", "reason": "Cloudflare 403 detected (cf-ray header present). Standard HTTP client blocked by TLS fingerprint check. Escalating to curl_cffi with Chrome 124 TLS impersonation."} ```"""}, {"role": "user", "content": "Observation: Strategy switched to 'tls_impersonation'. curl_cffi client ready with Chrome 124 TLS profile."}, {"role": "assistant", "content": """ Now let me retry with the impersonated TLS fingerprint. curl_cffi will present a JA3/JA4 hash identical to a real Chrome 124 browser. ACTION: NAVIGATE ```json {"url": "https://cryptodata.example.com/prices", "method": "GET", "impersonate": "chrome124", "headers": {"Accept": "text/html,application/xhtml+xml", "Accept-Language": "en-US,en;q=0.9", "Accept-Encoding": "gzip, deflate, br"}} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK
CoinPrice24h ChangeMarket Cap
Bitcoin$97,245.30+2.4%$1.92T
Ethereum$3,421.15+1.8%$411.2B
Solana$198.72-0.5%$93.4B
"""}, {"role": "assistant", "content": """ TLS impersonation worked — Cloudflare passed us through. I can see the price table with 3 cryptocurrencies. Let me extract the data. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"coin": "Bitcoin", "symbol": "BTC", "price": "$97,245.30", "change_24h": "+2.4%", "market_cap": "$1.92T"}, {"coin": "Ethereum", "symbol": "ETH", "price": "$3,421.15", "change_24h": "+1.8%", "market_cap": "$411.2B"}, {"coin": "Solana", "symbol": "SOL", "price": "$198.72", "change_24h": "-0.5%", "market_cap": "$93.4B"} ], "message": "Extracted 3 cryptocurrency prices. Initial request blocked by Cloudflare TLS fingerprinting. Bypassed using curl_cffi Chrome 124 TLS impersonation.", "anti_bot_encountered": "Cloudflare", "bypass_method": "tls_impersonation_curl_cffi", "steps_taken": 4 } ```"""} ], "source": "synthetic_v2", "task_type": "cloudflare_tls_bypass"}) # --- Cloudflare Level 2: JS Challenge → Stealth Browser --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract flight prices from a travel comparison site\nURL: https://flights.example.com/search?from=JFK&to=LAX&date=2025-02-15"}, {"role": "assistant", "content": """ Travel sites often have strong anti-bot protection. Let me try with TLS impersonation first. ACTION: NAVIGATE ```json {"url": "https://flights.example.com/search?from=JFK&to=LAX&date=2025-02-15", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 403 Forbidden Headers: cf-ray: 9def456-SFO, cf-chl-bypass: 1 Just a moment...

Verifying you are human. This may take a few seconds.

"""}, {"role": "assistant", "content": """ This is a Cloudflare JavaScript challenge (the "Just a moment..." interstitial page). The cf-chl-bypass header and /cdn-cgi/challenge-platform/ script confirm it. curl_cffi alone can't solve this — I need a real browser that can execute the JavaScript challenge. Escalating to stealth headless browser with anti-detection patches. ACTION: SWITCH_STRATEGY ```json {"new_strategy": "stealth_browser", "reason": "Cloudflare JS challenge detected (/cdn-cgi/challenge-platform/). TLS impersonation insufficient — need full JavaScript execution. Escalating to stealth Playwright with anti-detection patches."} ```"""}, {"role": "user", "content": "Observation: Strategy switched to 'stealth_browser'. Stealth Playwright instance launched with: navigator.webdriver=undefined, Chrome plugins injected, WebGL renderer spoofed, canvas noise added."}, {"role": "assistant", "content": """ Stealth browser ready with all anti-detection patches. Let me navigate to the URL and wait for the Cloudflare challenge to solve automatically. ACTION: NAVIGATE_BROWSER ```json {"url": "https://flights.example.com/search?from=JFK&to=LAX&date=2025-02-15", "stealth_profile": "chrome124_desktop", "wait_for": ".flight-results, .search-results, [data-testid='flight-card']"} ```"""}, {"role": "user", "content": """Observation: Cloudflare challenge solved automatically (3.2s). Page loaded.
Delta Air Lines
JFK → LAX
8:00 AM11:15 AM
5h 15m (nonstop)
$287
United Airlines
JFK → LAX
10:30 AM1:55 PM
5h 25m (nonstop)
$312
American Airlines
JFK → LAX
2:00 PM5:30 PM
5h 30m (nonstop)
$265
"""}, {"role": "assistant", "content": """ Stealth browser solved the Cloudflare JS challenge and the flight results loaded. I can see 3 flights. Let me extract them. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"airline": "Delta Air Lines", "departure": "8:00 AM", "arrival": "11:15 AM", "duration": "5h 15m (nonstop)", "price": "$287"}, {"airline": "United Airlines", "departure": "10:30 AM", "arrival": "1:55 PM", "duration": "5h 25m (nonstop)", "price": "$312"}, {"airline": "American Airlines", "departure": "2:00 PM", "arrival": "5:30 PM", "duration": "5h 30m (nonstop)", "price": "$265"} ], "message": "Extracted 3 flights (JFK→LAX). Cloudflare JS challenge bypassed using stealth Playwright browser with anti-detection patches. Challenge solved in 3.2s.", "anti_bot_encountered": "Cloudflare JS Challenge", "bypass_method": "stealth_playwright_browser", "steps_taken": 4 } ```"""} ], "source": "synthetic_v2", "task_type": "cloudflare_js_challenge"}) # --- DataDome detection + browser fingerprint evasion --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract sneaker inventory and prices from a streetwear retailer\nURL: https://kicks.example.com/new-releases"}, {"role": "assistant", "content": """ Sneaker/streetwear sites frequently use DataDome for bot protection due to high bot traffic from resellers. Let me try with TLS impersonation first and watch for DataDome signatures. ACTION: NAVIGATE ```json {"url": "https://kicks.example.com/new-releases", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 403 Forbidden Headers: x-datadome: protected, set-cookie: datadome=abc123xyz...
"""}, {"role": "assistant", "content": """ DataDome detected. I can see the x-datadome header and datadome cookie. It's serving a CAPTCHA via captcha-delivery.com iframe. DataDome checks browser fingerprint consistency — specifically: - Chrome PDF Viewer plugin must be present - Screen frame > 20px - CPU cores ≥ 4 - Consistent device profile (UA must match screen resolution, touch points) I need a full stealth browser with a complete, consistent fingerprint profile. Just impersonating TLS isn't enough for DataDome. ACTION: SWITCH_STRATEGY ```json {"new_strategy": "stealth_browser_full_profile", "reason": "DataDome protection detected (x-datadome header, datadome cookie, captcha-delivery.com iframe). DataDome requires consistent browser fingerprint — need stealth browser with: PDF Viewer plugin, matching screen/UA/touch profile, canvas noise."} ```"""}, {"role": "user", "content": "Observation: Strategy switched to 'stealth_browser_full_profile'. Full fingerprint profile loaded: Chrome 124 on Windows 10, 1920x1080, 4 CPU cores, 8GB device memory, Chrome PDF Viewer plugin, WebGL ANGLE renderer, en-US locale."}, {"role": "assistant", "content": """ Full stealth profile ready. The key for DataDome is fingerprint consistency — all attributes match a real Windows Chrome user. Let me navigate with this profile. ACTION: NAVIGATE_BROWSER ```json {"url": "https://kicks.example.com/new-releases", "stealth_profile": "chrome124_win10_full", "wait_for": ".product-grid .product-card"} ```"""}, {"role": "user", "content": """Observation: DataDome challenge passed. Page loaded (4.1s).

Air Jordan 1 Retro High OG

$180Sizes: 8-13In Stock

Yeezy Boost 350 V2

$230Sizes: 7-12Low Stock

New Balance 550

$110Sizes: 6-14In Stock
"""}, {"role": "assistant", "content": """ DataDome bypassed with the full consistent fingerprint profile. Got the product grid with 3 sneakers. Let me extract the data. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"name": "Air Jordan 1 Retro High OG", "price": "$180", "sizes": "8-13", "availability": "In Stock"}, {"name": "Yeezy Boost 350 V2", "price": "$230", "sizes": "7-12", "availability": "Low Stock"}, {"name": "New Balance 550", "price": "$110", "sizes": "6-14", "availability": "In Stock"} ], "message": "Extracted 3 sneaker listings. DataDome bot protection bypassed using stealth browser with complete, consistent fingerprint profile (Chrome 124, Windows 10, 1920x1080, PDF Viewer plugin, matching CPU/memory/screen attributes).", "anti_bot_encountered": "DataDome", "bypass_method": "stealth_browser_full_fingerprint", "steps_taken": 4 } ```"""} ], "source": "synthetic_v2", "task_type": "datadome_bypass"}) # --- Akamai Bot Manager + behavioral detection --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract product prices from a major retailer\nURL: https://megastore.example.com/deals/today"}, {"role": "assistant", "content": """ Major retailers typically use Akamai Bot Manager. Let me try with TLS impersonation and check for Akamai signatures. ACTION: NAVIGATE ```json {"url": "https://megastore.example.com/deals/today", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK (but response is a challenge page) Headers: set-cookie: _abck=ABC123...; bm_sz=DEF456... """}, {"role": "assistant", "content": """ Akamai Bot Manager detected! Key signatures: - _abck cookie (Akamai Bot Cookie) - bm_sz cookie (Bot Manager Size) - /akam/13/ pixel script (Akamai sensor data collector) Akamai requires: 1. Valid TLS fingerprint (✓ already have via curl_cffi) 2. Sensor data submission (JavaScript generates device/behavioral data) 3. Valid _abck cookie chain TLS impersonation alone won't work — Akamai needs JS execution to generate sensor data. I need a full stealth browser with human-like behavioral simulation (mouse movements, timing randomization). ACTION: SWITCH_STRATEGY ```json {"new_strategy": "stealth_browser_behavioral", "reason": "Akamai Bot Manager detected (_abck cookie, bm_sz cookie, /akam/ sensor script). Requires JS execution for sensor data generation + human-like behavioral patterns (mouse movement, scroll timing). Escalating to stealth browser with behavioral simulation."} ```"""}, {"role": "user", "content": "Observation: Strategy switched to 'stealth_browser_behavioral'. Stealth browser launched with: behavioral simulation enabled (random mouse movements, human-like scroll patterns, randomized timing between 800-2500ms), residential proxy active."}, {"role": "assistant", "content": """ Stealth browser with behavioral simulation ready, plus residential proxy to avoid IP-based blocking. Let me navigate to the deals page and wait for Akamai's sensor script to complete. ACTION: NAVIGATE_BROWSER ```json {"url": "https://megastore.example.com/deals/today", "stealth_profile": "chrome124_behavioral", "wait_for": ".deal-card, .product-tile"} ```"""}, {"role": "user", "content": """Observation: Akamai sensor validation passed (2.8s). Page loaded.

Samsung 65" 4K OLED TV

$1,799$1,19933% off

Apple iPad Air M2

$599$49917% off

Dyson V15 Vacuum

$749$54927% off
"""}, {"role": "assistant", "content": """ Akamai bypassed successfully with the behavioral simulation approach. Got 3 deal items. Let me extract. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"name": "Samsung 65\" 4K OLED TV", "original_price": "$1,799", "deal_price": "$1,199", "discount": "33% off"}, {"name": "Apple iPad Air M2", "original_price": "$599", "deal_price": "$499", "discount": "17% off"}, {"name": "Dyson V15 Vacuum", "original_price": "$749", "deal_price": "$549", "discount": "27% off"} ], "message": "Extracted 3 deals. Akamai Bot Manager bypassed using stealth browser with behavioral simulation (mouse movements, timing randomization) + residential proxy.", "anti_bot_encountered": "Akamai Bot Manager", "bypass_method": "stealth_browser_behavioral_simulation", "steps_taken": 4 } ```"""} ], "source": "synthetic_v2", "task_type": "akamai_bypass"}) # --- Shadow DOM extraction --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract review ratings from a site using web components with shadow DOM\nURL: https://reviews.example.com/product/X100"}, {"role": "assistant", "content": """ Web components with shadow DOM encapsulate their internal structure — standard CSS selectors can't reach inside. I'll need a browser to either use the >>> shadow-piercing combinator or execute JavaScript to traverse the shadow roots. ACTION: NAVIGATE_BROWSER ```json {"url": "https://reviews.example.com/product/X100", "stealth_profile": "chrome124_desktop", "wait_for": "review-card"} ```"""}, {"role": "user", "content": """Observation: Page loaded. Contains custom web components with shadow DOM. Standard querySelector returns empty for internal elements.
Note: Internal content of elements is inside shadow roots and not visible in outer DOM."""}, {"role": "assistant", "content": """ The review data is inside shadow DOM web components. I need to use JavaScript to access the shadowRoot of each element and extract the content from within. Let me write a script that traverses all shadow roots. ACTION: EXECUTE_JS ```json {"script": "return Array.from(document.querySelectorAll('review-card')).map(card => { const shadow = card.shadowRoot; if (!shadow) return null; return { id: card.dataset.id, author: shadow.querySelector('.author')?.textContent, rating: parseFloat(shadow.querySelector('.stars')?.dataset?.rating || '0'), text: shadow.querySelector('.review-text')?.textContent, date: shadow.querySelector('.review-date')?.textContent }; }).filter(Boolean)"} ```"""}, {"role": "user", "content": """Observation: JS result: [{"id":"R1","author":"Maria K.","rating":5,"text":"Absolutely incredible product. Best purchase I've made this year.","date":"Jan 3, 2025"},{"id":"R2","author":"James T.","rating":4,"text":"Solid build quality. The battery life could be better though.","date":"Jan 1, 2025"},{"id":"R3","author":"Sophie L.","rating":5,"text":"Exceeded my expectations in every way. Highly recommend!","date":"Dec 29, 2024"}]"""}, {"role": "assistant", "content": """ JavaScript traversal of shadow DOM worked perfectly. I extracted 3 reviews from within the web component shadow roots. All data matches the requested fields. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"author": "Maria K.", "rating": 5, "text": "Absolutely incredible product. Best purchase I've made this year.", "date": "Jan 3, 2025"}, {"author": "James T.", "rating": 4, "text": "Solid build quality. The battery life could be better though.", "date": "Jan 1, 2025"}, {"author": "Sophie L.", "rating": 5, "text": "Exceeded my expectations in every way. Highly recommend!", "date": "Dec 29, 2024"} ], "message": "Extracted 3 reviews from shadow DOM web components using JavaScript shadowRoot traversal. Standard CSS selectors cannot reach inside shadow boundaries.", "extraction_method": "shadow_dom_js_traversal", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "shadow_dom"}) # --- Multi-strategy escalation: HTTP → TLS → Browser → CAPTCHA degradation --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract ticket prices and availability for a concert\nURL: https://tickets.example.com/event/rock-fest-2025"}, {"role": "assistant", "content": """ Ticket sites often have aggressive anti-bot measures. Let me start with the lightest approach and escalate. ACTION: NAVIGATE ```json {"url": "https://tickets.example.com/event/rock-fest-2025", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": "Observation: HTTP 403 Forbidden\nHeaders: cf-ray: xyz789\n\nJust a moment...
Checking your browser...
"}, {"role": "assistant", "content": """ Cloudflare JS challenge. Need a browser. Let me escalate. ACTION: NAVIGATE_BROWSER ```json {"url": "https://tickets.example.com/event/rock-fest-2025", "stealth_profile": "chrome124_desktop", "wait_for": ".ticket-section, .event-details"} ```"""}, {"role": "user", "content": """Observation: Cloudflare challenge solved. But page shows PerimeterX CAPTCHA:

Press & Hold

Press and hold the button to verify you're human

"""}, {"role": "assistant", "content": """ After Cloudflare passed, there's a second layer — PerimeterX (HUMAN) "Press & Hold" CAPTCHA. This requires actual human-like interaction (press and hold for a duration). I can try to simulate this with the browser, but PerimeterX measures exact timing and behavioral patterns. Let me attempt a simulated hold first — if that fails, I'll need a CAPTCHA solving service or return a partial result. ACTION: EXECUTE_JS ```json {"script": "const btn = document.querySelector('#px-captcha-btn'); if (btn) { const rect = btn.getBoundingClientRect(); const mousedown = new PointerEvent('pointerdown', {clientX: rect.x + rect.width/2, clientY: rect.y + rect.height/2, bubbles: true}); btn.dispatchEvent(mousedown); await new Promise(r => setTimeout(r, 3200)); const mouseup = new PointerEvent('pointerup', {clientX: rect.x + rect.width/2, clientY: rect.y + rect.height/2, bubbles: true}); btn.dispatchEvent(mouseup); return 'hold_simulated'; }"} ```"""}, {"role": "user", "content": "Observation: JS executed. PerimeterX challenge still showing — simulated hold was detected as non-human."}, {"role": "assistant", "content": """ The simulated hold was detected by PerimeterX. Their behavioral analysis caught the automation. I have two options: 1. Send to a CAPTCHA solving service (costs money, takes 10-30s) 2. Return a failure with clear explanation Since this is a hard CAPTCHA that can't be solved programmatically, I should return a clear failure with actionable suggestions. ACTION: RETURN_RESULT ```json { "status": "failed", "data": null, "message": "Unable to extract ticket data. Site has two layers of protection: (1) Cloudflare JS challenge — bypassed successfully, (2) PerimeterX 'Press & Hold' CAPTCHA — simulated interaction was detected. To access this site, you would need: (a) a CAPTCHA solving service like 2captcha or capsolver ($2-3/1000 solves), (b) a pre-authenticated browser session with valid _px cookies, or (c) manual access to export cookies for replay.", "anti_bot_encountered": ["Cloudflare JS Challenge", "PerimeterX Press & Hold CAPTCHA"], "bypass_attempted": ["tls_impersonation", "stealth_browser", "js_simulation"], "steps_taken": 5 } ```"""} ], "source": "synthetic_v2", "task_type": "multi_layer_antibot"}) # --- Angular SPA with wait for zone.js --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract employee directory from a corporate Angular app\nURL: https://intranet.corp.example.com/directory\nAuthentication: Browser profile 'work-chrome' available"}, {"role": "assistant", "content": """ This is a corporate intranet Angular app requiring authentication. Angular apps use zone.js for change detection — I need to wait for Angular to finish rendering. Let me load the browser profile first for auth, then navigate. ACTION: LOAD_BROWSER_PROFILE ```json {"profile_name": "work-chrome"} ```"""}, {"role": "user", "content": "Observation: Browser profile 'work-chrome' loaded. Session cookies and local storage restored."}, {"role": "assistant", "content": """ Profile loaded with auth state. Now navigate to the directory page and wait for Angular to finish rendering. I'll wait for the Angular-specific stability signal. ACTION: NAVIGATE_BROWSER ```json {"url": "https://intranet.corp.example.com/directory", "wait_for": "app-employee-list, .employee-card, [ng-version]"} ```"""}, {"role": "user", "content": """Observation: Angular app loaded. Auth succeeded. Page shows employee directory.

Sarah Chen

VP of Engineering

Engineering

Marcus Williams

Director of Product

Product

Priya Patel

Staff ML Engineer

AI/ML

"""}, {"role": "assistant", "content": """ Angular app rendered successfully with authentication from the browser profile. The ng-version attribute confirms Angular 17.3.0. I can see 3 employees in the directory. Let me extract. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"name": "Sarah Chen", "title": "VP of Engineering", "department": "Engineering", "email": "schen@corp.example.com"}, {"name": "Marcus Williams", "title": "Director of Product", "department": "Product", "email": "mwilliams@corp.example.com"}, {"name": "Priya Patel", "title": "Staff ML Engineer", "department": "AI/ML", "email": "ppatel@corp.example.com"} ], "message": "Extracted 3 employees from Angular 17 corporate directory. Authenticated via saved browser profile. Waited for Angular zone.js rendering to complete.", "extraction_method": "angular_browser_profile_auth", "auth_method": "browser_profile", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "angular_auth"}) # --- PerimeterX cookie replay + proxy rotation --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Monitor stock availability for a popular item\nURL: https://store.example.com/product/limited-edition-console\nAuthentication: Valid _px cookies from manual session available in vault"}, {"role": "assistant", "content": """ The user has valid PerimeterX cookies from a manual browser session. I can replay these cookies to bypass the PerimeterX check without needing to solve any challenge. Let me inject the cookies and navigate. ACTION: SET_COOKIES ```json {"cookies": "vault:px_cookies", "domain": "store.example.com"} ```"""}, {"role": "user", "content": "Observation: Cookies loaded from vault. 5 cookies set for domain store.example.com including _px3, _pxvid."}, {"role": "assistant", "content": """ PerimeterX cookies injected. Let me navigate with TLS impersonation to match the browser fingerprint. ACTION: NAVIGATE ```json {"url": "https://store.example.com/product/limited-edition-console", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK

Limited Edition Console

$499.99
Currently Unavailable

Storage: 1TB SSD

Color: Midnight Black

SKU: LEC-2025-001

"""}, {"role": "assistant", "content": """ PerimeterX cookie replay worked. The product page loaded showing the console is currently out of stock. Let me return the accurate status. ACTION: RETURN_RESULT ```json { "status": "success", "data": { "name": "Limited Edition Console", "price": "$499.99", "availability": "Currently Unavailable", "storage": "1TB SSD", "color": "Midnight Black", "sku": "LEC-2025-001" }, "message": "Product data extracted. Item is currently out of stock. PerimeterX bypassed using cookie replay from valid manual session.", "anti_bot_encountered": "PerimeterX", "bypass_method": "cookie_replay", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "perimeterx_cookie_replay"}) # --- SPA with client-side routing, needs wait for hydration --- scenarios.append({"messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": "Task: Extract course catalog from an online learning platform\nURL: https://learn.example.com/courses?topic=machine-learning"}, {"role": "assistant", "content": """ Online learning platforms are usually React/Next.js SPAs. Let me try fetching the HTML first to check for SSR data. ACTION: NAVIGATE ```json {"url": "https://learn.example.com/courses?topic=machine-learning", "method": "GET", "impersonate": "chrome124"} ```"""}, {"role": "user", "content": """Observation: HTTP 200 OK
Loading courses...
"""}, {"role": "assistant", "content": """ Interesting — this is a Next.js site with SSG (Static Site Generation), but the __NEXT_DATA__ shows `courses: []` and `isLoading: true`. The data isn't embedded in SSR; it's loaded client-side after hydration. The page has a skeleton loader and aria-busy="true". I need a browser to wait for React hydration to complete and the data to load via client-side API calls. ACTION: NAVIGATE_BROWSER ```json {"url": "https://learn.example.com/courses?topic=machine-learning", "stealth_profile": "chrome124_desktop", "wait_for": ".course-card:not(.skeleton)"} ```"""}, {"role": "user", "content": """Observation: Browser loaded. React hydrated. Courses loaded after 2.1s.

Deep Learning Foundations

Dr. Andrew Ng4.9 (12,340 reviews)$49.99Intermediate40 hours

Practical NLP with Transformers

Hugging Face Team4.8 (8,921 reviews)FreeAdvanced25 hours

ML Engineering in Production

Chip Huyen4.7 (5,443 reviews)$39.99Advanced30 hours
"""}, {"role": "assistant", "content": """ React hydration completed and courses loaded. The aria-busy changed from "true" to "false", confirming the data is ready. I see 3 courses. Let me extract them. ACTION: RETURN_RESULT ```json { "status": "success", "data": [ {"title": "Deep Learning Foundations", "instructor": "Dr. Andrew Ng", "rating": "4.9 (12,340 reviews)", "price": "$49.99", "level": "Intermediate", "duration": "40 hours"}, {"title": "Practical NLP with Transformers", "instructor": "Hugging Face Team", "rating": "4.8 (8,921 reviews)", "price": "Free", "level": "Advanced", "duration": "25 hours"}, {"title": "ML Engineering in Production", "instructor": "Chip Huyen", "rating": "4.7 (5,443 reviews)", "price": "$39.99", "level": "Advanced", "duration": "30 hours"} ], "message": "Extracted 3 ML courses. Next.js SSG page with client-side data loading (no SSR data in __NEXT_DATA__). Used browser to wait for React hydration and data fetch (2.1s).", "extraction_method": "browser_react_hydration", "steps_taken": 3 } ```"""} ], "source": "synthetic_v2", "task_type": "react_hydration_wait"}) return scenarios def main(): print("Generating expanded v2 training data...") v2_scenarios = generate_all_scenarios() v2_ds = Dataset.from_list(v2_scenarios) print(f"Generated {len(v2_ds)} new v2 scenarios") # Load existing dataset print("Loading existing dataset...") existing = load_dataset("sukritvemula/webscrape-agent-training-data") existing_train = existing["train"] existing_test = existing["test"] print(f"Existing: {len(existing_train)} train, {len(existing_test)} test") # Add source/task_type to v2 if not present in existing for col in ["source", "task_type"]: if col not in existing_train.column_names: existing_train = existing_train.add_column(col, ["legacy"] * len(existing_train)) # Combine combined_train = concatenate_datasets([existing_train, v2_ds]) combined_train = combined_train.shuffle(seed=42) print(f"\nCombined train: {len(combined_train)}") # Stats sources = {} for item in combined_train: src = item.get("source", "unknown") sources[src] = sources.get(src, 0) + 1 print("Composition:") for src, count in sorted(sources.items(), key=lambda x: -x[1]): print(f" {src}: {count} ({count/len(combined_train)*100:.1f}%)") # Push ds_dict = DatasetDict({"train": combined_train, "test": existing_test}) print("\nPushing to Hub...") ds_dict.push_to_hub("sukritvemula/webscrape-agent-training-data", private=False) print("✅ Done!") if __name__ == "__main__": main()