File size: 5,862 Bytes
8512126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Test script to verify scraper functionality end-to-end."""

import asyncio
import json
import logging
from typing import Any

import httpx

# Configure the root logger at INFO level so the progress messages emitted by
# this script are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Root of the HTTP API exercised by test_scraper(); the script expects the
# server to be reachable locally on port 8000.
BASE_URL = "http://localhost:8000/api"


def _log_failure(summary, exc):
    """Log *summary* with the exception, plus the HTTP response body if any.

    ``httpx.HTTPStatusError`` (raised by ``raise_for_status()``) carries the
    offending response; most other exceptions have no ``response`` attribute,
    in which case only the summary line is logged.
    """
    logger.error(f"✗ {summary}: {exc}")
    response = getattr(exc, "response", None)
    if response is not None:
        logger.error(f"  Response: {response.text}")


async def _create_episode(client, reset_payload):
    """POST the reset payload and return the new episode id, or None on failure."""
    try:
        response = await client.post(f"{BASE_URL}/episode/reset", json=reset_payload)
        response.raise_for_status()
        reset_data = response.json()
        episode_id = reset_data["episode_id"]
        logger.info(f"✓ Episode created: {episode_id}")
        logger.info(f"  Initial observation: {reset_data['observation']['current_url']}")
    except Exception as exc:
        # Best-effort diagnostic script: report any failure instead of raising.
        _log_failure("Failed to create episode", exc)
        return None
    return episode_id


async def _run_actions(client, episode_id, actions):
    """POST each action in order, stopping at the first failure or episode end."""
    for i, action_data in enumerate(actions, 1):
        logger.info(f"\nStep {i}: {action_data['action_type']}")

        step_payload = {
            "episode_id": episode_id,
            "action": action_data,
        }

        try:
            response = await client.post(f"{BASE_URL}/episode/step", json=step_payload)
            response.raise_for_status()
            step_data = response.json()

            logger.info("✓ Action executed successfully")
            # A missing flag is treated as "not finished" rather than as a
            # failure of an otherwise successful step.
            terminated = step_data.get("terminated", False)
            truncated = step_data.get("truncated", False)
            logger.info(f"  Reward: {step_data['reward']:.2f}")
            logger.info(f"  Progress: {step_data['observation'].get('extraction_progress', 0):.1f}%")
            logger.info(f"  Terminated: {terminated}")

            reward_breakdown = step_data.get("reward_breakdown")
            if reward_breakdown:
                logger.info("  Reward breakdown:")
                for key, value in reward_breakdown.items():
                    # Only numeric components can be rendered with ``:.2f``.
                    if isinstance(value, (int, float)):
                        logger.info(f"    {key}: {value:.2f}")
        except Exception as exc:
            # Best-effort diagnostic script: report and stop stepping.
            _log_failure(f"Step {i} failed", exc)
            break

        if terminated or truncated:
            logger.info("\nEpisode finished!")
            break


async def _log_final_state(client, episode_id):
    """GET the episode state and log its summary fields."""
    try:
        response = await client.get(f"{BASE_URL}/episode/state/{episode_id}")
        response.raise_for_status()
        state_data = response.json()

        logger.info("✓ Final state retrieved")
        logger.info(f"  Episode ID: {state_data.get('episode_id', 'N/A')}")
        logger.info(f"  Steps: {state_data.get('step_number', 0)}")
        logger.info(f"  Total reward: {state_data.get('total_reward', 0.0):.2f}")
        logger.info(f"  Terminal: {state_data.get('is_terminal', False)}")
        logger.info(f"  Extracted data: {json.dumps(state_data.get('extracted_data', {}), indent=2)}")
    except Exception as exc:
        # Same reporting as the other phases, including the response body.
        _log_failure("Failed to get state", exc)


async def test_scraper():
    """Drive one scraping episode through the HTTP API and log the outcome.

    The run has three phases, all against ``BASE_URL``:

    1. ``POST /episode/reset`` creates an episode from a fixed payload.
    2. ``POST /episode/step`` is called for a fixed list of actions
       (``navigate``, ``extract_field``, ``done``); stepping stops early when
       a step fails or the episode reports it is terminated or truncated.
    3. ``GET /episode/state/{episode_id}`` fetches and logs the final state.

    Failures are logged, never raised. If the episode cannot be created the
    function returns immediately; otherwise the final state is fetched even
    when a step failed.
    """
    start_url = "http://quotes.toscrape.com"

    reset_payload = {
        "task_id": "test-scrape-quotes",
        "seed": 42,
        "config": {
            "start_url": start_url,
            "target_fields": {
                "quotes": {
                    "text": "quote text",
                    "author": "quote author",
                    "tags": "quote tags",
                }
            },
            "max_steps": 20,
            "timeout": 300,
        },
    }

    actions = [
        {
            "action_type": "navigate",
            "parameters": {"url": start_url},
            "reasoning": "Navigate to the quotes website to start scraping",
        },
        {
            "action_type": "extract_field",
            "parameters": {
                "field_name": "quotes",
                "css_selector": ".quote .text",
            },
            "reasoning": "Extract all quotes from the page",
        },
        {
            "action_type": "done",
            "parameters": {"success": True},
            "reasoning": "Extraction complete",
        },
    ]

    async with httpx.AsyncClient(timeout=60.0) as client:
        # Step 1: Create a scraping task
        logger.info("Creating scraping episode...")
        episode_id = await _create_episode(client, reset_payload)
        if episode_id is None:
            return

        # Step 2: Execute a few actions
        await _run_actions(client, episode_id, actions)

        # Step 3: Get final state
        logger.info("\n" + "=" * 60)
        logger.info("Fetching final episode state...")
        await _log_final_state(client, episode_id)


async def test_websocket():
    """Probe the API server and report where the WebSocket endpoint lives.

    Despite the name, no WebSocket connection is opened. The function only
    GETs the API docs page as a reachability check and, on success, logs the
    WebSocket URL template. A non-200 status or a request failure is logged as
    an error; nothing is raised.
    """
    logger.info("\n" + "=" * 60)
    logger.info("Testing WebSocket endpoint...")

    docs_url = "http://localhost:8000/docs"

    try:
        # Just verify the server answers on its docs page.
        async with httpx.AsyncClient() as client:
            response = await client.get(docs_url)
    except Exception as e:
        # Best-effort diagnostic script: report the failure instead of raising.
        logger.error(f"✗ Failed to check docs: {e}")
        return

    if response.status_code == 200:
        logger.info(f"✓ API docs accessible at {docs_url}")
        # "{episode_id}" is a literal placeholder in the URL template, not an
        # f-string field.
        logger.info("  WebSocket endpoint: ws://localhost:8000/ws/episode/{episode_id}")
    else:
        # Previously a non-200 answer produced no output at all.
        logger.error(f"✗ API docs returned HTTP {response.status_code} at {docs_url}")


async def main():
    """Log a banner, run each check in sequence, then log a closing banner."""
    rule = "=" * 60

    for line in (rule, "ScrapeRL End-to-End Test", rule):
        logger.info(line)

    # The checks run one after another, scraper first.
    for check in (test_scraper, test_websocket):
        await check()

    for line in ("\n" + rule, "Testing complete!", rule):
        logger.info(line)


# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())