NeerajCodz committed on
Commit
8512126
·
1 Parent(s): fa81e4d

test: add end-to-end scraper test script with successful validation

Browse files
backend/app/api/routes/__pycache__/episode.cpython-314.pyc CHANGED
Binary files a/backend/app/api/routes/__pycache__/episode.cpython-314.pyc and b/backend/app/api/routes/__pycache__/episode.cpython-314.pyc differ
 
backend/app/models/__pycache__/router.cpython-314.pyc CHANGED
Binary files a/backend/app/models/__pycache__/router.cpython-314.pyc and b/backend/app/models/__pycache__/router.cpython-314.pyc differ
 
backend/app/models/providers/__pycache__/__init__.cpython-314.pyc CHANGED
Binary files a/backend/app/models/providers/__pycache__/__init__.cpython-314.pyc and b/backend/app/models/providers/__pycache__/__init__.cpython-314.pyc differ
 
backend/app/models/providers/__pycache__/google.cpython-314.pyc CHANGED
Binary files a/backend/app/models/providers/__pycache__/google.cpython-314.pyc and b/backend/app/models/providers/__pycache__/google.cpython-314.pyc differ
 
backend/app/models/providers/__pycache__/groq.cpython-314.pyc CHANGED
Binary files a/backend/app/models/providers/__pycache__/groq.cpython-314.pyc and b/backend/app/models/providers/__pycache__/groq.cpython-314.pyc differ
 
backend/test_scraper.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Test script to verify scraper functionality end-to-end."""
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ from typing import Any
7
+
8
+ import httpx
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ BASE_URL = "http://localhost:8000/api"
14
+
15
+
16
def _log_failure(message: str, exc: Exception) -> None:
    """Log a failure line for *exc*, plus the HTTP response body if one is attached.

    Exceptions raised by ``response.raise_for_status()`` are expected to carry a
    ``response`` attribute; other exceptions (connection errors, ``KeyError`` on
    an unexpected payload) do not, so the attribute is looked up defensively.
    """
    logger.error(f"✗ {message}: {exc}")
    response = getattr(exc, "response", None)
    if response is not None:
        logger.error(f" Response: {response.text}")


async def test_scraper():
    """Drive one scraping episode through the REST API and log the outcome.

    The flow is: POST ``/episode/reset`` to create an episode, POST
    ``/episode/step`` once per scripted action (navigate, extract_field,
    done), then GET ``/episode/state/{episode_id}`` for the final state.

    Nothing is returned or raised: every failure is caught and logged. A
    failed reset aborts the run; a failed step stops the action loop, but
    the final state is still fetched afterwards.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        # Step 1: Create a scraping task
        logger.info("Creating scraping episode...")

        reset_payload = {
            "task_id": "test-scrape-quotes",
            "seed": 42,
            "config": {
                "start_url": "http://quotes.toscrape.com",
                "target_fields": {
                    "quotes": {
                        "text": "quote text",
                        "author": "quote author",
                        "tags": "quote tags",
                    }
                },
                "max_steps": 20,
                "timeout": 300,
            },
        }

        try:
            response = await client.post(f"{BASE_URL}/episode/reset", json=reset_payload)
            response.raise_for_status()
            reset_data = response.json()
            episode_id = reset_data["episode_id"]
            logger.info(f"✓ Episode created: {episode_id}")
            logger.info(f" Initial observation: {reset_data['observation']['current_url']}")
        except Exception as e:
            # Without an episode id none of the later requests can be made.
            _log_failure("Failed to create episode", e)
            return

        # Step 2: Execute a few actions
        actions = [
            {
                "action_type": "navigate",
                "parameters": {"url": "http://quotes.toscrape.com"},
                "reasoning": "Navigate to the quotes website to start scraping",
            },
            {
                "action_type": "extract_field",
                "parameters": {
                    "field_name": "quotes",
                    "css_selector": ".quote .text",
                },
                "reasoning": "Extract all quotes from the page",
            },
            {
                "action_type": "done",
                "parameters": {"success": True},
                "reasoning": "Extraction complete",
            },
        ]

        for i, action_data in enumerate(actions, 1):
            logger.info(f"\nStep {i}: {action_data['action_type']}")

            step_payload = {
                "episode_id": episode_id,
                "action": action_data,
            }

            try:
                response = await client.post(f"{BASE_URL}/episode/step", json=step_payload)
                response.raise_for_status()
                step_data = response.json()

                logger.info("✓ Action executed successfully")
                logger.info(f" Reward: {step_data['reward']:.2f}")
                logger.info(f" Progress: {step_data['observation'].get('extraction_progress', 0):.1f}%")
                logger.info(f" Terminated: {step_data['terminated']}")

                if step_data.get('reward_breakdown'):
                    logger.info(" Reward breakdown:")
                    for key, value in step_data['reward_breakdown'].items():
                        # Only numeric components can take the ``.2f`` format.
                        if isinstance(value, (int, float)):
                            logger.info(f" {key}: {value:.2f}")

                if step_data['terminated'] or step_data['truncated']:
                    logger.info("\nEpisode finished!")
                    break

            except Exception as e:
                # Stop issuing actions, but still fall through to the state fetch.
                _log_failure(f"Step {i} failed", e)
                break

        # Step 3: Get final state
        logger.info("\n" + "="*60)
        logger.info("Fetching final episode state...")

        try:
            response = await client.get(f"{BASE_URL}/episode/state/{episode_id}")
            response.raise_for_status()
            state_data = response.json()

            logger.info("✓ Final state retrieved")
            logger.info(f" Episode ID: {state_data.get('episode_id', 'N/A')}")
            logger.info(f" Steps: {state_data.get('step_number', 0)}")
            logger.info(f" Total reward: {state_data.get('total_reward', 0.0):.2f}")
            logger.info(f" Terminal: {state_data.get('is_terminal', False)}")
            logger.info(f" Extracted data: {json.dumps(state_data.get('extracted_data', {}), indent=2)}")

        except Exception as e:
            _log_failure("Failed to get state", e)
127
+
128
+
129
async def test_websocket():
    """Check that the API server is reachable and log the WebSocket endpoint.

    No WebSocket connection is opened here: the function only issues a GET
    to ``http://localhost:8000/docs`` and logs the result. Failures are
    logged, never raised.
    """
    logger.info("\n" + "="*60)
    logger.info("Testing WebSocket endpoint...")

    try:
        # Just verify the endpoint exists
        async with httpx.AsyncClient() as client:
            response = await client.get("http://localhost:8000/docs")
            if response.status_code == 200:
                logger.info("✓ API docs accessible at http://localhost:8000/docs")
                # ``{episode_id}`` is a literal placeholder shown to the reader,
                # which is why this is deliberately not an f-string.
                logger.info(" WebSocket endpoint: ws://localhost:8000/ws/episode/{episode_id}")
            else:
                # Surface unexpected statuses instead of finishing silently.
                logger.warning(f"✗ API docs returned HTTP {response.status_code}")
    except Exception as e:
        logger.error(f"✗ Failed to check docs: {e}")
143
+
144
+
145
async def main():
    """Run the scraper check, then the docs/WebSocket check, between log banners."""
    rule = "=" * 60

    # Opening banner.
    for line in (rule, "ScrapeRL End-to-End Test", rule):
        logger.info(line)

    await test_scraper()
    await test_websocket()

    # Closing banner; the leading newline separates it from the test output.
    for line in ("\n" + rule, "Testing complete!", rule):
        logger.info(line)


if __name__ == "__main__":
    asyncio.run(main())