Spaces:
Runtime error
Runtime error
Commit
·
21dc073
1
Parent(s):
9cee9a1
task: adds scraper wait time to load background images
Browse files- data_collection/notebook.ipynb +5 -62
- data_collection/scraper.py +8 -10
data_collection/notebook.ipynb
CHANGED
|
@@ -13,72 +13,15 @@
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
-
"execution_count":
|
| 17 |
"metadata": {},
|
| 18 |
"outputs": [
|
| 19 |
{
|
| 20 |
"name": "stdout",
|
| 21 |
"output_type": "stream",
|
| 22 |
"text": [
|
| 23 |
-
"Testing scraper with design 001...\n",
|
| 24 |
-
"001: Response status: 200\n",
|
| 25 |
-
"Success!\n",
|
| 26 |
-
"Testing scraper with design 002...\n",
|
| 27 |
-
"002: Response status: 200\n",
|
| 28 |
-
"Success!\n",
|
| 29 |
-
"Testing scraper with design 003...\n",
|
| 30 |
-
"003: Response status: 200\n",
|
| 31 |
-
"Success!\n",
|
| 32 |
-
"Testing scraper with design 004...\n",
|
| 33 |
-
"004: Response status: 200\n",
|
| 34 |
-
"Success!\n",
|
| 35 |
-
"Testing scraper with design 005...\n",
|
| 36 |
-
"005: Response status: 200\n",
|
| 37 |
-
"Success!\n",
|
| 38 |
-
"Testing scraper with design 006...\n",
|
| 39 |
-
"006: Response status: 200\n",
|
| 40 |
-
"Success!\n",
|
| 41 |
-
"Testing scraper with design 007...\n",
|
| 42 |
-
"007: Response status: 200\n",
|
| 43 |
-
"Success!\n",
|
| 44 |
-
"Testing scraper with design 008...\n",
|
| 45 |
-
"008: Response status: 200\n",
|
| 46 |
-
"Success!\n",
|
| 47 |
-
"Testing scraper with design 009...\n",
|
| 48 |
-
"009: Response status: 200\n",
|
| 49 |
-
"Success!\n",
|
| 50 |
-
"Testing scraper with design 010...\n",
|
| 51 |
-
"010: Response status: 200\n",
|
| 52 |
-
"Success!\n",
|
| 53 |
-
"Testing scraper with design 011...\n",
|
| 54 |
-
"011: Response status: 200\n",
|
| 55 |
-
"Success!\n",
|
| 56 |
-
"Testing scraper with design 012...\n",
|
| 57 |
-
"012: Response status: 200\n",
|
| 58 |
-
"Success!\n",
|
| 59 |
-
"Testing scraper with design 013...\n",
|
| 60 |
-
"013: Response status: 200\n",
|
| 61 |
-
"Success!\n",
|
| 62 |
-
"Testing scraper with design 014...\n",
|
| 63 |
-
"014: Response status: 200\n",
|
| 64 |
-
"Success!\n",
|
| 65 |
-
"Testing scraper with design 015...\n",
|
| 66 |
-
"015: Response status: 200\n",
|
| 67 |
-
"Success!\n",
|
| 68 |
-
"Testing scraper with design 016...\n",
|
| 69 |
-
"016: Response status: 200\n",
|
| 70 |
-
"Success!\n",
|
| 71 |
-
"Testing scraper with design 017...\n",
|
| 72 |
-
"017: Response status: 200\n",
|
| 73 |
-
"Success!\n",
|
| 74 |
-
"Testing scraper with design 018...\n",
|
| 75 |
-
"018: Response status: 200\n",
|
| 76 |
-
"Success!\n",
|
| 77 |
-
"Testing scraper with design 019...\n",
|
| 78 |
-
"019: Response status: 200\n",
|
| 79 |
-
"Success!\n",
|
| 80 |
-
"Testing scraper with design 020...\n",
|
| 81 |
-
"020: Response status: 200\n",
|
| 82 |
"Success!\n"
|
| 83 |
]
|
| 84 |
}
|
|
@@ -88,8 +31,8 @@
|
|
| 88 |
"import asyncio\n",
|
| 89 |
"\n",
|
| 90 |
"# Pick from a range of 001 to 221\n",
|
| 91 |
-
"test_set = [f\"{i:03d}\" for i in range(1, 21)]\n",
|
| 92 |
-
"
|
| 93 |
"async def test_scraper(ids):\n",
|
| 94 |
" for test_design_id in ids:\n",
|
| 95 |
" try:\n",
|
|
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
+
"execution_count": 5,
|
| 17 |
"metadata": {},
|
| 18 |
"outputs": [
|
| 19 |
{
|
| 20 |
"name": "stdout",
|
| 21 |
"output_type": "stream",
|
| 22 |
"text": [
|
| 23 |
+
"Testing scraper with design 112...\n",
|
| 24 |
+
"112: Response status: 200\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"Success!\n"
|
| 26 |
]
|
| 27 |
}
|
|
|
|
| 31 |
"import asyncio\n",
|
| 32 |
"\n",
|
| 33 |
"# Pick from a range of 001 to 221\n",
|
| 34 |
+
"#test_set = [f\"{i:03d}\" for i in range(1, 21)]\n",
|
| 35 |
+
"test_set = [\"112\"]\n",
|
| 36 |
"async def test_scraper(ids):\n",
|
| 37 |
" for test_design_id in ids:\n",
|
| 38 |
" try:\n",
|
data_collection/scraper.py
CHANGED
|
@@ -31,27 +31,25 @@ async def take_screenshot(url, directory):
|
|
| 31 |
browser = await p.chromium.launch()
|
| 32 |
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
-
page = await browser.new_page(viewport={'width':
|
| 35 |
await page.goto(url)
|
| 36 |
# Wait for network to be idle (no requests for at least 500ms)
|
| 37 |
-
await page.wait_for_load_state()
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
#await page.wait_for_timeout(2000) # 2 second delay
|
| 41 |
|
| 42 |
# Get full height
|
| 43 |
height = await page.evaluate('document.body.scrollHeight')
|
| 44 |
-
await page.set_viewport_size({'width':
|
| 45 |
await page.screenshot(path=f"{directory}/screenshot_desktop.png", full_page=True)
|
| 46 |
|
| 47 |
# Mobile screenshot (480px width)
|
| 48 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 49 |
await page.goto(url)
|
| 50 |
# Wait for network to be idle (no requests for at least 500ms)
|
| 51 |
-
await page.wait_for_load_state()
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
#await page.wait_for_timeout(2000) # 2 second delay
|
| 55 |
|
| 56 |
# Get full height
|
| 57 |
height = await page.evaluate('document.body.scrollHeight')
|
|
|
|
| 31 |
browser = await p.chromium.launch()
|
| 32 |
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
+
page = await browser.new_page(viewport={'width': 1600, 'height': 1080})
|
| 35 |
await page.goto(url)
|
| 36 |
# Wait for network to be idle (no requests for at least 500ms)
|
| 37 |
+
await page.wait_for_load_state("networkidle")
|
| 38 |
+
# Add a significant delay to ensure background images are loaded
|
| 39 |
+
await page.wait_for_timeout(2000)
|
|
|
|
| 40 |
|
| 41 |
# Get full height
|
| 42 |
height = await page.evaluate('document.body.scrollHeight')
|
| 43 |
+
await page.set_viewport_size({'width': 1600, 'height': int(height)})
|
| 44 |
await page.screenshot(path=f"{directory}/screenshot_desktop.png", full_page=True)
|
| 45 |
|
| 46 |
# Mobile screenshot (480px width)
|
| 47 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 48 |
await page.goto(url)
|
| 49 |
# Wait for network to be idle (no requests for at least 500ms)
|
| 50 |
+
await page.wait_for_load_state("networkidle")
|
| 51 |
+
# Add a significant delay to ensure background images are loaded
|
| 52 |
+
await page.wait_for_timeout(2000)
|
|
|
|
| 53 |
|
| 54 |
# Get full height
|
| 55 |
height = await page.evaluate('document.body.scrollHeight')
|