Spaces:
Runtime error
Runtime error
Commit
·
d63acef
1
Parent(s):
91237b8
task: async scraper with delay
Browse files
- scraper.py +7 -3
- test_scraper.ipynb +24 -17
scraper.py
CHANGED
|
@@ -33,6 +33,8 @@ async def take_screenshot(url, directory):
|
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
| 35 |
await page.goto(url)
|
|
|
|
|
|
|
| 36 |
# Get full height
|
| 37 |
height = await page.evaluate('document.body.scrollHeight')
|
| 38 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
|
@@ -41,6 +43,8 @@ async def take_screenshot(url, directory):
|
|
| 41 |
# Mobile screenshot (480px width)
|
| 42 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 43 |
await page.goto(url)
|
|
|
|
|
|
|
| 44 |
# Get full height
|
| 45 |
height = await page.evaluate('document.body.scrollHeight')
|
| 46 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
|
@@ -59,14 +63,14 @@ async def scrape_design(design_id):
|
|
| 59 |
|
| 60 |
# Get design page
|
| 61 |
response = requests.get(design_url)
|
| 62 |
-
print(f"Response status: {response.status_code}")
|
| 63 |
|
| 64 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 65 |
author_meta = soup.select_one('meta[name="author"]')
|
| 66 |
|
| 67 |
# Debug found elements
|
| 68 |
-
print("\nFound elements:")
|
| 69 |
-
print(f"h1: {soup.select_one('h1')
|
| 70 |
print(f"author: {author_meta['content']}")
|
| 71 |
|
| 72 |
# Extract metadata with error handling
|
|
|
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
| 35 |
await page.goto(url)
|
| 36 |
+
# Wait for fade transitions
|
| 37 |
+
await page.wait_for_timeout(1500)
|
| 38 |
# Get full height
|
| 39 |
height = await page.evaluate('document.body.scrollHeight')
|
| 40 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
|
|
|
| 43 |
# Mobile screenshot (480px width)
|
| 44 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 45 |
await page.goto(url)
|
| 46 |
+
# Wait for fade transitions
|
| 47 |
+
await page.wait_for_timeout(1500)
|
| 48 |
# Get full height
|
| 49 |
height = await page.evaluate('document.body.scrollHeight')
|
| 50 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
|
|
|
| 63 |
|
| 64 |
# Get design page
|
| 65 |
response = requests.get(design_url)
|
| 66 |
+
print(f"{design_id}: Response status: {response.status_code}")
|
| 67 |
|
| 68 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 69 |
author_meta = soup.select_one('meta[name="author"]')
|
| 70 |
|
| 71 |
# Debug found elements
|
| 72 |
+
print(f"{design_id}: \nFound elements:")
|
| 73 |
+
print(f"h1: {soup.select_one('h1').text}")
|
| 74 |
print(f"author: {author_meta['content']}")
|
| 75 |
|
| 76 |
# Extract metadata with error handling
|
test_scraper.ipynb
CHANGED
|
@@ -27,32 +27,39 @@
|
|
| 27 |
"name": "stdout",
|
| 28 |
"output_type": "stream",
|
| 29 |
"text": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"Testing scraper with design 221...\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
"Success!\n"
|
| 32 |
]
|
| 33 |
-
},
|
| 34 |
-
{
|
| 35 |
-
"name": "stderr",
|
| 36 |
-
"output_type": "stream",
|
| 37 |
-
"text": [
|
| 38 |
-
"/var/folders/02/z250w46j5_514v22h_ct_zq40000gn/T/ipykernel_37704/2179274543.py:8: RuntimeWarning: coroutine 'scrape_design' was never awaited\n",
|
| 39 |
-
" scrape_design(test_design_id)\n",
|
| 40 |
-
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
|
| 41 |
-
]
|
| 42 |
}
|
| 43 |
],
|
| 44 |
"source": [
|
| 45 |
"from scraper import scrape_design\n",
|
|
|
|
|
|
|
|
|
|
| 46 |
"\n",
|
| 47 |
-
"
|
| 48 |
-
"test_design_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"\n",
|
| 50 |
-
"
|
| 51 |
-
" print(f\"Testing scraper with design {test_design_id}...\")\n",
|
| 52 |
-
" await scrape_design(test_design_id)\n",
|
| 53 |
-
" print(\"Success!\")\n",
|
| 54 |
-
"except Exception as e:\n",
|
| 55 |
-
" print(f\"Error: {str(e)}\")"
|
| 56 |
]
|
| 57 |
},
|
| 58 |
{
|
|
|
|
| 27 |
"name": "stdout",
|
| 28 |
"output_type": "stream",
|
| 29 |
"text": [
|
| 30 |
+
"Testing scraper with design 220...\n",
|
| 31 |
+
"Response status: 200\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"Found elements:\n",
|
| 34 |
+
"h1: CSS Zen Garden\n",
|
| 35 |
+
"author: Dave Shea\n",
|
| 36 |
+
"Success!\n",
|
| 37 |
"Testing scraper with design 221...\n",
|
| 38 |
+
"Response status: 200\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"Found elements:\n",
|
| 41 |
+
"h1: CSS Zen Garden\n",
|
| 42 |
+
"author: Dave Shea\n",
|
| 43 |
"Success!\n"
|
| 44 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"source": [
|
| 48 |
"from scraper import scrape_design\n",
|
| 49 |
+
"import asyncio\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"test_design_ids = [\"220\", \"221\"]\n",
|
| 52 |
"\n",
|
| 53 |
+
"async def test_scraper(ids):\n",
|
| 54 |
+
" for test_design_id in ids:\n",
|
| 55 |
+
" try:\n",
|
| 56 |
+
" print(f\"Testing scraper with design {test_design_id}...\")\n",
|
| 57 |
+
" await scrape_design(test_design_id)\n",
|
| 58 |
+
" print(\"Success!\")\n",
|
| 59 |
+
" except Exception as e:\n",
|
| 60 |
+
" print(f\"Error: {str(e)}\")\n",
|
| 61 |
"\n",
|
| 62 |
+
"asyncio.run(test_scraper(test_design_ids))"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
]
|
| 64 |
},
|
| 65 |
{
|