Spaces:
Runtime error
Runtime error
Commit
·
17bef4b
1
Parent(s):
e2784d5
task: ensure design scraper captures images
Browse files
- scraper.py +34 -11
scraper.py
CHANGED
|
@@ -33,8 +33,23 @@ async def take_screenshot(url, directory):
|
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
| 35 |
await page.goto(url)
|
| 36 |
-
# Wait for
|
| 37 |
-
await page.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# Get full height
|
| 39 |
height = await page.evaluate('document.body.scrollHeight')
|
| 40 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
|
@@ -43,8 +58,23 @@ async def take_screenshot(url, directory):
|
|
| 43 |
# Mobile screenshot (480px width)
|
| 44 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 45 |
await page.goto(url)
|
| 46 |
-
# Wait for
|
| 47 |
-
await page.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# Get full height
|
| 49 |
height = await page.evaluate('document.body.scrollHeight')
|
| 50 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
|
@@ -66,18 +96,11 @@ async def scrape_design(design_id):
|
|
| 66 |
print(f"{design_id}: Response status: {response.status_code}")
|
| 67 |
|
| 68 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 69 |
-
author_meta = soup.select_one('meta[name="author"]')
|
| 70 |
-
|
| 71 |
-
# Debug found elements
|
| 72 |
-
print(f"{design_id}: \nFound elements:")
|
| 73 |
-
print(f"h1: {soup.select_one('h1').text}")
|
| 74 |
-
print(f"author: {author_meta['content']}")
|
| 75 |
|
| 76 |
# Extract metadata with error handling
|
| 77 |
try:
|
| 78 |
metadata = {
|
| 79 |
"id": design_id,
|
| 80 |
-
"author": author_meta["content"] if author_meta else "Unknown Author",
|
| 81 |
"url": design_url,
|
| 82 |
"css_url": css_url
|
| 83 |
}
|
|
|
|
| 33 |
# Desktop screenshot (1920px width)
|
| 34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
| 35 |
await page.goto(url)
|
| 36 |
+
# Wait for network to be idle (no requests for at least 500ms)
|
| 37 |
+
await page.wait_for_load_state('networkidle')
|
| 38 |
+
|
| 39 |
+
# Wait for all images to be loaded
|
| 40 |
+
await page.evaluate("""() => {
|
| 41 |
+
return Promise.all(
|
| 42 |
+
Array.from(document.images)
|
| 43 |
+
.filter(img => !img.complete)
|
| 44 |
+
.map(img => new Promise(resolve => {
|
| 45 |
+
img.onload = img.onerror = resolve;
|
| 46 |
+
}))
|
| 47 |
+
);
|
| 48 |
+
}""")
|
| 49 |
+
|
| 50 |
+
# Optional extra wait for animations/transitions to finish (currently disabled)
|
| 51 |
+
#await page.wait_for_timeout(2000) # 2 second delay
|
| 52 |
+
|
| 53 |
# Get full height
|
| 54 |
height = await page.evaluate('document.body.scrollHeight')
|
| 55 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
|
|
|
| 58 |
# Mobile screenshot (480px width)
|
| 59 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
| 60 |
await page.goto(url)
|
| 61 |
+
# Wait for network to be idle (no requests for at least 500ms)
|
| 62 |
+
await page.wait_for_load_state('networkidle')
|
| 63 |
+
|
| 64 |
+
# Wait for all images to be loaded
|
| 65 |
+
await page.evaluate("""() => {
|
| 66 |
+
return Promise.all(
|
| 67 |
+
Array.from(document.images)
|
| 68 |
+
.filter(img => !img.complete)
|
| 69 |
+
.map(img => new Promise(resolve => {
|
| 70 |
+
img.onload = img.onerror = resolve;
|
| 71 |
+
}))
|
| 72 |
+
);
|
| 73 |
+
}""")
|
| 74 |
+
|
| 75 |
+
# Optional extra wait for animations/transitions to finish (currently disabled)
|
| 76 |
+
#await page.wait_for_timeout(2000) # 2 second delay
|
| 77 |
+
|
| 78 |
# Get full height
|
| 79 |
height = await page.evaluate('document.body.scrollHeight')
|
| 80 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
|
|
|
| 96 |
print(f"{design_id}: Response status: {response.status_code}")
|
| 97 |
|
| 98 |
soup = BeautifulSoup(response.text, "html.parser")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Extract metadata with error handling
|
| 101 |
try:
|
| 102 |
metadata = {
|
| 103 |
"id": design_id,
|
|
|
|
| 104 |
"url": design_url,
|
| 105 |
"css_url": css_url
|
| 106 |
}
|