Technologic101 committed on
Commit
17bef4b
·
1 Parent(s): e2784d5

task: ensure design scraper captures images

Browse files
Files changed (1) hide show
  1. scraper.py +34 -11
scraper.py CHANGED
@@ -33,8 +33,23 @@ async def take_screenshot(url, directory):
33
  # Desktop screenshot (1920px width)
34
  page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
35
  await page.goto(url)
36
- # Wait for fade transitions
37
- await page.wait_for_timeout(1500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # Get full height
39
  height = await page.evaluate('document.body.scrollHeight')
40
  await page.set_viewport_size({'width': 1920, 'height': int(height)})
@@ -43,8 +58,23 @@ async def take_screenshot(url, directory):
43
  # Mobile screenshot (480px width)
44
  page = await browser.new_page(viewport={'width': 480, 'height': 1080})
45
  await page.goto(url)
46
- # Wait for fade transitions
47
- await page.wait_for_timeout(1500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # Get full height
49
  height = await page.evaluate('document.body.scrollHeight')
50
  await page.set_viewport_size({'width': 480, 'height': int(height)})
@@ -66,18 +96,11 @@ async def scrape_design(design_id):
66
  print(f"{design_id}: Response status: {response.status_code}")
67
 
68
  soup = BeautifulSoup(response.text, "html.parser")
69
- author_meta = soup.select_one('meta[name="author"]')
70
-
71
- # Debug found elements
72
- print(f"{design_id}: \nFound elements:")
73
- print(f"h1: {soup.select_one('h1').text}")
74
- print(f"author: {author_meta['content']}")
75
 
76
  # Extract metadata with error handling
77
  try:
78
  metadata = {
79
  "id": design_id,
80
- "author": author_meta["content"] if author_meta else "Unknown Author",
81
  "url": design_url,
82
  "css_url": css_url
83
  }
 
33
  # Desktop screenshot (1920px width)
34
  page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
35
  await page.goto(url)
36
+ # Wait for network to be idle (no requests for at least 500ms)
37
+ await page.wait_for_load_state('networkidle')
38
+
39
+ # Wait for all images to be loaded
40
+ await page.evaluate("""() => {
41
+ return Promise.all(
42
+ Array.from(document.images)
43
+ .filter(img => !img.complete)
44
+ .map(img => new Promise(resolve => {
45
+ img.onload = img.onerror = resolve;
46
+ }))
47
+ );
48
+ }""")
49
+
50
+ # Additional wait to ensure any animations/transitions complete
51
+ #await page.wait_for_timeout(2000) # 2 second delay
52
+
53
  # Get full height
54
  height = await page.evaluate('document.body.scrollHeight')
55
  await page.set_viewport_size({'width': 1920, 'height': int(height)})
 
58
  # Mobile screenshot (480px width)
59
  page = await browser.new_page(viewport={'width': 480, 'height': 1080})
60
  await page.goto(url)
61
+ # Wait for network to be idle (no requests for at least 500ms)
62
+ await page.wait_for_load_state('networkidle')
63
+
64
+ # Wait for all images to be loaded
65
+ await page.evaluate("""() => {
66
+ return Promise.all(
67
+ Array.from(document.images)
68
+ .filter(img => !img.complete)
69
+ .map(img => new Promise(resolve => {
70
+ img.onload = img.onerror = resolve;
71
+ }))
72
+ );
73
+ }""")
74
+
75
+ # Additional wait to ensure any animations/transitions complete
76
+ #await page.wait_for_timeout(2000) # 2 second delay
77
+
78
  # Get full height
79
  height = await page.evaluate('document.body.scrollHeight')
80
  await page.set_viewport_size({'width': 480, 'height': int(height)})
 
96
  print(f"{design_id}: Response status: {response.status_code}")
97
 
98
  soup = BeautifulSoup(response.text, "html.parser")
 
 
 
 
 
 
99
 
100
  # Extract metadata with error handling
101
  try:
102
  metadata = {
103
  "id": design_id,
 
104
  "url": design_url,
105
  "css_url": css_url
106
  }