Technologic101 committed on
Commit
5896930
·
1 Parent(s): 1ef09d2

task: updates scraper for async and better screenshots

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. data_collection/notebook.ipynb +75 -17
.gitignore CHANGED
@@ -23,4 +23,5 @@ dist/
23
 
24
  .chainlit/cache
25
 
26
- data_collection/designs
 
 
23
 
24
  .chainlit/cache
25
 
26
+ data_collection/scraped_designs
27
+ data_collection/analyses
data_collection/notebook.ipynb CHANGED
@@ -13,36 +13,94 @@
13
  },
14
  {
15
  "cell_type": "code",
16
- "execution_count": 5,
17
  "metadata": {},
18
  "outputs": [
19
  {
20
  "name": "stdout",
21
  "output_type": "stream",
22
  "text": [
23
- "Testing scraper with design 112...\n",
24
- "112: Response status: 200\n",
25
- "Success!\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  ]
27
  }
28
  ],
29
  "source": [
30
- "from scraper import scrape_design\n",
31
  "import asyncio\n",
32
  "\n",
33
- "# Pick from a range of 001 to 221\n",
34
- "#test_set = [f\"{i:03d}\" for i in range(1, 21)]\n",
35
- "test_set = [\"112\"]\n",
36
- "async def test_scraper(ids):\n",
37
- " for test_design_id in ids:\n",
38
- " try:\n",
39
- " print(f\"Testing scraper with design {test_design_id}...\")\n",
40
- " await scrape_design(test_design_id)\n",
41
- " print(\"Success!\")\n",
42
- " except Exception as e:\n",
43
- " print(f\"Error: {str(e)}\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "\n",
45
- "await test_scraper(test_set)"
 
 
46
  ]
47
  },
48
  {
 
13
  },
14
  {
15
  "cell_type": "code",
16
+ "execution_count": null,
17
  "metadata": {},
18
  "outputs": [
19
  {
20
  "name": "stdout",
21
  "output_type": "stream",
22
  "text": [
23
+ "Testing scraper with design 001...\n",
24
+ "001: Response status: 200\n",
25
+ "Success!\n",
26
+ "Testing scraper with design 002...\n",
27
+ "002: Response status: 200\n",
28
+ "Success!\n",
29
+ "Testing scraper with design 003...\n",
30
+ "003: Response status: 200\n"
31
+ ]
32
+ },
33
+ {
34
+ "ename": "CancelledError",
35
+ "evalue": "",
36
+ "output_type": "error",
37
+ "traceback": [
38
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
39
+ "\u001b[0;31mCancelledError\u001b[0m Traceback (most recent call last)",
40
+ "Cell \u001b[0;32mIn[1], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m test_scraper(test_set)\n",
41
+ "Cell \u001b[0;32mIn[1], line 11\u001b[0m, in \u001b[0;36mtest_scraper\u001b[0;34m(ids)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTesting scraper with design \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_design_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m scrape_design(test_design_id)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccess!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
42
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/data_collection/scraper.py:90\u001b[0m, in \u001b[0;36mscrape_design\u001b[0;34m(design_id)\u001b[0m\n\u001b[1;32m 88\u001b[0m save_css(css_url, directory)\n\u001b[1;32m 89\u001b[0m save_metadata(metadata, directory)\n\u001b[0;32m---> 90\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m take_screenshot(design_url, directory)\n",
43
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/data_collection/scraper.py:52\u001b[0m, in \u001b[0;36mtake_screenshot\u001b[0;34m(url, directory)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mwait_for_load_state(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnetworkidle\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# Add a significant delay to ensure background images are loaded\u001b[39;00m\n\u001b[0;32m---> 52\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mwait_for_timeout(\u001b[38;5;241m2000\u001b[39m)\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# Get full height\u001b[39;00m\n\u001b[1;32m 55\u001b[0m height \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mevaluate(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdocument.body.scrollHeight\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
44
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/async_api/_generated.py:11401\u001b[0m, in \u001b[0;36mPage.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 11379\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 11380\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Page.wait_for_timeout\u001b[39;00m\n\u001b[1;32m 11381\u001b[0m \n\u001b[1;32m 11382\u001b[0m \u001b[38;5;124;03m Waits for the given `timeout` in milliseconds.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11397\u001b[0m \u001b[38;5;124;03m A timeout to wait for\u001b[39;00m\n\u001b[1;32m 11398\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 11400\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m mapping\u001b[38;5;241m.\u001b[39mfrom_maybe_impl(\n\u001b[0;32m> 11401\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_impl_obj\u001b[38;5;241m.\u001b[39mwait_for_timeout(timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 11402\u001b[0m )\n",
45
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_page.py:1073\u001b[0m, in \u001b[0;36mPage.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1073\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_main_frame\u001b[38;5;241m.\u001b[39mwait_for_timeout(timeout)\n",
46
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_frame.py:756\u001b[0m, in \u001b[0;36mFrame.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 756\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_channel\u001b[38;5;241m.\u001b[39msend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwaitForTimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m, locals_to_params(\u001b[38;5;28mlocals\u001b[39m()))\n",
47
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:61\u001b[0m, in \u001b[0;36mChannel.send\u001b[0;34m(self, method, params)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msend\u001b[39m(\u001b[38;5;28mself\u001b[39m, method: \u001b[38;5;28mstr\u001b[39m, params: Dict \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m---> 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39mwrap_api_call(\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mlambda\u001b[39;00m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_send(method, params, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_internal_type,\n\u001b[1;32m 64\u001b[0m )\n",
48
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:526\u001b[0m, in \u001b[0;36mConnection.wrap_api_call\u001b[0;34m(self, cb, is_internal)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_api_zone\u001b[38;5;241m.\u001b[39mset(parsed_st)\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 526\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m cb()\n\u001b[1;32m 527\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m rewrite_error(error, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparsed_st[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mapiName\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00merror\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n",
49
+ "File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:92\u001b[0m, in \u001b[0;36mChannel._inner_send\u001b[0;34m(self, method, params, return_as_dict)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\n\u001b[1;32m 89\u001b[0m callback \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_send_message_to_server(\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_object, method, _filter_none(params)\n\u001b[1;32m 91\u001b[0m )\n\u001b[0;32m---> 92\u001b[0m done, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mwait(\n\u001b[1;32m 93\u001b[0m {\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_transport\u001b[38;5;241m.\u001b[39mon_error_future,\n\u001b[1;32m 95\u001b[0m callback\u001b[38;5;241m.\u001b[39mfuture,\n\u001b[1;32m 96\u001b[0m },\n\u001b[1;32m 97\u001b[0m return_when\u001b[38;5;241m=\u001b[39masyncio\u001b[38;5;241m.\u001b[39mFIRST_COMPLETED,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callback\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mdone():\n\u001b[1;32m 100\u001b[0m callback\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mcancel()\n",
50
+ "File \u001b[0;32m~/.local/share/uv/python/cpython-3.11.11-macos-x86_64-none/lib/python3.11/asyncio/tasks.py:428\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(fs, timeout, return_when)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPassing coroutines is forbidden, use tasks explicitly.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 427\u001b[0m loop \u001b[38;5;241m=\u001b[39m events\u001b[38;5;241m.\u001b[39mget_running_loop()\n\u001b[0;32m--> 428\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _wait(fs, timeout, return_when, loop)\n",
51
+ "File \u001b[0;32m~/.local/share/uv/python/cpython-3.11.11-macos-x86_64-none/lib/python3.11/asyncio/tasks.py:535\u001b[0m, in \u001b[0;36m_wait\u001b[0;34m(fs, timeout, return_when, loop)\u001b[0m\n\u001b[1;32m 532\u001b[0m f\u001b[38;5;241m.\u001b[39madd_done_callback(_on_completion)\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 535\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m waiter\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout_handle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
52
+ "\u001b[0;31mCancelledError\u001b[0m: "
53
  ]
54
  }
55
  ],
56
  "source": [
57
+ "from data_collection.scraper import scrape_design\n",
58
  "import asyncio\n",
59
  "\n",
60
+ "async def test_scraper(ids, batch_size=5):\n",
61
+ " \"\"\"\n",
62
+ " Asynchronously scrape designs in batches to avoid overwhelming resources.\n",
63
+ " \n",
64
+ " Args:\n",
65
+ " ids (list): List of design IDs to scrape\n",
66
+ " batch_size (int): Number of designs to process concurrently\n",
67
+ " \"\"\"\n",
68
+ " print(f\"Starting scrape of {len(ids)} designs...\")\n",
69
+ " \n",
70
+ " successful = 0\n",
71
+ " failed = 0\n",
72
+ " \n",
73
+ " # Process in batches\n",
74
+ " for i in range(0, len(ids), batch_size):\n",
75
+ " batch = ids[i:i + batch_size]\n",
76
+ " print(f\"\\nProcessing batch {i//batch_size + 1} ({len(batch)} designs)...\")\n",
77
+ " \n",
78
+ " # Create tasks for current batch\n",
79
+ " tasks = [scrape_design(design_id) for design_id in batch]\n",
80
+ " \n",
81
+ " # Run batch tasks concurrently\n",
82
+ " results = await asyncio.gather(*tasks, return_exceptions=True)\n",
83
+ " \n",
84
+ " # Process batch results\n",
85
+ " for design_id, result in zip(batch, results):\n",
86
+ " if isinstance(result, Exception):\n",
87
+ " print(f\"Error scraping design {design_id}: {str(result)}\")\n",
88
+ " failed += 1\n",
89
+ " else:\n",
90
+ " print(f\"Successfully scraped design {design_id}\")\n",
91
+ " successful += 1\n",
92
+ " \n",
93
+ " # Optional: Add delay between batches\n",
94
+ " # await asyncio.sleep(1)\n",
95
+ " \n",
96
+ " print(f\"\\nScraping complete:\")\n",
97
+ " print(f\"Successful: {successful}\")\n",
98
+ " print(f\"Failed: {failed}\")\n",
99
+ " print(f\"Total: {len(ids)}\")\n",
100
  "\n",
101
+ "# Example usage with batch processing:\n",
102
+ "test_set = [f\"{i:03d}\" for i in range(1, 222)]\n",
103
+ "await test_scraper(test_set, batch_size=5)"
104
  ]
105
  },
106
  {