Spaces:
Runtime error
Runtime error
Commit
·
5896930
1
Parent(s):
1ef09d2
task: updates scraper for async and better screenshots
Browse files- .gitignore +2 -1
- data_collection/notebook.ipynb +75 -17
.gitignore
CHANGED
|
@@ -23,4 +23,5 @@ dist/
|
|
| 23 |
|
| 24 |
.chainlit/cache
|
| 25 |
|
| 26 |
-
data_collection/
|
|
|
|
|
|
| 23 |
|
| 24 |
.chainlit/cache
|
| 25 |
|
| 26 |
+
data_collection/scraped_designs
|
| 27 |
+
data_collection/analyses
|
data_collection/notebook.ipynb
CHANGED
|
@@ -13,36 +13,94 @@
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
-
"execution_count":
|
| 17 |
"metadata": {},
|
| 18 |
"outputs": [
|
| 19 |
{
|
| 20 |
"name": "stdout",
|
| 21 |
"output_type": "stream",
|
| 22 |
"text": [
|
| 23 |
-
"Testing scraper with design
|
| 24 |
-
"
|
| 25 |
-
"Success!\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
]
|
| 27 |
}
|
| 28 |
],
|
| 29 |
"source": [
|
| 30 |
-
"from scraper import scrape_design\n",
|
| 31 |
"import asyncio\n",
|
| 32 |
"\n",
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"\n",
|
| 45 |
-
"
|
|
|
|
|
|
|
| 46 |
]
|
| 47 |
},
|
| 48 |
{
|
|
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
+
"execution_count": null,
|
| 17 |
"metadata": {},
|
| 18 |
"outputs": [
|
| 19 |
{
|
| 20 |
"name": "stdout",
|
| 21 |
"output_type": "stream",
|
| 22 |
"text": [
|
| 23 |
+
"Testing scraper with design 001...\n",
|
| 24 |
+
"001: Response status: 200\n",
|
| 25 |
+
"Success!\n",
|
| 26 |
+
"Testing scraper with design 002...\n",
|
| 27 |
+
"002: Response status: 200\n",
|
| 28 |
+
"Success!\n",
|
| 29 |
+
"Testing scraper with design 003...\n",
|
| 30 |
+
"003: Response status: 200\n"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"ename": "CancelledError",
|
| 35 |
+
"evalue": "",
|
| 36 |
+
"output_type": "error",
|
| 37 |
+
"traceback": [
|
| 38 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 39 |
+
"\u001b[0;31mCancelledError\u001b[0m Traceback (most recent call last)",
|
| 40 |
+
"Cell \u001b[0;32mIn[1], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m test_scraper(test_set)\n",
|
| 41 |
+
"Cell \u001b[0;32mIn[1], line 11\u001b[0m, in \u001b[0;36mtest_scraper\u001b[0;34m(ids)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTesting scraper with design \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_design_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m scrape_design(test_design_id)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccess!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
| 42 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/data_collection/scraper.py:90\u001b[0m, in \u001b[0;36mscrape_design\u001b[0;34m(design_id)\u001b[0m\n\u001b[1;32m 88\u001b[0m save_css(css_url, directory)\n\u001b[1;32m 89\u001b[0m save_metadata(metadata, directory)\n\u001b[0;32m---> 90\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m take_screenshot(design_url, directory)\n",
|
| 43 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/data_collection/scraper.py:52\u001b[0m, in \u001b[0;36mtake_screenshot\u001b[0;34m(url, directory)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mwait_for_load_state(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnetworkidle\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# Add a significant delay to ensure background images are loaded\u001b[39;00m\n\u001b[0;32m---> 52\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mwait_for_timeout(\u001b[38;5;241m2000\u001b[39m)\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# Get full height\u001b[39;00m\n\u001b[1;32m 55\u001b[0m height \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m page\u001b[38;5;241m.\u001b[39mevaluate(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdocument.body.scrollHeight\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
| 44 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/async_api/_generated.py:11401\u001b[0m, in \u001b[0;36mPage.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 11379\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 11380\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Page.wait_for_timeout\u001b[39;00m\n\u001b[1;32m 11381\u001b[0m \n\u001b[1;32m 11382\u001b[0m \u001b[38;5;124;03m Waits for the given `timeout` in milliseconds.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11397\u001b[0m \u001b[38;5;124;03m A timeout to wait for\u001b[39;00m\n\u001b[1;32m 11398\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 11400\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m mapping\u001b[38;5;241m.\u001b[39mfrom_maybe_impl(\n\u001b[0;32m> 11401\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_impl_obj\u001b[38;5;241m.\u001b[39mwait_for_timeout(timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 11402\u001b[0m )\n",
|
| 45 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_page.py:1073\u001b[0m, in \u001b[0;36mPage.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1073\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_main_frame\u001b[38;5;241m.\u001b[39mwait_for_timeout(timeout)\n",
|
| 46 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_frame.py:756\u001b[0m, in \u001b[0;36mFrame.wait_for_timeout\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwait_for_timeout\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 756\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_channel\u001b[38;5;241m.\u001b[39msend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwaitForTimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m, locals_to_params(\u001b[38;5;28mlocals\u001b[39m()))\n",
|
| 47 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:61\u001b[0m, in \u001b[0;36mChannel.send\u001b[0;34m(self, method, params)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msend\u001b[39m(\u001b[38;5;28mself\u001b[39m, method: \u001b[38;5;28mstr\u001b[39m, params: Dict \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m---> 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39mwrap_api_call(\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mlambda\u001b[39;00m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_send(method, params, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_internal_type,\n\u001b[1;32m 64\u001b[0m )\n",
|
| 48 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:526\u001b[0m, in \u001b[0;36mConnection.wrap_api_call\u001b[0;34m(self, cb, is_internal)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_api_zone\u001b[38;5;241m.\u001b[39mset(parsed_st)\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 526\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m cb()\n\u001b[1;32m 527\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m rewrite_error(error, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparsed_st[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mapiName\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00merror\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 49 |
+
"File \u001b[0;32m~/Desktop/Projects/ai-maker-space/code/ImagineUI/.venv/lib/python3.11/site-packages/playwright/_impl/_connection.py:92\u001b[0m, in \u001b[0;36mChannel._inner_send\u001b[0;34m(self, method, params, return_as_dict)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\n\u001b[1;32m 89\u001b[0m callback \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_send_message_to_server(\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_object, method, _filter_none(params)\n\u001b[1;32m 91\u001b[0m )\n\u001b[0;32m---> 92\u001b[0m done, _ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mwait(\n\u001b[1;32m 93\u001b[0m {\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39m_transport\u001b[38;5;241m.\u001b[39mon_error_future,\n\u001b[1;32m 95\u001b[0m callback\u001b[38;5;241m.\u001b[39mfuture,\n\u001b[1;32m 96\u001b[0m },\n\u001b[1;32m 97\u001b[0m return_when\u001b[38;5;241m=\u001b[39masyncio\u001b[38;5;241m.\u001b[39mFIRST_COMPLETED,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callback\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mdone():\n\u001b[1;32m 100\u001b[0m callback\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mcancel()\n",
|
| 50 |
+
"File \u001b[0;32m~/.local/share/uv/python/cpython-3.11.11-macos-x86_64-none/lib/python3.11/asyncio/tasks.py:428\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(fs, timeout, return_when)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPassing coroutines is forbidden, use tasks explicitly.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 427\u001b[0m loop \u001b[38;5;241m=\u001b[39m events\u001b[38;5;241m.\u001b[39mget_running_loop()\n\u001b[0;32m--> 428\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _wait(fs, timeout, return_when, loop)\n",
|
| 51 |
+
"File \u001b[0;32m~/.local/share/uv/python/cpython-3.11.11-macos-x86_64-none/lib/python3.11/asyncio/tasks.py:535\u001b[0m, in \u001b[0;36m_wait\u001b[0;34m(fs, timeout, return_when, loop)\u001b[0m\n\u001b[1;32m 532\u001b[0m f\u001b[38;5;241m.\u001b[39madd_done_callback(_on_completion)\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 535\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m waiter\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout_handle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
| 52 |
+
"\u001b[0;31mCancelledError\u001b[0m: "
|
| 53 |
]
|
| 54 |
}
|
| 55 |
],
|
| 56 |
"source": [
|
| 57 |
+
"from data_collection.scraper import scrape_design\n",
|
| 58 |
"import asyncio\n",
|
| 59 |
"\n",
|
| 60 |
+
"async def test_scraper(ids, batch_size=5):\n",
|
| 61 |
+
" \"\"\"\n",
|
| 62 |
+
" Asynchronously scrape designs in batches to avoid overwhelming resources.\n",
|
| 63 |
+
" \n",
|
| 64 |
+
" Args:\n",
|
| 65 |
+
" ids (list): List of design IDs to scrape\n",
|
| 66 |
+
" batch_size (int): Number of designs to process concurrently\n",
|
| 67 |
+
" \"\"\"\n",
|
| 68 |
+
" print(f\"Starting scrape of {len(ids)} designs...\")\n",
|
| 69 |
+
" \n",
|
| 70 |
+
" successful = 0\n",
|
| 71 |
+
" failed = 0\n",
|
| 72 |
+
" \n",
|
| 73 |
+
" # Process in batches\n",
|
| 74 |
+
" for i in range(0, len(ids), batch_size):\n",
|
| 75 |
+
" batch = ids[i:i + batch_size]\n",
|
| 76 |
+
" print(f\"\\nProcessing batch {i//batch_size + 1} ({len(batch)} designs)...\")\n",
|
| 77 |
+
" \n",
|
| 78 |
+
" # Create tasks for current batch\n",
|
| 79 |
+
" tasks = [scrape_design(design_id) for design_id in batch]\n",
|
| 80 |
+
" \n",
|
| 81 |
+
" # Run batch tasks concurrently\n",
|
| 82 |
+
" results = await asyncio.gather(*tasks, return_exceptions=True)\n",
|
| 83 |
+
" \n",
|
| 84 |
+
" # Process batch results\n",
|
| 85 |
+
" for design_id, result in zip(batch, results):\n",
|
| 86 |
+
" if isinstance(result, Exception):\n",
|
| 87 |
+
" print(f\"Error scraping design {design_id}: {str(result)}\")\n",
|
| 88 |
+
" failed += 1\n",
|
| 89 |
+
" else:\n",
|
| 90 |
+
" print(f\"Successfully scraped design {design_id}\")\n",
|
| 91 |
+
" successful += 1\n",
|
| 92 |
+
" \n",
|
| 93 |
+
" # Optional: Add delay between batches\n",
|
| 94 |
+
" # await asyncio.sleep(1)\n",
|
| 95 |
+
" \n",
|
| 96 |
+
" print(f\"\\nScraping complete:\")\n",
|
| 97 |
+
" print(f\"Successful: {successful}\")\n",
|
| 98 |
+
" print(f\"Failed: {failed}\")\n",
|
| 99 |
+
" print(f\"Total: {len(ids)}\")\n",
|
| 100 |
"\n",
|
| 101 |
+
"# Example usage with batch processing:\n",
|
| 102 |
+
"test_set = [f\"{i:03d}\" for i in range(1, 222)]\n",
|
| 103 |
+
"await test_scraper(test_set, batch_size=5)"
|
| 104 |
]
|
| 105 |
},
|
| 106 |
{
|