# NOTE: Hugging Face Space page residue removed; the Space status page reported "Runtime error".
import io
import math
import os
import time
import traceback
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from huggingface_hub import HfApi, HfFolder
# Configuration: set via Space secrets.
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "your-username/thingiverse-data")
# Token resolved from the local Hugging Face credentials store.
HF_TOKEN = HfFolder.get_token()
def upload_df_to_hf(df: pd.DataFrame, filename: str) -> None:
    """Upload a DataFrame as a CSV file to the HF dataset repo, in memory.

    Serializes with ``DataFrame.to_csv`` (no temp file on disk) and pushes
    the bytes straight to ``HF_REPO_ID``.

    Args:
        df: Frame to serialize.
        filename: Destination path inside the dataset repo.
    """
    # BUG FIX: upload_file accepts a path, raw bytes, or a *binary* file
    # object. The original passed a text-mode StringIO, which the hub client
    # rejects — encode the CSV to UTF-8 bytes instead.
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=io.BytesIO(csv_bytes),
        path_in_repo=filename,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        create_pr=False,
    )
    # BUG FIX: the original message printed the literal "(unknown)" where the
    # uploaded filename belongs; restore the placeholder.
    print(f"✅ Uploaded {filename} to {HF_REPO_ID}")
def safe_get_text(driver, xpath, retries=1, delay=0.5):
    """Return the text of the element at *xpath*, tolerating flaky lookups.

    Retries up to *retries* times, sleeping *delay* seconds after each
    failed attempt; returns "" if the element never resolves.
    """
    attempts_left = retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # Keep the .text access inside the try: a stale element can
            # raise on attribute access as well as on lookup.
            return driver.find_element(By.XPATH, xpath).text
        except (StaleElementReferenceException, NoSuchElementException):
            time.sleep(delay)
    return ""
def initialize_driver():
    """Build a headless Chrome WebDriver from the container's binaries."""
    # Chromedriver is installed at this path by the Dockerfile.
    chrome_service = Service("/usr/local/bin/chromedriver")
    chrome_options = Options()
    # Tell Selenium exactly where Chrome itself lives.
    chrome_options.binary_location = "/usr/bin/google-chrome-stable"
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(flag)
    # Launch Chrome via the Service.
    return webdriver.Chrome(service=chrome_service, options=chrome_options)
def _scrape_thing_page(driver, wait, url):
    """Scrape a single thing detail page; return its record dict, or None to skip."""
    try:
        driver.get(url)
    except Exception:
        # Navigation failure (timeout, connection reset, ...): skip this ID.
        return None
    time.sleep(1)
    # Skip Thingiverse error pages (deleted/private things render this card).
    try:
        driver.find_element(By.XPATH, "//*[contains(@class,'Layout__errorPageCard')]")
        return None
    except NoSuchElementException:
        pass
    # Wait for the title link to confirm the detail page actually rendered.
    try:
        wait.until(EC.presence_of_element_located((
            By.XPATH, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]"
        )))
    except TimeoutException:
        return None
    title = safe_get_text(driver, "//*[contains(@class,'DetailPageTitle__thingTitleName')]")
    author = safe_get_text(driver, "//a[contains(@class,'DetailPageTitle__thingTitleLink')]")
    date_posted = safe_get_text(
        driver,
        "//a[contains(@class,'DetailPageTitle__thingTitleLink')]/following-sibling::div"
    )

    def extract_aria(label):
        # The remix/file/make/comment counters carry aria-labels on the page.
        try:
            return driver.find_element(By.XPATH, f"//*[@aria-label='{label}']").text
        except NoSuchElementException:
            return ""

    remixes = extract_aria("Remixes")
    files = extract_aria("Files")
    makes = extract_aria("Makes")
    comments = extract_aria("Comments")
    tags = []
    try:
        tags_el = driver.find_element(By.XPATH, "//*[contains(@class,'TagList__tagList')]")
        tags = [a.text for a in tags_el.find_elements(By.TAG_NAME, "a")]
    except NoSuchElementException:
        pass
    return {
        "URL": url,
        "Title": title,
        "Author": author,
        "Date": date_posted,
        "Remixes": remixes,
        "Files": files,
        "Makes": makes,
        "Comments": comments,
        "Tags": tags,
    }


def process_batch(start_id, end_id, worker_id):
    """Scrape thing IDs in [start_id, end_id] (descending) and upload CSVs.

    Uploads a checkpoint CSV every 10 scraped records and a final CSV at the
    end of the batch.

    Returns:
        The list of scraped record dicts, or [] on a fatal worker error.
    """
    print(f"Worker {worker_id} processing IDs {start_id} to {end_id}")
    results = []
    try:
        driver = initialize_driver()
        try:
            wait = WebDriverWait(driver, 10)
            total = end_id - start_id + 1
            for count, thing_id in enumerate(range(end_id, start_id - 1, -1), start=1):
                url = f"https://www.thingiverse.com/thing:{thing_id}"
                print(f"[{worker_id}] ({count}/{total}) {url}")
                record = _scrape_thing_page(driver, wait, url)
                if record is None:
                    continue
                results.append(record)
                # Checkpoint every 10 items so progress survives a crash.
                if len(results) % 10 == 0:
                    chk_name = f"worker_{worker_id}_{start_id}_{end_id}_chk_{len(results)}.csv"
                    upload_df_to_hf(pd.DataFrame(results), chk_name)
            # Final batch upload.
            if results:
                final_name = f"worker_{worker_id}_{start_id}_{end_id}_final.csv"
                upload_df_to_hf(pd.DataFrame(results), final_name)
        finally:
            # BUG FIX: the original only quit the driver on the success path,
            # leaking a Chrome process whenever an exception escaped the loop.
            driver.quit()
        return results
    except Exception as e:
        print(f"Worker {worker_id} error: {e}")
        traceback.print_exc()
        return []
def main():
    """Split the thing-ID range across workers, scrape in parallel, upload the merge."""
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Configure your range & parallelism.
    start_thing = 6993281
    end_thing = 7003281
    num_workers = 5
    # Split the work into contiguous, non-overlapping (start, end, worker_id)
    # batches of at most ceil(total / num_workers) IDs each.
    total = end_thing - start_thing + 1
    per = math.ceil(total / num_workers)
    batches = [
        (
            start_thing + i * per,
            min(start_thing + i * per + per - 1, end_thing),
            i + 1,
        )
        for i in range(num_workers)
    ]
    all_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = [pool.submit(process_batch, s, e, wid) for s, e, wid in batches]
        for fut in as_completed(pending):
            all_results.extend(fut.result())
    # Upload one combined file covering the whole range.
    if all_results:
        upload_df_to_hf(
            pd.DataFrame(all_results),
            f"thingiverse_{start_thing}_{end_thing}_all.csv",
        )


if __name__ == "__main__":
    main()