{ "cells": [ { "cell_type": "markdown", "id": "1111ea95-d385-49b9-a4d9-ef886ace5c7a", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T11:24:25.566747Z", "iopub.status.busy": "2025-02-06T11:24:25.566066Z", "iopub.status.idle": "2025-02-06T11:24:25.571748Z", "shell.execute_reply": "2025-02-06T11:24:25.571305Z", "shell.execute_reply.started": "2025-02-06T11:24:25.566705Z" } }, "source": [ "# 0 Scraping Metadata and Dataset consolidation\n" ] }, { "cell_type": "markdown", "id": "6632505a-e7ca-4463-9ffc-e36fad42235f", "metadata": {}, "source": [ "## IMAGES\n", "---" ] }, { "cell_type": "markdown", "id": "e3388bac-bb71-40bc-a693-9ac7a2d5f32c", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T10:08:22.229784Z", "iopub.status.busy": "2025-02-06T10:08:22.229287Z", "iopub.status.idle": "2025-02-06T10:08:22.232210Z", "shell.execute_reply": "2025-02-06T10:08:22.231793Z", "shell.execute_reply.started": "2025-02-06T10:08:22.229766Z" } }, "source": [ "### Step 1: Image metadata scraping, sorting and CSV consolidation" ] }, { "cell_type": "code", "execution_count": 1, "id": "f8decb63-43f5-4731-823d-94632eee7618", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:51.115763Z", "iopub.status.busy": "2025-02-08T19:37:51.114573Z", "iopub.status.idle": "2025-02-08T19:37:51.170027Z", "shell.execute_reply": "2025-02-08T19:37:51.169401Z", "shell.execute_reply.started": "2025-02-08T19:37:51.115738Z" } }, "outputs": [], "source": [ "import os\n", "import json\n", "import csv\n", "import requests\n", "from datetime import datetime\n", "import time\n", "from pathlib import Path\n", "import hashlib\n", "import pandas as pd\n", "import sys\n", "from datetime import datetime, timedelta\n", "import shutil" ] }, { "cell_type": "code", "execution_count": 2, "id": "4b2426c3-96a0-468e-b6dc-78dea9c3e92b", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:51.809027Z", "iopub.status.busy": "2025-02-08T19:37:51.808835Z", 
"iopub.status.idle": "2025-02-08T19:37:51.812922Z", "shell.execute_reply": "2025-02-08T19:37:51.812429Z", "shell.execute_reply.started": "2025-02-08T19:37:51.809009Z" } }, "outputs": [], "source": [ "current_dir = Path.cwd()" ] }, { "cell_type": "markdown", "id": "11647bb7-5ce9-414a-8486-5bdce8d9cfea", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T12:49:43.126762Z", "iopub.status.busy": "2025-02-06T12:49:43.125797Z", "iopub.status.idle": "2025-02-06T12:49:43.129759Z", "shell.execute_reply": "2025-02-06T12:49:43.129176Z", "shell.execute_reply.started": "2025-02-06T12:49:43.126736Z" } }, "source": [ "## MODELS" ] }, { "cell_type": "markdown", "id": "5c9cb5e3-7cea-4574-9319-f3cd89354b1f", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T13:23:38.017078Z", "iopub.status.busy": "2025-02-06T13:23:38.016639Z", "iopub.status.idle": "2025-02-06T13:23:38.019993Z", "shell.execute_reply": "2025-02-06T13:23:38.019549Z", "shell.execute_reply.started": "2025-02-06T13:23:38.017053Z" } }, "source": [ "### Step 1: Scrape model metadata" ] }, { "cell_type": "markdown", "id": "83e12b9f-5ae1-407d-9754-5979d837f787", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T14:06:04.857874Z", "iopub.status.busy": "2025-02-06T14:06:04.857500Z", "iopub.status.idle": "2025-02-06T14:06:04.860438Z", "shell.execute_reply": "2025-02-06T14:06:04.860030Z", "shell.execute_reply.started": "2025-02-06T14:06:04.857856Z" } }, "source": [ "#### the resulting files will appear in data/raw/model_metadata as *.json" ] }, { "cell_type": "code", "execution_count": 12, "id": "5db9c00e", "metadata": {}, "outputs": [], "source": [ "key_karussell = current_dir.parent / 'misc/credentials/civitai_api_keys.txt'\n", "directory_path = current_dir.parent / 'data/raw/model_metadata/'" ] }, { "cell_type": "code", "execution_count": 13, "id": "41ee14e0-fb78-4f91-aba1-13faf05af7d8", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:56.687832Z", 
"iopub.status.busy": "2025-02-08T19:37:56.687251Z", "iopub.status.idle": "2025-02-08T19:37:56.696572Z", "shell.execute_reply": "2025-02-08T19:37:56.696059Z", "shell.execute_reply.started": "2025-02-08T19:37:56.687809Z" } }, "outputs": [], "source": [ "import datetime\n", "\n", "def load_api_keys():\n", " \"\"\"Load API keys from a text file, one per line.\"\"\"\n", " if not os.path.exists(key_karussell):\n", " raise FileNotFoundError(f\"API key file '{API_KEYS_FILE}' not found!\")\n", " \n", " with open(key_karussell, 'r') as file:\n", " keys = [line.strip() for line in file if line.strip()]\n", " \n", " if not keys:\n", " raise ValueError(\"No API keys found in the file!\")\n", " \n", " return keys\n", "\n", "def get_model_metadata():\n", " base_url = \"https://civitai.com/api/v1/models\"\n", " params = {\"sort\": \"Newest\", \"nsfw\": True}\n", "\n", " # Load API keys\n", " api_keys = load_api_keys()\n", " key_index = 0 # Start with the first key\n", "\n", " page_counter = 0\n", " max_pages = 300000000 # Adjust as needed\n", " os.makedirs(directory_path, exist_ok=True)\n", "\n", " while True:\n", " if page_counter >= max_pages:\n", " print(f\"Reached the limit of {max_pages} pages.\")\n", " break\n", "\n", " headers = {\n", " \"Accept\": \"application/json\",\n", " \"Authorization\": f\"Bearer {api_keys[key_index]}\"\n", " }\n", "\n", " response = requests.get(base_url, headers=headers, params=params)\n", "\n", " if response.status_code == 200:\n", " data = response.json()\n", " page_counter += 1\n", "\n", " # Add timestamp\n", " formatted_timestamp = datetime.datetime.now().strftime(\"data obtained on the %d.%m.%Y at %H:%M CEST\")\n", "\n", " data['timestamp'] = formatted_timestamp\n", "\n", " # Save data to file\n", " file_path = os.path.join(directory_path, f'newest_models_{page_counter}.json')\n", " with open(file_path, 'w', encoding='utf-8') as file:\n", " json.dump(data, file, indent=4)\n", "\n", " # Check for nextCursor\n", " next_cursor = 
data.get('metadata', {}).get('nextCursor')\n", " if not next_cursor:\n", " print(\"No more data available.\")\n", " break\n", " else:\n", " params['cursor'] = next_cursor\n", " \n", " elif response.status_code in (401, 403): # Unauthorized or Forbidden\n", " print(f\"API Key {key_index + 1} failed with status {response.status_code}. Trying next key...\")\n", " key_index += 1\n", "\n", " if key_index >= len(api_keys):\n", " print(\"All API keys failed. Exiting.\")\n", " break # Stop if all keys fail\n", " \n", " else:\n", " print(f\"Failed to fetch data: HTTP {response.status_code}\")\n", " break # Stop on other errors\n" ] }, { "cell_type": "markdown", "id": "8f43ce23-5986-4b67-be2d-453831af9a6e", "metadata": {}, "source": [ "uncomment this to get model metadata" ] }, { "cell_type": "code", "execution_count": 14, "id": "3f95b4ba-5742-4268-b2e8-de9145faf495", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:58.117386Z", "iopub.status.busy": "2025-02-08T19:37:58.117158Z", "iopub.status.idle": "2025-02-08T19:37:58.121162Z", "shell.execute_reply": "2025-02-08T19:37:58.120580Z", "shell.execute_reply.started": "2025-02-08T19:37:58.117369Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Failed to fetch data: HTTP 500\n" ] } ], "source": [ "get_model_metadata()" ] }, { "cell_type": "markdown", "id": "1ed5f44e-612e-4b6d-8974-904fb3e058d6", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T12:52:40.162173Z", "iopub.status.busy": "2025-02-06T12:52:40.159989Z", "iopub.status.idle": "2025-02-06T12:52:40.170634Z", "shell.execute_reply": "2025-02-06T12:52:40.169945Z", "shell.execute_reply.started": "2025-02-06T12:52:40.162124Z" } }, "source": [ "### Step 2 Consolidate Model-dataset CSV" ] }, { "cell_type": "code", "execution_count": 8, "id": "464d82f5-c24e-4b53-9b50-682fa4bf3430", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:59.052245Z", "iopub.status.busy": "2025-02-08T19:37:59.051645Z", 
"iopub.status.idle": "2025-02-08T19:37:59.056017Z", "shell.execute_reply": "2025-02-08T19:37:59.055598Z", "shell.execute_reply.started": "2025-02-08T19:37:59.052224Z" } }, "outputs": [], "source": [ "## path thingy\n", "try: #scripts\n", " current_dir = Path(__file__).resolve().parent\n", "except NameError:\n", " # jupyter\n", " current_dir = Path.cwd()" ] }, { "cell_type": "code", "execution_count": null, "id": "14f3db4d-ef93-4629-8a27-971adb49248b", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:37:59.443457Z", "iopub.status.busy": "2025-02-08T19:37:59.443236Z", "iopub.status.idle": "2025-02-08T19:37:59.890103Z", "shell.execute_reply": "2025-02-08T19:37:59.889571Z", "shell.execute_reply.started": "2025-02-08T19:37:59.443439Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_2.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_1.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_4.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_6.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_7.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_5.json\n", "Processing file: /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/raw/model_metadata/newest_models_3.json\n" ] } ], "source": [ "import os\n", "import json\n", "import pandas as pd\n", "import hashlib\n", "from pathlib import Path\n", 
"from datetime import datetime, timezone\n", "\n", "\n", "\n", "\n", "def hash_username(username):\n", " return hashlib.sha256(username.encode('utf-8')).hexdigest()[:16]\n", "\n", "def parse_date(date_str):\n", " try:\n", " return datetime.fromisoformat(date_str.replace('Z', '+00:00'))\n", " except Exception:\n", " return datetime.min.replace(tzinfo=timezone.utc) # Make it timezone-aware\n", "\n", "def get_latest_model_version(model_versions):\n", " return max(model_versions, key=lambda mv: parse_date(mv.get('publishedAt', '')))\n", "\n", "def process_directory_recursively(root_dir):\n", " root_path = Path(root_dir)\n", " seen = {} # id -> (publishedAt, record)\n", " data_records = []\n", "\n", " for json_file in root_path.rglob('*.json'):\n", " if not json_file.is_file():\n", " continue\n", "\n", " #print(f\"Processing file: {json_file}\")\n", " try:\n", " with open(json_file, 'r', encoding='utf-8') as f:\n", " data = json.load(f)\n", " except Exception as e:\n", " print(f\"Failed to load {json_file}: {e}\")\n", " continue\n", "\n", " items = data.get('items') or data.get('data') or []\n", " for item in items:\n", " if not isinstance(item, dict):\n", " continue\n", "\n", " model_id = item.get('id')\n", " model_versions = item.get('modelVersions', [])\n", " if not model_versions:\n", " continue\n", "\n", " latest_version = get_latest_model_version(model_versions)\n", " published_at = latest_version.get('publishedAt', '')\n", " current_dt = parse_date(published_at)\n", "\n", " if model_id in seen and current_dt <= seen[model_id][0]:\n", " continue\n", " seen[model_id] = (current_dt, item)\n", "\n", " for model_id, (_, item) in seen.items():\n", " model_versions = item.get('modelVersions', [])\n", " latest_version = get_latest_model_version(model_versions)\n", " version_ids = [mv.get('id', '') for mv in model_versions[:20]]\n", "\n", " files = latest_version.get('files', [])\n", " auto_hashes = files[0].get('hashes', {}) if files else {}\n", " images = 
latest_version.get('images', [])\n", " first_image_url = images[0]['url'] if images else ''\n", " latest_image_url = images[-1]['url'] if images else ''\n", "\n", " username = item.get('creator', {}).get('username', '')\n", " record = {\n", " 'id': item.get('id', ''),\n", " 'name': item.get('name', ''),\n", " 'type': item.get('type', ''),\n", " 'baseModel': latest_version.get('baseModel', ''),\n", " 'downloadCount': item.get('stats', {}).get('downloadCount', 0),\n", " 'nsfwLevel': item.get('nsfwLevel', 0),\n", " 'modelVersions': len(model_versions),\n", " 'publishedAt': latest_version.get('publishedAt', ''),\n", " 'usernameHash': hash_username(username) if username else '',\n", " 'downloadUrl': latest_version.get('downloadUrl', ''),\n", " 'firstImageUrl': first_image_url,\n", " 'latestImageUrl': latest_image_url,\n", " 'poi': item.get('poi', False),\n", " 'AutoV1': auto_hashes.get('AutoV1', ''),\n", " 'AutoV2': auto_hashes.get('AutoV2', ''),\n", " 'AutoV3': auto_hashes.get('AutoV3', ''),\n", " 'SHA256': auto_hashes.get('SHA256', ''),\n", " 'CRC32': auto_hashes.get('CRC32', ''),\n", " 'BLAKE3': auto_hashes.get('BLAKE3', ''),\n", " 'previewImage': latest_image_url\n", " }\n", "\n", " for i in range(20):\n", " record[f'version_id_{i+1}'] = version_ids[i] if i < len(version_ids) else ''\n", "\n", " tags = item.get('tags', [])\n", " for i in range(7):\n", " record[f'tag_{i+1}'] = tags[i] if i < len(tags) else ''\n", "\n", " data_records.append(record)\n", "\n", " return pd.DataFrame(data_records)\n", "\n", "# Usage Example\n", "root_directory = current_dir.parent / 'data/raw/model_metadata/'\n", "df = process_directory_recursively(root_directory)\n", "df_sorted = df.sort_values(by='downloadCount', ascending=False)\n", "\n", "# Optionally save\n", "# df_sorted.to_csv('combined_metadata.csv', index=False)\n" ] }, { "cell_type": "markdown", "id": "2d6f822f-754f-4feb-9ed6-6f868421c454", "metadata": {}, "source": [ "### Save model-data to CSV" ] }, { "cell_type": "code", 
"execution_count": null, "id": "f861cf7b-5eb3-46ad-ab2d-9e2fdf2169b4", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:38:02.047421Z", "iopub.status.busy": "2025-02-08T19:38:02.046761Z", "iopub.status.idle": "2025-02-08T19:38:02.193381Z", "shell.execute_reply": "2025-02-08T19:38:02.192886Z", "shell.execute_reply.started": "2025-02-08T19:38:02.047397Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data has been saved to /shares/weddigen.ki.uzh/laura_wagner/Civitai_page_analysis/Civitai_visualizations/data/CSV/Civiverse-Models.csv\n" ] } ], "source": [ "output_csv = current_dir.parent / 'data/CSV/Civiverse-Models_2025.csv'\n", "output_csv.parent.mkdir(parents=True, exist_ok=True)\n", "df_sorted.to_csv(output_csv, index=False)\n", "print(f\"Data has been saved to {output_csv}\")" ] }, { "cell_type": "markdown", "id": "9b948933-2a2f-42b0-9083-2d81586ae3f2", "metadata": { "execution": { "iopub.execute_input": "2025-02-06T13:42:32.447860Z", "iopub.status.busy": "2025-02-06T13:42:32.446832Z", "iopub.status.idle": "2025-02-06T13:42:32.453827Z", "shell.execute_reply": "2025-02-06T13:42:32.453271Z", "shell.execute_reply.started": "2025-02-06T13:42:32.447821Z" } }, "source": [ "### Step 3 Create Subsets: Checkpoint only, POI True, POI False" ] }, { "cell_type": "code", "execution_count": 11, "id": "ae894e26-4984-40e6-80a6-54fb6b61c873", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:38:03.089160Z", "iopub.status.busy": "2025-02-08T19:38:03.088490Z", "iopub.status.idle": "2025-02-08T19:38:03.093067Z", "shell.execute_reply": "2025-02-08T19:38:03.092634Z", "shell.execute_reply.started": "2025-02-08T19:38:03.089138Z" } }, "outputs": [], "source": [ "file_path = current_dir.parent / 'data/CSV/Civiverse-Models.csv' # Update this with your actual file path\n", "(current_dir.parent / 'data/CSV/model_subsets').mkdir(parents=True, exist_ok=True)\n" ] }, { "cell_type": "code", "execution_count": 13, "id": 
"88438f88-c723-423e-a4e5-e07f59096b72", "metadata": { "execution": { "iopub.execute_input": "2025-02-08T19:41:39.279495Z", "iopub.status.busy": "2025-02-08T19:41:39.279039Z", "iopub.status.idle": "2025-02-08T19:41:39.674445Z", "shell.execute_reply": "2025-02-08T19:41:39.673981Z", "shell.execute_reply.started": "2025-02-08T19:41:39.279476Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Files saved successfully!\n" ] } ], "source": [ "import pandas as pd\n", "\n", "# Load the dataset\n", "\n", "data = pd.read_csv(file_path)\n", "\n", "# Version 1: Only 'poi' true models\n", "poi_true_models = data[data['poi'] == True]\n", "\n", "# Version 2: Only types lora, dora, locon, textual inversion\n", "specific_types = ['LORA', 'DORA', 'LOCON', 'textualInversion']\n", "adapters = data[data['type'].isin(specific_types)]\n", "\n", "# Version 3: Only type checkpoint\n", "checkpoint_models = data[data['type'] == 'Checkpoint']\n", "\n", "# Version 4: All models apart from 'poi' true\n", "non_poi_models = data[data['poi'] != True]\n", "\n", "# Version 5: All models apart from 'poi' true and with nsfwLevel below 13\n", "non_poi_low_nsfw_models = data[(data['poi'] != True) & (data['nsfwLevel'] < 13)]\n", "\n", "# Save the versions as separate CSV files\n", "poi_true_models.to_csv(current_dir.parent / 'data/CSV/model_subsets/Civiverse_adapters_poi_true.csv', index=False)\n", "adapters.to_csv(current_dir.parent / 'data/CSV/adapters.csv', index=False)\n", "checkpoint_models.to_csv(current_dir.parent / 'data/CSV/model_subsets/Civiverse_checkpoint_only.csv', index=False)\n", "non_poi_models.to_csv(current_dir.parent / 'data/CSV/model_subsets/Civiverse_adapters_poi_false.csv', index=False)\n", "\n", "print(\"Files saved successfully!\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "49e35088-8b2e-4189-83e1-3098d55dcad2", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "\n", "# Load the dataset\n", "df = 
import pandas as pd
import os

def _filter_by_poi(df, poi_value):
    """Return only the rows of df whose 'poi' column equals poi_value."""
    return df[df['poi'] == poi_value]

def save_poi_subset(poi_value, output_name):
    """Load the tagged model table, filter it on the poi flag, and write
    the subset to data/model_subsets/<output_name>.

    Args:
        poi_value: boolean value the 'poi' column must equal.
        output_name: filename for the resulting CSV.
    """
    df = pd.read_csv('data/all_models_with_tags.csv')
    os.makedirs('data/model_subsets', exist_ok=True)
    _filter_by_poi(df, poi_value).to_csv(f'data/model_subsets/{output_name}', index=False)

# Replaces two copy-pasted cells; the poi==False copy's comment wrongly
# claimed it filtered "poi is True".
save_poi_subset(True, 'all_models_poi.csv')
save_poi_subset(False, 'all_models_poi_false.csv')