Spaces:

mametarow
/

booth-pic-api

Running

booth-pic-api / backend / scripts / seed_data.py

github-actions

Deploy to HF (clean history with LFS)

a06f06c 19 days ago

5.12 kB

	import asyncio
	import os
	import uuid
	from prisma import Prisma
	from qdrant_client import QdrantClient
	from qdrant_client.http.models import Distance, VectorParams, PointStruct
	from PIL import Image
	import requests
	import io
	import torch
	from transformers import CLIPProcessor, CLIPModel

	# Runtime configuration
	# Qdrant server address; overridable through the QDRANT_URL environment variable.
	QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
	# Name of the Qdrant collection that holds the item vectors.
	COLLECTION_NAME = "booth_items"
	# Browser-like User-Agent header for outgoing HTTP requests.
	HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/91.0.4472.124 Safari/537.36"
	    ),
	}

	# Hard-coded sample listings used as seed input.
	# Each entry carries: title, price, shopName, boothUrl, thumbnailUrl.
	SAMPLE_ITEMS = [
	    {
	        "title": "幽狐族の娘「桔梗」専用【3D衣装モデル】Royal Dress",
	        "price": 2000,
	        "shopName": "Mame-Shop",
	        "boothUrl": "https://booth.pm/ja/items/1234567",
	        "thumbnailUrl": "https://images.booth.pm/c/cc495213-9799-4d69-90bc-2c70034a7429/18a29a43-6c7e-4b72-9e8d-8a5840d892d1/thumbnail_600x600.png",
	    },
	    {
	        "title": "【萌専用】ゴスロリメイド服",
	        "price": 1800,
	        "shopName": "Alice-Atelier",
	        "boothUrl": "https://booth.pm/ja/items/2345678",
	        "thumbnailUrl": "https://images.booth.pm/c/7951d3b4-4b52-4e8a-8a58-8a8b1c1d1e1f/1a2b3c4d-5e6f-7a8b-9c0d-1e1f2a3b4c5d/thumbnail_600x600.png",
	    },
	]

	def _download_image(url: str):
	    """Fetch *url* over HTTP and return the decoded picture as an RGB PIL image.

	    The request sends ``HEADERS`` and times out after 10 seconds.
	    ``raise_for_status`` turns HTTP error responses into exceptions, which
	    propagate to the caller together with any decoding error from PIL.
	    """
	    response = requests.get(url, headers=HEADERS, timeout=10)
	    response.raise_for_status()
	    return Image.open(io.BytesIO(response.content)).convert("RGB")


	def _embed_image(image, model, processor, device):
	    """Return the L2-normalised CLIP image embedding of *image* as a list of floats.

	    Args:
	        image: PIL image to embed.
	        model: loaded ``CLIPModel`` already moved to *device*.
	        processor: matching ``CLIPProcessor``.
	        device: torch device string the inputs are moved to.
	    """
	    inputs = processor(images=image, return_tensors="pt").to(device)
	    with torch.no_grad():
	        outputs = model.get_image_features(**inputs)

	    # Accept either an output object exposing ``image_embeds`` or a bare
	    # tensor, so the script tolerates both return shapes.
	    features = outputs.image_embeds if hasattr(outputs, "image_embeds") else outputs

	    # Unit-normalise along the feature axis, then take the single batch row.
	    features = features / features.norm(p=2, dim=-1, keepdim=True)
	    return features.cpu().numpy()[0].tolist()


	async def seed() -> None:
	    """Seed the database and the local Qdrant collection from ``SAMPLE_ITEMS``.

	    For every sample item the thumbnail is downloaded and embedded with CLIP
	    (``openai/clip-vit-base-patch32``), the shop is upserted and a product row
	    is created through Prisma, and one point (random UUID id, item fields as
	    payload) is upserted into the ``COLLECTION_NAME`` collection of the
	    on-disk Qdrant store at ``qdrant_local``. The collection is created with
	    512-dimensional cosine vectors when it does not exist yet.

	    A failure while processing one item is printed and the loop moves on to
	    the next item. Failures during setup (model loading, Qdrant access)
	    propagate to the caller; the Prisma connection is closed in every case.

	    NOTE(review): each run creates new product rows and new point ids, so
	    running the script repeatedly looks like it will duplicate data - confirm
	    against the Prisma schema before re-running on a populated database.
	    """
	    prisma = Prisma()
	    await prisma.connect()

	    try:
	        # Local mode: no server needed
	        qdrant = QdrantClient(path="qdrant_local")

	        # Initialize CLIP model for embedding generation
	        print("Loading CLIP model...")
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
	        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

	        # Ensure Qdrant collection
	        print(f"Ensuring Qdrant collection: {COLLECTION_NAME}")
	        collections = qdrant.get_collections()
	        if not any(c.name == COLLECTION_NAME for c in collections.collections):
	            qdrant.create_collection(
	                collection_name=COLLECTION_NAME,
	                vectors_config=VectorParams(size=512, distance=Distance.COSINE),
	            )

	        for item in SAMPLE_ITEMS:
	            print(f"Processing: {item['title']}")

	            try:
	                # 1. Download image and generate embedding
	                image = _download_image(item['thumbnailUrl'])
	                vector = _embed_image(image, model, processor, device)

	                # 2. Save to PostgreSQL via Prisma
	                # First, ensure shop exists (keyed by its derived shop URL)
	                shop_url = f"https://{item['shopName'].lower()}.booth.pm"
	                shop = await prisma.shop.upsert(
	                    where={'url': shop_url},
	                    data={
	                        'create': {
	                            'name': item['shopName'],
	                            'url': shop_url,
	                        },
	                        'update': {'name': item['shopName']},
	                    },
	                )

	                # Create product
	                product = await prisma.product.create(
	                    data={
	                        'shopId': shop.id,
	                        'title': item['title'],
	                        'price': item['price'],
	                        'thumbnailUrl': item['thumbnailUrl'],
	                    }
	                )

	                # 3. Save to Qdrant
	                # NOTE: if this upsert fails, the product row created above
	                # stays in the database without a matching vector.
	                vector_id = str(uuid.uuid4())
	                qdrant.upsert(
	                    collection_name=COLLECTION_NAME,
	                    points=[
	                        PointStruct(
	                            id=vector_id,
	                            vector=vector,
	                            payload={
	                                "productId": product.id,
	                                "title": item['title'],
	                                "price": item['price'],
	                                "shopName": item['shopName'],
	                                "boothUrl": item['boothUrl'],
	                                "thumbnailUrl": item['thumbnailUrl'],
	                            },
	                        )
	                    ],
	                )

	                # Link vectorId back to DB image if we were storing images specifically
	                # For MVP, we use the vector payload for display

	                print(f"Successfully seeded: {item['title']}")

	            except Exception as e:
	                # Deliberate best-effort: report the failure and keep seeding
	                # the remaining items.
	                print(f"Error seeding {item['title']}: {e}")
	    finally:
	        # Always release the database connection, including when setup
	        # (model load, Qdrant access) raises before or outside the item loop.
	        await prisma.disconnect()

	    print("Seeding complete.")

	# Script entry point: drive the seeding coroutine to completion when the
	# file is executed directly rather than imported.
	if __name__ == "__main__":
	    asyncio.run(seed())