# staffily/migrations/upload_data.py
# Commit 7214a34 (brestok): Add data migration script for platform pricing updates
import asyncio
import json
from cbh.core.config import settings
from langchain_core.prompts import ChatPromptTemplate
from cbh.api.platforms.models import PlatformModel
# System-message template used by upload_data(). It lists, field by field, how
# the raw tool record must be mapped onto the PlatformModel schema (including
# the integer enum codes). The only template variable is {raw_data}, which
# upload_data() fills with the JSON-serialized record.
system_prompt = """You are a data structuring assistant. Your task is to convert raw AI tool data into a strictly typed structured format.
Given the raw tool data below, return a single JSON object matching the PlatformModel schema exactly.
Rules:
- "name": extract the tool name as-is.
- "category": map to an integer enum:
1=Web apps/SaaS/MVP, 2=Websites/Landing pages, 3=Mobile apps, 4=UI/UX Design,
5=AI Coding tools, 6=Automation/AI agents, 7=Video/Creative, 8=SEO/GEO,
9=Growth/Social/Reddit, 10=Research/Analytics.
- "subcategory": use the subcategory string as-is.
- "oneLinePos": use the "One-line positioning" value.
- "description": use the "Detailed description" value.
- "userQueries": split "User query covered" into a list of distinct user intents/queries. If there is only one, return a single-element list.
- "idealCases": use the "Best if client wants" value.
- "personas": split "Recommended persona" by commas into a list of individual personas.
- "level": map skill level to an integer enum: 1=Low, 2=Low-to-Medium, 3=Medium, 4=Medium-to-High, 5=High.
- "toolType": map to an integer enum: 1=No-code, 2=Hybrid, 3=Dev.
- "focus": map "Platform focus" to a list of integer enums: 1=Web, 2=Mobile, 3=Desktop, 4=Multi-platform, 5=Mobile design, 6=Developer workflow, 7=Desktop/Multi-platform dev.
- "productStage": split "Best product stage" by commas into a list.
- "keyStrengths": split "Key strengths" by semicolons into a list. Trim whitespace.
- "caveats": split "Main caveats" by semicolons into a list. If only one caveat, return a single-element list. Trim whitespace.
- "monetizationPriority": map to an integer enum: 1=Low, 2=Medium, 3=High.
- "website": use the "Website" URL as-is.
- "internalNotes": use the "Internal notes" value.
Raw tool data:
{raw_data}"""
async def upload_data(item: dict) -> PlatformModel:
    """Turn one raw tool record into a structured platform object via the LLM.

    Despite the name, this function does not write to any storage: it only
    builds the prompt, runs the chain and hands the result back to the caller.

    Args:
        item: Raw tool record (one element of the input JSON file).

    Returns:
        Whatever the chain produces for this record; presumably a
        ``PlatformModel`` instance, since that class is passed as ``schema``
        to ``settings.get_llm`` (confirm against that helper).
    """
    # A single system message carries both the mapping rules and the data slot.
    template = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
        ]
    )
    structured_llm = settings.get_llm(model="gpt-5.4", schema=PlatformModel)
    pipeline = template | structured_llm

    # ensure_ascii=False keeps non-ASCII characters readable in the prompt.
    serialized_item = json.dumps(item, ensure_ascii=False)
    platform = await pipeline.ainvoke({"raw_data": serialized_item})

    print(f"Processed: {platform.name}")
    return platform
async def main():
    """Load raw tool records, structure them with the LLM, and store them.

    Reads ``ai_tools.json`` from the current working directory, converts the
    records through ``upload_data`` in concurrent batches of ten, and finally
    inserts every converted record into the ``platforms`` collection in one
    ``insert_many`` call.

    If the input file contains no records the function returns early without
    touching the LLM or the database.

    Raises:
        FileNotFoundError: If ``ai_tools.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default, which may not be UTF-8 and would mis-decode non-ASCII text.
    with open("ai_tools.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    # Nothing to convert; also avoids calling insert_many with an empty list,
    # which PyMongo/Motor-style collections reject (assuming DB_CLIENT is one).
    if not data:
        print("No records found in ai_tools.json; nothing to insert.")
        return

    results = []
    batch_size = 10  # upper bound on concurrent LLM requests
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        platforms = await asyncio.gather(*(upload_data(item) for item in batch))
        results.extend(platforms)

    # Single bulk insert after all batches have been converted.
    await settings.DB_CLIENT.platforms.insert_many(
        [platform.to_mongo() for platform in results]
    )
if __name__ == "__main__":
    # Script entry point: drive the async migration to completion.
    migration = main()
    asyncio.run(migration)