"""One-off import script: structure raw AI tool records with an LLM and store them.

Reads a JSON file of raw tool records, asks the LLM to convert each record
into a ``PlatformModel``, and bulk-inserts the results into the ``platforms``
collection exposed by ``settings.DB_CLIENT``.
"""

import asyncio
import json

from cbh.core.config import settings
from langchain_core.prompts import ChatPromptTemplate
from cbh.api.platforms.models import PlatformModel

# NOTE(review): the wording of this prompt is unchanged, but its line breaks
# were reconstructed from a whitespace-collapsed copy of the file -- confirm
# against the original layout if exact whitespace matters.
# ``{raw_data}`` is the only template variable; it is filled in upload_data().
system_prompt = """You are a data structuring assistant. Your task is to convert raw AI tool data into a strictly typed structured format.

Given the raw tool data below, return a single JSON object matching the PlatformModel schema exactly.

Rules:
- "name": extract the tool name as-is.
- "category": map to an integer enum: 1=Web apps/SaaS/MVP, 2=Websites/Landing pages, 3=Mobile apps, 4=UI/UX Design, 5=AI Coding tools, 6=Automation/AI agents, 7=Video/Creative, 8=SEO/GEO, 9=Growth/Social/Reddit, 10=Research/Analytics.
- "subcategory": use the subcategory string as-is.
- "oneLinePos": use the "One-line positioning" value.
- "description": use the "Detailed description" value.
- "userQueries": split "User query covered" into a list of distinct user intents/queries. If there is only one, return a single-element list.
- "idealCases": use the "Best if client wants" value.
- "personas": split "Recommended persona" by commas into a list of individual personas.
- "level": map skill level to an integer enum: 1=Low, 2=Low-to-Medium, 3=Medium, 4=Medium-to-High, 5=High.
- "toolType": map to an integer enum: 1=No-code, 2=Hybrid, 3=Dev.
- "focus": map "Platform focus" to a list of integer enums: 1=Web, 2=Mobile, 3=Desktop, 4=Multi-platform, 5=Mobile design, 6=Developer workflow, 7=Desktop/Multi-platform dev.
- "productStage": split "Best product stage" by commas into a list.
- "keyStrengths": split "Key strengths" by semicolons into a list. Trim whitespace.
- "caveats": split "Main caveats" by semicolons into a list. If only one caveat, return a single-element list. Trim whitespace.
- "monetizationPriority": map to an integer enum: 1=Low, 2=Medium, 3=High.
- "website": use the "Website" URL as-is.
- "internalNotes": use the "Internal notes" value.

Raw tool data: {raw_data}"""


async def upload_data(item: dict) -> PlatformModel:
    """Structure one raw tool record into a ``PlatformModel`` via the LLM.

    Despite its name, this function does not persist anything; the database
    insert happens in ``main``.

    Args:
        item: One raw tool record as loaded from the input JSON file.

    Returns:
        The object produced by the chain. It is annotated as ``PlatformModel``
        and is expected to expose ``name`` (printed here) and ``to_mongo()``
        (used by ``main``).
    """
    prompt = ChatPromptTemplate.from_messages([("system", system_prompt)])
    # presumably get_llm(..., schema=...) returns a runnable bound to
    # structured output for the given schema -- confirm in cbh.core.config.
    chain = prompt | settings.get_llm(model="gpt-5.4", schema=PlatformModel)

    # ensure_ascii=False keeps non-ASCII characters readable in the prompt
    # instead of turning them into \uXXXX escapes.
    raw_data = json.dumps(item, ensure_ascii=False)
    result = await chain.ainvoke({"raw_data": raw_data})
    print(f"Processed: {result.name}")
    return result


async def main(path: str = "ai_tools.json", batch_size: int = 10) -> None:
    """Load raw tool records, structure them in batches, and store them.

    Args:
        path: JSON file containing a list of raw tool records.
        batch_size: Number of records sent to the LLM concurrently.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        # range() would raise on 0 and silently yield nothing on negatives.
        raise ValueError("batch_size must be a positive integer")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        # Records within a batch run concurrently; batches run one after
        # another, which caps the number of in-flight LLM requests.
        platforms = await asyncio.gather(*(upload_data(item) for item in batch))
        results.extend(platforms)

    if not results:
        # insert_many() rejects an empty document list, so stop here.
        print("No platforms to insert.")
        return

    await settings.DB_CLIENT.platforms.insert_many(
        [platform.to_mongo() for platform in results]
    )


if __name__ == "__main__":
    asyncio.run(main())