| import asyncio |
| import json |
| from cbh.core.config import settings |
| from langchain_core.prompts import ChatPromptTemplate |
| from cbh.api.platforms.models import PlatformModel |
|
|
# System-message template used by upload_data to turn one raw tool record into
# the fields of PlatformModel. `{raw_data}` is the only template variable and
# is filled with the JSON-serialized record. ChatPromptTemplate treats single
# braces as variables, so any literal brace added here must be doubled.
# NOTE(review): the integer enum mappings below presumably mirror the enums
# declared on PlatformModel — keep them in sync (verify against
# cbh.api.platforms.models).
system_prompt = """You are a data structuring assistant. Your task is to convert raw AI tool data into a strictly typed structured format.

Given the raw tool data below, return a single JSON object matching the PlatformModel schema exactly.

Rules:
- "name": extract the tool name as-is.
- "category": map to an integer enum:
1=Web apps/SaaS/MVP, 2=Websites/Landing pages, 3=Mobile apps, 4=UI/UX Design,
5=AI Coding tools, 6=Automation/AI agents, 7=Video/Creative, 8=SEO/GEO,
9=Growth/Social/Reddit, 10=Research/Analytics.
- "subcategory": use the subcategory string as-is.
- "oneLinePos": use the "One-line positioning" value.
- "description": use the "Detailed description" value.
- "userQueries": split "User query covered" into a list of distinct user intents/queries. If there is only one, return a single-element list.
- "idealCases": use the "Best if client wants" value.
- "personas": split "Recommended persona" by commas into a list of individual personas.
- "level": map skill level to an integer enum: 1=Low, 2=Low-to-Medium, 3=Medium, 4=Medium-to-High, 5=High.
- "toolType": map to an integer enum: 1=No-code, 2=Hybrid, 3=Dev.
- "focus": map "Platform focus" to a list of integer enums: 1=Web, 2=Mobile, 3=Desktop, 4=Multi-platform, 5=Mobile design, 6=Developer workflow, 7=Desktop/Multi-platform dev.
- "productStage": split "Best product stage" by commas into a list.
- "keyStrengths": split "Key strengths" by semicolons into a list. Trim whitespace.
- "caveats": split "Main caveats" by semicolons into a list. If only one caveat, return a single-element list. Trim whitespace.
- "monetizationPriority": map to an integer enum: 1=Low, 2=Medium, 3=High.
- "website": use the "Website" URL as-is.
- "internalNotes": use the "Internal notes" value.

Raw tool data:
{raw_data}"""
|
|
async def upload_data(item: dict) -> PlatformModel:
    """Structure one raw tool record through the LLM chain.

    A system-only chat prompt is built from ``system_prompt`` and piped into
    the model returned by ``settings.get_llm``, with ``PlatformModel`` passed
    as the schema (presumably to request structured output — confirm in
    ``settings.get_llm``). The record is JSON-serialized with non-ASCII
    characters kept verbatim and substituted for ``raw_data``.

    Args:
        item: Raw tool data for a single tool.

    Returns:
        The chain's result for this record; its ``name`` is printed as a
        progress line before returning.
    """
    template = ChatPromptTemplate.from_messages([("system", system_prompt)])
    llm = settings.get_llm(model="gpt-5.4", schema=PlatformModel)
    pipeline = template | llm

    payload = {"raw_data": json.dumps(item, ensure_ascii=False)}
    platform = await pipeline.ainvoke(payload)

    print(f"Processed: {platform.name}")
    return platform
|
|
|
|
async def main():
    """Load ``ai_tools.json``, structure every record, and store the results.

    Records are processed in consecutive batches of ``batch_size``; the
    ``upload_data`` calls of one batch run concurrently via
    ``asyncio.gather`` and batches run one after another. Once all batches
    have finished, the results are written to the ``platforms`` collection
    with a single ``insert_many``. An exception from any ``upload_data``
    call propagates before the insert, so nothing is written in that case.

    Raises:
        FileNotFoundError: If ``ai_tools.json`` does not exist in the
            current working directory.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8; be explicit instead of relying on the locale's default
    # encoding, which would mis-decode non-ASCII text on some platforms.
    with open("ai_tools.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    batch_size = 10  # upper bound on concurrent LLM calls
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        platforms = await asyncio.gather(*[upload_data(item) for item in batch])
        results.extend(platforms)

    if not results:
        # PyMongo-style insert_many rejects an empty document list, so an
        # empty input file is treated as a no-op rather than a crash.
        print("No tools found in ai_tools.json; nothing to insert.")
        return

    documents = [platform.to_mongo() for platform in results]
    await settings.DB_CLIENT.platforms.insert_many(documents)
|
|
|
|
# Run the import pipeline only when this file is executed as a script, so
# importing the module does not trigger LLM calls or database writes.
if __name__ == "__main__":
    asyncio.run(main())
|
|