Spaces:

brestok
/

staffily

Running

App Files Files Community

staffily / migrations /upload_data.py

brestok

Add data migration script for platform pricing updates

7214a34 about 1 month ago

raw

history blame contribute delete

2.81 kB

	import asyncio
	import json
	from cbh.core.config import settings
	from langchain_core.prompts import ChatPromptTemplate
	from cbh.api.platforms.models import PlatformModel

	# Prompt template used as the single "system" message in upload_data().
	# "{raw_data}" is its only template variable; upload_data() fills it with the
	# JSON-serialized raw tool record.
	# NOTE(review): the field names and integer enum mappings listed below
	# presumably mirror PlatformModel -- verify against the model before editing.
	system_prompt = """You are a data structuring assistant. Your task is to convert raw AI tool data into a strictly typed structured format.

	Given the raw tool data below, return a single JSON object matching the PlatformModel schema exactly.

	Rules:
	- "name": extract the tool name as-is.
	- "category": map to an integer enum:
	1=Web apps/SaaS/MVP, 2=Websites/Landing pages, 3=Mobile apps, 4=UI/UX Design,
	5=AI Coding tools, 6=Automation/AI agents, 7=Video/Creative, 8=SEO/GEO,
	9=Growth/Social/Reddit, 10=Research/Analytics.
	- "subcategory": use the subcategory string as-is.
	- "oneLinePos": use the "One-line positioning" value.
	- "description": use the "Detailed description" value.
	- "userQueries": split "User query covered" into a list of distinct user intents/queries. If there is only one, return a single-element list.
	- "idealCases": use the "Best if client wants" value.
	- "personas": split "Recommended persona" by commas into a list of individual personas.
	- "level": map skill level to an integer enum: 1=Low, 2=Low-to-Medium, 3=Medium, 4=Medium-to-High, 5=High.
	- "toolType": map to an integer enum: 1=No-code, 2=Hybrid, 3=Dev.
	- "focus": map "Platform focus" to a list of integer enums: 1=Web, 2=Mobile, 3=Desktop, 4=Multi-platform, 5=Mobile design, 6=Developer workflow, 7=Desktop/Multi-platform dev.
	- "productStage": split "Best product stage" by commas into a list.
	- "keyStrengths": split "Key strengths" by semicolons into a list. Trim whitespace.
	- "caveats": split "Main caveats" by semicolons into a list. If only one caveat, return a single-element list. Trim whitespace.
	- "monetizationPriority": map to an integer enum: 1=Low, 2=Medium, 3=High.
	- "website": use the "Website" URL as-is.
	- "internalNotes": use the "Internal notes" value.

	Raw tool data:
	{raw_data}"""

	async def upload_data(item: dict) -> PlatformModel:
	    """Convert one raw tool record into structured platform data via the LLM.

	    The record is serialized to JSON and substituted for ``raw_data`` in
	    ``system_prompt``; the resulting prompt is piped into the model returned
	    by ``settings.get_llm``. Despite its name, this function does not write
	    to the database itself -- it only returns the structured result.

	    Args:
	        item: Raw tool data for a single platform.

	    Returns:
	        The chain result. Presumably a ``PlatformModel`` instance, since the
	        model class is passed as ``schema`` -- confirm against
	        ``settings.get_llm``.
	    """
	    prompt = ChatPromptTemplate.from_messages([("system", system_prompt)])
	    # Runnables are composed with the plain ``|`` operator; a backslash
	    # before it would be parsed as a broken line continuation.
	    chain = prompt | settings.get_llm(model="gpt-5.4", schema=PlatformModel)
	    # ensure_ascii=False keeps non-ASCII characters literal in the prompt
	    # instead of expanding them to \uXXXX escapes.
	    raw_data = json.dumps(item, ensure_ascii=False)
	    result = await chain.ainvoke({"raw_data": raw_data})
	    print(f"Processed: {result.name}")
	    return result


	async def main():
	    """Structure every record in ``ai_tools.json`` and insert the results.

	    Records are processed in batches of ``batch_size``: the calls within a
	    batch run concurrently, and the next batch starts only after the current
	    one has finished. All structured results are inserted into the
	    ``platforms`` collection in a single call at the end, so an exception
	    raised while processing any record propagates before anything is written.
	    """
	    # Read as UTF-8 explicitly; without it the decoding depends on the
	    # platform locale and can garble or reject non-ASCII text.
	    with open("ai_tools.json", "r", encoding="utf-8") as f:
	        data = json.load(f)

	    results = []
	    batch_size = 10
	    for start in range(0, len(data), batch_size):
	        batch = data[start:start + batch_size]
	        # Awaiting each gather caps the number of in-flight LLM calls at
	        # batch_size.
	        platforms = await asyncio.gather(*(upload_data(item) for item in batch))
	        results.extend(platforms)

	    if not results:
	        # Nothing to store. NOTE(review): assuming DB_CLIENT is a
	        # PyMongo/Motor database, insert_many rejects an empty document
	        # list, so skip the call instead of failing on an empty input file.
	        return

	    documents = [platform.to_mongo() for platform in results]
	    await settings.DB_CLIENT.platforms.insert_many(documents)


	if __name__ == "__main__":
	    # Run the migration only when executed as a script, not on import.
	    asyncio.run(main())