"""Build a persistent ChromaDB collection from a crop optimization dataset.

Loads the ``train`` split of ``DARJYO/sawotiQ29_crop_optimization``, renders
each row as a short text document, and stores the documents with their
metadata in the ``crop_data`` collection of the on-disk database ``crop_db``.
"""

import chromadb
from datasets import load_dataset
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Number of records sent to Chroma per write call. Chroma enforces a maximum
# batch size, so the data is written in fixed-size chunks rather than one call.
BATCH_SIZE = 1000

# Initialize ChromaDB
client = chromadb.PersistentClient(path="crop_db")
embedder = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Create collection with metadata
collection = client.get_or_create_collection(
    name="crop_data",
    embedding_function=embedder,
    metadata={"hnsw:space": "cosine"},
)

# Load crop optimization dataset
dataset = load_dataset("DARJYO/sawotiQ29_crop_optimization")

# Prepare documents and metadata
documents = []
metadatas = []
ids = []

for idx, item in enumerate(dataset['train']):
    doc = (
        f"{item['crop_name']} - {item['region']}. "
        f"Optimal conditions: {item['optimal_temperature']}°C, "
        f"{item['annual_rainfall']}mm rainfall. "
        f"Soil: {item['preferred_soil']}. "
        f"Yield: {item['average_yield']}"
    )
    documents.append(doc)
    # NOTE(review): Chroma metadata values must be str/int/float/bool —
    # confirm the dataset has no missing 'region' / 'best_season' values.
    metadatas.append({
        "type": "crop",
        "region": item['region'],
        "season": item['best_season'],
    })
    # The row position is the record id, so a re-run targets the same records.
    ids.append(str(idx))

# Add to collection.
# upsert (not add) keeps the script re-runnable against the persistent
# database: existing ids are updated instead of being rejected or skipped.
# An empty dataset simply results in zero write calls.
for start in range(0, len(ids), BATCH_SIZE):
    end = start + BATCH_SIZE
    collection.upsert(
        documents=documents[start:end],
        metadatas=metadatas[start:end],
        ids=ids[start:end],
    )

print(f"Created crop database with {len(dataset['train'])} entries")