# Croptimize / crop_db / create_crop_db.py
# Last commit by persadian: "Create create_crop_db.py" (90f1840, verified)
import chromadb
from datasets import load_dataset
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
# Open a persistent ChromaDB client rooted at the relative path "crop_db".
client = chromadb.PersistentClient(path="crop_db")

# Documents are embedded with the "all-MiniLM-L6-v2" sentence-transformers model.
embedder = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Reuse the "crop_data" collection if it already exists, otherwise create it.
# The "hnsw:space" metadata key asks Chroma to use cosine distance for the index.
collection_metadata = {"hnsw:space": "cosine"}
collection = client.get_or_create_collection(
    name="crop_data",
    embedding_function=embedder,
    metadata=collection_metadata,
)
# Load the crop optimization dataset via `datasets.load_dataset`.
dataset = load_dataset("DARJYO/sawotiQ29_crop_optimization")

# Build three parallel lists (one entry per row of the "train" split) in the
# shape `collection.add` expects: the text to embed, its metadata, and its id.
documents = []
metadatas = []
ids = []
for idx, item in enumerate(dataset['train']):
    # One human-readable summary sentence per crop record; this is the text
    # that gets embedded. Adjacent f-string literals are concatenated, so the
    # resulting string is a single line.
    doc = (
        f"{item['crop_name']} - {item['region']}. "
        f"Optimal conditions: {item['optimal_temperature']}°C, "
        f"{item['annual_rainfall']}mm rainfall. "
        f"Soil: {item['preferred_soil']}. Yield: {item['average_yield']}"
    )
    documents.append(doc)
    # Metadata stored next to the document: a constant "type" tag plus the
    # record's region and best season.
    metadatas.append({
        "type": "crop",
        "region": item['region'],
        "season": item['best_season'],
    })
    # Ids are the row's position in the split, as a string.
    ids.append(str(idx))
# Add to collection in fixed-size batches rather than one call: Chroma limits
# how many records a single add() may carry, and add() is expected to reject
# empty lists, so slicing keeps the script working for both large and empty
# datasets. The resulting collection contents are the same as a single add().
BATCH_SIZE = 1000
for start in range(0, len(ids), BATCH_SIZE):
    end = start + BATCH_SIZE
    collection.add(
        documents=documents[start:end],
        metadatas=metadatas[start:end],
        ids=ids[start:end],
    )

# Report how many rows of the "train" split were processed.
print(f"Created crop database with {len(dataset['train'])} entries")