"""Build a persistent Chroma collection of crop-optimization records.

Loads the ``DARJYO/sawotiQ29_crop_optimization`` dataset, renders every row of
its ``train`` split as a one-line text document, and stores the documents with
region/season metadata in the ``crop_data`` collection persisted under
``crop_db``.
"""

import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from datasets import load_dataset

# Number of rows sent to Chroma per write call. Writing in slices keeps each
# request small regardless of how large the split is (Chroma enforces a
# backend-dependent maximum batch size per call).
BATCH_SIZE = 1000


def _describe_crop(item):
    """Render one dataset row as the text document that gets embedded."""
    return (
        f"{item['crop_name']} - {item['region']}. "
        f"Optimal conditions: {item['optimal_temperature']}°C, "
        f"{item['annual_rainfall']}mm rainfall. "
        f"Soil: {item['preferred_soil']}. Yield: {item['average_yield']}"
    )


def _crop_metadata(item):
    """Build the metadata dict stored alongside one document."""
    fields = {
        "type": "crop",
        "region": item['region'],
        "season": item['best_season'],
    }
    # Chroma metadata values must be str/int/float/bool; a missing (None)
    # field is left out so one incomplete row cannot fail the whole write.
    return {key: value for key, value in fields.items() if value is not None}


# Persistent on-disk client; the collection embeds documents with a
# sentence-transformers model and uses cosine distance for its HNSW index.
client = chromadb.PersistentClient(path="crop_db")
embedder = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(
    name="crop_data",
    embedding_function=embedder,
    metadata={"hnsw:space": "cosine"},
)

dataset = load_dataset("DARJYO/sawotiQ29_crop_optimization")

# Parallel lists, one entry per row; the id is the row's position in the split.
documents = []
metadatas = []
ids = []
for idx, item in enumerate(dataset['train']):
    documents.append(_describe_crop(item))
    metadatas.append(_crop_metadata(item))
    ids.append(str(idx))

# upsert rather than add: re-running the script against an existing crop_db
# refreshes rows by id instead of colliding with the ids already stored.
# An empty split results in no write calls at all.
for start in range(0, len(ids), BATCH_SIZE):
    stop = start + BATCH_SIZE
    collection.upsert(
        documents=documents[start:stop],
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
    )

print(f"Created crop database with {len(ids)} entries")