Create create_crop_db.py
Browse files- crop_db/create_crop_db.py +42 -0
crop_db/create_crop_db.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import chromadb
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
| 4 |
+
|
| 5 |
+
# Open (or create) the persistent Chroma store under "crop_db".
# NOTE(review): the path is relative, so it presumably resolves against the
# current working directory rather than this script's location -- confirm.
client = chromadb.PersistentClient(path="crop_db")

# Sentence-transformer embedding function attached to the collection below.
embedder = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Reuse the "crop_data" collection when it already exists, otherwise create
# it. The "hnsw:space" metadata entry selects cosine distance for the index.
collection = client.get_or_create_collection(
    name="crop_data",
    embedding_function=embedder,
    metadata={"hnsw:space": "cosine"},
)
|
| 15 |
+
|
| 16 |
+
# Fetch the crop optimization dataset via `load_dataset`.
dataset = load_dataset("DARJYO/sawotiQ29_crop_optimization")

# Three parallel lists: position i in each one describes the same dataset row.
documents = []
metadatas = []
ids = []

for row_number, record in enumerate(dataset['train']):
    # Free-text summary of the row; this is the text stored as the document.
    summary = (
        f"{record['crop_name']} - {record['region']}. "
        f"Optimal conditions: {record['optimal_temperature']}°C, "
        f"{record['annual_rainfall']}mm rainfall. "
        f"Soil: {record['preferred_soil']}. "
        f"Yield: {record['average_yield']}"
    )
    documents.append(summary)

    # Structured fields kept next to the document as its metadata.
    row_metadata = {
        "type": "crop",
        "region": record['region'],
        "season": record['best_season'],
    }
    metadatas.append(row_metadata)

    # The row's position in the split, as a string, is its record ID.
    ids.append(str(row_number))
|
| 34 |
+
|
| 35 |
+
# Write the prepared records to the collection.
#
# Two deliberate choices:
#   * `upsert` rather than `add`: the collection is opened with
#     get_or_create_collection, so the script may run against a store that
#     already holds these IDs. `add` ignores IDs that already exist, which
#     would leave stale rows in place; `upsert` overwrites them.
#   * Fixed-size chunks: Chroma rejects a single call that exceeds its
#     maximum batch size, so the records are sent in slices. An empty dataset
#     results in no calls at all instead of an error on empty lists.
BATCH_SIZE = 1000  # conservative; intended to stay below Chroma's per-call limit

for start in range(0, len(ids), BATCH_SIZE):
    stop = start + BATCH_SIZE
    collection.upsert(
        documents=documents[start:stop],
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
    )

print(f"Created crop database with {len(dataset['train'])} entries")
|