Spaces:
Sleeping
Sleeping
File size: 6,115 Bytes
7cae5d6 8cc9517 7cae5d6 8cc9517 7cae5d6 719390c 8cc9517 3d40769 719390c 8cc9517 3d40769 8cc9517 719390c 8cc9517 3d40769 719390c 3d40769 719390c 8cc9517 3d40769 719390c 8cc9517 7cae5d6 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 7cae5d6 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c 8cc9517 719390c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import os
import sqlite3
import json
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType
import google.generativeai as genai
from tools.tool_registry import get_all_tools
# --- Embedding model and vector-store settings ---
# Model name handed to genai.embed_content when tool descriptions are embedded.
EMBEDDING_MODEL_NAME = "gemini-embedding-exp-03-07"
# Dimension declared for the Milvus "embedding" vector field; the vectors
# returned for EMBEDDING_MODEL_NAME are inserted into that field.
EMBEDDING_DIM = 3072
# Name of the Milvus collection that stores one vector per tool.
MILVUS_COLLECTION_NAME = "tool_embeddings"

# --- On-disk persistence locations ---
# All persistent state lives in a "data" directory located one level above
# the directory that contains this module.
DATA_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "data")
)
# SQLite file holding tool metadata (name, description, parameters).
SQLITE_DB_PATH = os.path.join(DATA_DIR, "tools.metadata.db")
# Milvus Lite database file holding the tool embeddings.
MILVUS_DATA_PATH = os.path.join(DATA_DIR, "milvus_lite.db")
def initialize_system():
    """Bootstrap the tool store and return what the application needs.

    Steps, in order:

    1. Make sure ``DATA_DIR`` exists.
    2. Create the SQLite ``tools`` table if needed and sync the tool
       definitions returned by ``get_all_tools()`` into it.
    3. Rebuild the Milvus collection and populate it with embeddings read
       back from SQLite (so step 2 must run first).
    4. Build a ``DirectToolRecommender`` on top of both stores.

    The function is meant to be safe to call repeatedly: directory and table
    creation tolerate existing objects and the Milvus collection is rebuilt.

    Returns:
        A ``(tool_definitions, tool_recommender)`` tuple.
    """
    print("--- Starting System Initialization (Final Version) ---")
    os.makedirs(DATA_DIR, exist_ok=True)

    # Step 1: relational metadata. SQLite has to be populated before the
    # vector store, because the embedding sync reads descriptions from it.
    _init_sqlite_db()
    tool_definitions = get_all_tools()
    _sync_tools_to_sqlite(tool_definitions)

    # Step 2: vector store, fed from the SQLite rows written above.
    vector_client = _init_milvus_and_sync_embeddings()

    # Step 3: recommender. Imported here rather than at module level --
    # presumably to avoid a circular import; confirm before moving it.
    from core.tool_recommender import DirectToolRecommender

    recommender = DirectToolRecommender(
        milvus_client=vector_client,
        sqlite_db_path=SQLITE_DB_PATH,
    )

    print("--- System Initialization Complete ---")
    return tool_definitions, recommender
def _init_sqlite_db():
    """Create the SQLite ``tools`` table at ``SQLITE_DB_PATH`` if it is missing.

    The table has an auto-incrementing integer ``id`` primary key, a unique
    ``name``, and non-null ``description`` and ``parameters`` text columns.
    Calling this on an already initialized database is a no-op.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    print(f"SQLite DB Path: {SQLITE_DB_PATH}")
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        # ``with conn`` only scopes a transaction (commit on success,
        # rollback on error); it does not close the connection.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS tools (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT UNIQUE NOT NULL,
                    description TEXT NOT NULL,
                    parameters TEXT NOT NULL
                )
                """
            )
    finally:
        # Release the file handle explicitly instead of waiting for GC.
        conn.close()
    print("SQLite DB table verified.")
def _sync_tools_to_sqlite(tools_definitions):
    """Bring the SQLite ``tools`` table in line with the given definitions.

    For each tool, a row keyed by ``tool.name`` is inserted when missing, or
    updated in place when its stored description or parameters differ from
    the definition. Updating in place keeps the row ``id`` stable, so ids
    used elsewhere (for example as Milvus primary keys) remain valid.
    Rows for tools that are no longer in ``tools_definitions`` are left
    untouched.

    Args:
        tools_definitions: Iterable of objects exposing ``name``,
            ``description`` and a JSON-serializable ``args`` attribute.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    print("Syncing tool metadata to SQLite...")
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        # One transaction for the whole sync: committed on success, rolled
        # back if any statement raises.
        with conn:
            for tool in tools_definitions:
                parameters_json = json.dumps(tool.args)
                stored = conn.execute(
                    "SELECT description, parameters FROM tools WHERE name = ?",
                    (tool.name,),
                ).fetchone()
                if stored is None:
                    conn.execute(
                        "INSERT INTO tools (name, description, parameters) VALUES (?, ?, ?)",
                        (tool.name, tool.description, parameters_json),
                    )
                    print(f" - Added new tool to SQLite: {tool.name}")
                elif stored != (tool.description, parameters_json):
                    # The definition changed since the last run; refresh the
                    # row so downstream embeddings use the current text.
                    conn.execute(
                        "UPDATE tools SET description = ?, parameters = ? WHERE name = ?",
                        (tool.description, parameters_json, tool.name),
                    )
                    print(f" - Updated tool in SQLite: {tool.name}")
    finally:
        # ``with conn`` does not close the connection; do it explicitly.
        conn.close()
    print("SQLite sync complete.")
def _init_milvus_and_sync_embeddings():
    """Open Milvus Lite, rebuild the tool-embedding collection, and fill it.

    Any existing collection named ``MILVUS_COLLECTION_NAME`` is dropped and
    recreated with an INT64 primary key ``id`` and a FLOAT_VECTOR field
    ``embedding`` of ``EMBEDDING_DIM`` dimensions, indexed with AUTOINDEX
    using the L2 metric. Tool embeddings are then synced into it and the
    collection is loaded.

    Returns:
        The ``MilvusClient`` opened on ``MILVUS_DATA_PATH``.
    """
    print(f"Milvus Lite Data Path: {MILVUS_DATA_PATH}")
    vector_store = MilvusClient(uri=MILVUS_DATA_PATH)

    # Start from a clean collection on every startup so the vector dimension
    # and the stored data always match the current configuration.
    stale_collection_exists = vector_store.has_collection(
        collection_name=MILVUS_COLLECTION_NAME
    )
    if stale_collection_exists:
        vector_store.drop_collection(collection_name=MILVUS_COLLECTION_NAME)
        print("Found old Milvus collection. Dropped it to rebuild.")

    print(
        f"Creating Milvus collection '{MILVUS_COLLECTION_NAME}' with dimension {EMBEDDING_DIM}..."
    )
    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
    vector_field = FieldSchema(
        name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM
    )
    vector_store.create_collection(
        collection_name=MILVUS_COLLECTION_NAME,
        schema=CollectionSchema([id_field, vector_field]),
    )

    # The index is defined separately from the collection and attached to
    # the vector field.
    embedding_index = vector_store.prepare_index_params()
    embedding_index.add_index(
        field_name="embedding", index_type="AUTOINDEX", metric_type="L2"
    )
    vector_store.create_index(
        collection_name=MILVUS_COLLECTION_NAME, index_params=embedding_index
    )
    print("Milvus collection and index created successfully.")

    # Populate the fresh collection first, then load it.
    _sync_tool_embeddings_to_milvus(vector_store)
    vector_store.load_collection(collection_name=MILVUS_COLLECTION_NAME)
    return vector_store
def _sync_tool_embeddings_to_milvus(milvus_client):
    """Embed every tool description stored in SQLite and insert it into Milvus.

    Rows are read from the ``tools`` table, their descriptions are embedded
    in a single ``genai.embed_content`` call, and each vector is inserted
    into ``MILVUS_COLLECTION_NAME`` under the tool's SQLite ``id``.

    The function is best-effort: it prints an error and returns without
    inserting anything when ``GEMINI_API_KEY`` is unset, when SQLite holds no
    tools, or when the number of returned embeddings does not match the
    number of tools.

    Args:
        milvus_client: Client exposing ``insert(collection_name=..., data=...)``.

    Returns:
        None.
    """
    print("Syncing tool embeddings to Milvus...")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("Error: GEMINI_API_KEY not found.")
        return
    genai.configure(api_key=api_key)

    # Read everything up front and close the connection before the network
    # call; a sqlite3 connection used as a context manager is not closed
    # automatically, so it is closed explicitly here.
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        all_tools_in_db = conn.execute(
            "SELECT id, description FROM tools"
        ).fetchall()
    finally:
        conn.close()

    if not all_tools_in_db:
        print("Error: No tools found in SQLite to sync.")
        return

    print(f"Found {len(all_tools_in_db)} tools from SQLite, generating embeddings...")
    tool_ids_to_insert = [tool_id for tool_id, _ in all_tools_in_db]
    docs_to_embed = [description for _, description in all_tools_in_db]

    print(f"Using embedding model: {EMBEDDING_MODEL_NAME}")
    result = genai.embed_content(
        model=EMBEDDING_MODEL_NAME,
        content=docs_to_embed,
        task_type="retrieval_document",
    )
    embeddings = result["embedding"]

    # zip() would silently drop trailing items on a length mismatch, leaving
    # the collection partially populated without any indication. Refuse to
    # insert instead.
    if len(embeddings) != len(tool_ids_to_insert):
        print(
            f"Error: Expected {len(tool_ids_to_insert)} embeddings "
            f"but received {len(embeddings)}; nothing inserted."
        )
        return

    data_to_insert = [
        {"id": tool_id, "embedding": embedding}
        for tool_id, embedding in zip(tool_ids_to_insert, embeddings)
    ]
    milvus_client.insert(collection_name=MILVUS_COLLECTION_NAME, data=data_to_insert)
    print(f"Successfully inserted {len(data_to_insert)} new embeddings into Milvus.")
|