Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,56 +25,62 @@ class SetupRequest(BaseModel):
|
|
| 25 |
# -------------------------------
|
| 26 |
# Persistent Chroma client
|
| 27 |
# -------------------------------
|
|
|
|
| 28 |
COLLECTION_NAME = "knowledge_base"
|
| 29 |
|
| 30 |
-
# Initialize in-memory client
|
| 31 |
-
chroma_client = chromadb.Client()
|
| 32 |
-
|
| 33 |
-
# Create or get collection
|
| 34 |
-
kb_collection: Collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
|
| 35 |
-
|
| 36 |
-
print("✅ In-memory Knowledge Base initialized successfully.")
|
| 37 |
-
|
| 38 |
# -------------------------------
|
| 39 |
# KB Setup Endpoint
|
| 40 |
# -------------------------------
|
| 41 |
@app.post("/setup")
|
| 42 |
-
|
| 43 |
"""
|
| 44 |
-
|
| 45 |
-
|
| 46 |
"""
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
raise HTTPException(status_code=500, detail="Chroma KB not initialized")
|
| 50 |
|
| 51 |
try:
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
-
raise HTTPException(status_code=
|
| 56 |
-
|
| 57 |
-
if "knowledge_base" not in kb_data:
|
| 58 |
-
raise HTTPException(status_code=400, detail="Invalid KB format, missing 'knowledge_base' key")
|
| 59 |
-
|
| 60 |
-
# Clear existing entries before adding new KB
|
| 61 |
-
kb_collection.delete(where={"id": {"$ne": None}}) # Delete all previous entries
|
| 62 |
-
|
| 63 |
-
# Add KB entries
|
| 64 |
-
for entry in kb_data["knowledge_base"]:
|
| 65 |
-
kb_collection.add(
|
| 66 |
-
documents=[entry["answer"]],
|
| 67 |
-
metadatas=[{
|
| 68 |
-
"id": entry["id"],
|
| 69 |
-
"category": entry.get("category", ""),
|
| 70 |
-
"question_variations": entry.get("question_variations", []),
|
| 71 |
-
"keywords": entry.get("keywords", [])
|
| 72 |
-
}],
|
| 73 |
-
ids=[entry["id"]]
|
| 74 |
-
)
|
| 75 |
-
|
| 76 |
-
kb_collection.persist()
|
| 77 |
-
return {"status": f"KB uploaded and stored successfully, {len(kb_data['knowledge_base'])} entries added"}
|
| 78 |
|
| 79 |
# -------------------------------
|
| 80 |
# Step-by-Step Endpoints
|
|
|
|
| 25 |
# -------------------------------
|
| 26 |
# Persistent Chroma client
|
| 27 |
# -------------------------------
|
| 28 |
+
CHROMA_PATH = "/data/chroma"
|
| 29 |
COLLECTION_NAME = "knowledge_base"
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# -------------------------------
|
| 32 |
# KB Setup Endpoint
|
| 33 |
# -------------------------------
|
| 34 |
@app.post("/setup")
def setup_kb(json_file_path: str):
    """
    Load a JSON file, embed its items, and repopulate the persistent ChromaDB collection.

    Only runs when explicitly called: the encoder and the Chroma client are
    created inside this handler rather than at import time.

    Args:
        json_file_path: Path to a JSON file whose top level is a list of
            objects. For each object, "content" (or "text") and "title" are
            combined into the text that is embedded; the whole object is
            stored as that record's metadata.

    Returns:
        A dict with a "message" string and the final record "count" of the
        collection.

    Raises:
        HTTPException: 404 if the file does not exist, 400 if the JSON is not
            a list of objects, 500 for any other failure during setup.
    """
    # NOTE(review): json_file_path appears to be caller-supplied and is opened
    # as-is; confirm that only trusted clients can reach this endpoint, or
    # restrict it to a known directory.
    if not os.path.exists(json_file_path):
        raise HTTPException(status_code=404, detail=f"File not found: {json_file_path}")

    try:
        # Load JSON data
        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise HTTPException(status_code=400, detail="JSON must contain a list of knowledge items.")
        # item.get() below needs dicts; reject anything else as a client error
        # instead of letting an AttributeError surface as a 500.
        if not all(isinstance(item, dict) for item in data):
            raise HTTPException(status_code=400, detail="Each knowledge item must be a JSON object.")

        print(f"📘 Loaded {len(data)} items from {json_file_path}")

        # Prepare data for embeddings BEFORE touching the store, so a bad
        # input file cannot wipe the existing knowledge base and then fail.
        texts, ids, metadatas = [], [], []
        for i, item in enumerate(data):
            content = item.get("content") or item.get("text") or ""
            title = item.get("title") or f"Document {i+1}"
            texts.append(f"{title}. {content}")
            ids.append(str(i))
            # NOTE(review): items are stored verbatim as metadata; verify that
            # their values are types ChromaDB accepts for metadata.
            metadatas.append(item)

        # Initialize Chroma client (only here)
        chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
        collection = chroma_client.get_or_create_collection(COLLECTION_NAME)

        # Clear existing data so the collection mirrors the file contents
        existing = collection.count()
        if existing > 0:
            print(f"🧹 Clearing {existing} existing records...")
            collection.delete(ids=collection.get()['ids'])

        # An empty list leaves the collection empty; skip loading the model
        # and calling add() with nothing to add.
        if texts:
            encoder = SentenceTransformer("all-MiniLM-L6-v2")

            print("🧠 Generating embeddings...")
            embeddings = encoder.encode(texts, show_progress_bar=True).tolist()

            # Add to ChromaDB
            print("💾 Adding to ChromaDB...")
            collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)

        total = collection.count()
        print(f"✅ Successfully added {total} records to {COLLECTION_NAME}.")

        return {"message": "Knowledge base successfully initialized.", "count": total}

    except HTTPException:
        # Deliberate 4xx responses raised above must not be rewritten as 500s
        # by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Setup failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# -------------------------------
|
| 86 |
# Step-by-Step Endpoints
|