# Uploaded by leedami: "Deploy from Team Script" (commit 41cc6f7, verified)
import os
import json
import chromadb
import numpy as np
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import torch
import uvicorn
# [Nyang V4 3D Visualizer Server] 🦁🌌
# Projects the knowledge store of about 346,000 entries into 3D in real time.
# HTTP application object. The CORS middleware is registered with wildcard
# origins, methods and headers, so any page may call this API from a browser.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# --- Settings ---------------------------------------------------------------

# Directory of the persistent Chroma vector database (relative path).
DB_PATH = r"v4_advanced_rag/data/vector_db_v4"

# SentenceTransformer model used to embed incoming queries.
MODEL_NAME = "nlpai-lab/KURE-v1"

# Name of the Chroma collection the background points are read from.
COLLECTION_NAME = "nyang_ultimate_knowledge"

# Maximum number of records fetched for the background point cloud.
# Greatly enlarged at the manager's request, with browser limits in mind.
SAMPLE_SIZE = 50_000
class NyangVisualizer:
    """Embedding model, vector DB sample and PCA projection for the 3D view.

    Construction loads the SentenceTransformer model named by ``MODEL_NAME``,
    opens the Chroma collection ``COLLECTION_NAME`` under ``DB_PATH``, fetches
    up to ``SAMPLE_SIZE`` records and fits a 3-component PCA on their
    embeddings. Queries can then be projected into the same 3D space.

    Attributes:
        device: ``"cuda"`` when torch reports a CUDA device, else ``"cpu"``.
        model: The loaded SentenceTransformer (converted to half precision
            on CUDA).
        client: The persistent Chroma client.
        collection: The Chroma collection the sample is read from.
        base_embeddings: Array of the fetched embeddings.
        base_metas: Metadata entries returned alongside the embeddings.
        base_docs: Document texts returned alongside the embeddings.
        pca: The PCA model fitted on ``base_embeddings``.
        base_3d: ``base_embeddings`` projected to three components.
    """

    def __init__(self):
        """Load the model and DB sample, then fit the PCA projection.

        Raises:
            ValueError: If the fetched embeddings are not a 2-D array with at
                least three rows and three columns, which a 3-component PCA
                requires.
        """
        print("βš™οΈ Loading Model and DB for Visualization...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(MODEL_NAME, device=self.device)
        if self.device == "cuda":
            self.model.half()

        self.client = chromadb.PersistentClient(path=DB_PATH)
        self.collection = self.client.get_collection(name=COLLECTION_NAME)

        # 1. Prepare the fixed background data (the "starfield").
        print(f"🌌 Fetching {SAMPLE_SIZE} samples from DB...")
        results = self.collection.get(
            include=['embeddings', 'metadatas', 'documents'],
            limit=SAMPLE_SIZE,
        )
        self.base_embeddings = np.array(results['embeddings'])
        self.base_metas = results['metadatas']
        self.base_docs = results['documents']

        # Fail early with a readable message instead of an opaque error from
        # inside scikit-learn when the collection is empty or too small.
        n_components = 3
        if (
            self.base_embeddings.ndim != 2
            or min(self.base_embeddings.shape) < n_components
        ):
            raise ValueError(
                f"Collection '{COLLECTION_NAME}' returned embeddings with "
                f"shape {self.base_embeddings.shape}; need a 2-D array with "
                f"at least {n_components} rows and columns to fit the PCA."
            )

        # 2. Fit the PCA model (the original comment describes this as
        #    1024D -> 3D).
        print("🧠 Fitting PCA Model...")
        self.pca = PCA(n_components=n_components)
        self.base_3d = self.pca.fit_transform(self.base_embeddings)
        print("βœ… Visualization Engine Ready!")

    def get_query_3d(self, query_text):
        """Project a text query into the fitted 3D space.

        The text is prefixed with ``"query: "``, encoded with normalized
        embeddings, and transformed by the fitted PCA.

        Args:
            query_text: The raw query string.

        Returns:
            The first transformed row converted to a plain Python list.
        """
        prefixed_query = f"query: {query_text}"
        query_vec = self.model.encode([prefixed_query], normalize_embeddings=True)
        query_3d = self.pca.transform(query_vec)
        return query_3d[0].tolist()
# Shared visualizer instance, created when this module is imported: the model
# load, DB fetch and PCA fit in NyangVisualizer.__init__ all run here, before
# any request is served.
viz = NyangVisualizer()
def _resolve_category(meta):
    """Return the category label for one record's metadata dict.

    Tried in priority order: top-level product category
    (``category_depth1``, then ``main_category``), knowledge source
    (``source``), data type (``type``), and finally a catch-all label.
    """
    # Priority 1: top-level product category (most important).
    category = meta.get("category_depth1") or meta.get("main_category")
    if category:
        return category
    # Priority 2: knowledge source (e.g. QA data).
    source = meta.get("source")
    if source:
        return f"지식_{source}"
    # Priority 3: data type.
    dtype = meta.get("type")
    if dtype:
        # NOTE(review): this prefix looks mis-encoded (possibly meant to be
        # the Korean word for "type"); kept byte-for-byte - confirm with the
        # front end before changing it.
        return f"νƒ€μž…_{dtype}"
    # Priority 4: last resort.
    return "기타"


@app.get("/data")
def get_data(query: str = ""):
    """Return the background point cloud and, optionally, the query point.

    Args:
        query: Optional query text. When non-empty it is projected into the
            same 3D space and returned as ``query_point``.

    Returns:
        A dict with ``points`` (one entry per sampled record, carrying
        ``x``/``y``/``z``, a text preview, title, category and type) and
        ``query_point`` (a dict of the same shape, or ``None`` when no query
        was given).
    """
    # Background points.
    points = []
    for coords, meta, doc in zip(viz.base_3d, viz.base_metas, viz.base_docs):
        # Records stored without metadata or document text can come back as
        # None; treat them as empty instead of failing the whole request.
        meta = meta or {}
        preview = (doc or "")[:50] + "..."  # text preview
        points.append({
            "x": float(coords[0]),
            "y": float(coords[1]),
            "z": float(coords[2]),
            "text": preview,
            "title": meta.get("title", "Info"),
            "category": _resolve_category(meta),  # key for color and filtering
            "type": "database",
        })

    # Query point (only when a query was supplied).
    query_point = None
    if query:
        q_3d = viz.get_query_3d(query)
        query_point = {
            "x": q_3d[0], "y": q_3d[1], "z": q_3d[2],
            "text": query, "title": "Current Query", "category": "Query", "type": "query"
        }
    return {"points": points, "query_point": query_point}
if __name__ == "__main__":
    # When run as a script, serve the app on all interfaces at port 8001.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
    )