Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import chromadb | |
| import numpy as np | |
| from fastapi import FastAPI | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from sklearn.decomposition import PCA | |
| from sentence_transformers import SentenceTransformer | |
| import torch | |
| import uvicorn | |
# [Nyang V4 3D Visualizer Server]
# Projects a knowledge space of about 346,000 items into 3D in real time.

# Settings
COLLECTION_NAME = "nyang_ultimate_knowledge"
MODEL_NAME = "nlpai-lab/KURE-v1"
DB_PATH = r"v4_advanced_rag/data/vector_db_v4"
# Greatly enlarged on request; kept bounded with browser limits in mind.
SAMPLE_SIZE = 50000

# FastAPI application with CORS opened to every origin, method and header.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
class NyangVisualizer:
    """Load the embedding model and vector DB and project embeddings into 3D.

    Constructing an instance:

    * loads the ``MODEL_NAME`` SentenceTransformer on CUDA when available
      (switching the model to half precision there), otherwise on CPU;
    * opens the ``COLLECTION_NAME`` collection of the persistent Chroma DB
      located at ``DB_PATH``;
    * fetches up to ``SAMPLE_SIZE`` records with their embeddings, metadata
      and documents;
    * fits a 3-component PCA on those embeddings and stores the projection.

    Raises:
        ValueError: if the collection does not yield a 2-D embedding matrix
            with at least 3 rows and 3 columns (PCA with 3 components cannot
            be fitted on less).
    """

    # Number of PCA output dimensions (x, y, z).
    _N_COMPONENTS = 3

    def __init__(self):
        # NOTE(review): the leading non-ASCII characters in the log strings
        # below look like mis-decoded emoji; they are kept exactly as found.
        # Confirm against the original file.
        print("βοΈ Loading Model and DB for Visualization...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(MODEL_NAME, device=self.device)
        if self.device == "cuda":
            self.model.half()
        self.client = chromadb.PersistentClient(path=DB_PATH)
        self.collection = self.client.get_collection(name=COLLECTION_NAME)

        # 1. Prepare the fixed background data (the "starfield").
        print(f"π Fetching {SAMPLE_SIZE} samples from DB...")
        results = self.collection.get(
            include=['embeddings', 'metadatas', 'documents'],
            limit=SAMPLE_SIZE,
        )
        # Validate before PCA so an empty or tiny collection fails with a
        # clear message instead of an obscure shape error.
        self.base_embeddings = self._as_embedding_matrix(results['embeddings'])
        self.base_metas = results['metadatas']
        self.base_docs = results['documents']

        # 2. Fit the PCA model (the original comment describes 1024D -> 3D).
        print("π§ Fitting PCA Model...")
        self.pca = PCA(n_components=self._N_COMPONENTS)
        self.base_3d = self.pca.fit_transform(self.base_embeddings)
        print("β Visualization Engine Ready!")

    @staticmethod
    def _as_embedding_matrix(raw_embeddings):
        """Convert fetched embeddings to an array and check PCA can use it.

        Args:
            raw_embeddings: the ``embeddings`` value returned by the
                collection (a sequence of vectors, or ``None``).

        Returns:
            A 2-D ``numpy`` array built with ``np.array(raw_embeddings)``.

        Raises:
            ValueError: if the result is not 2-D or has fewer than 3 rows or
                fewer than 3 columns.
        """
        matrix = np.array(raw_embeddings)
        required = NyangVisualizer._N_COMPONENTS
        if matrix.ndim != 2 or min(matrix.shape) < required:
            raise ValueError(
                "Need a 2-D embedding matrix with at least "
                f"{required} rows and {required} columns to fit PCA, "
                f"got shape {matrix.shape}"
            )
        return matrix

    def get_query_3d(self, query_text):
        """Project a query string into the same 3D space as the background.

        The text is prefixed with ``"query: "``, encoded with normalized
        embeddings, and transformed by the PCA fitted in ``__init__``.

        Args:
            query_text: the raw query string.

        Returns:
            The first row of the PCA output, converted with ``.tolist()``.
        """
        prefixed_query = f"query: {query_text}"
        query_vec = self.model.encode([prefixed_query], normalize_embeddings=True)
        query_3d = self.pca.transform(query_vec)
        return query_3d[0].tolist()
# Module-level shared instance. Constructing it runs NyangVisualizer.__init__
# (model load, DB fetch, PCA fit) as soon as this module is executed.
viz = NyangVisualizer()
def _resolve_category(meta):
    """Pick the display category for one record's metadata dict."""
    # NOTE(review): the non-ASCII pieces of the strings below look like
    # mis-decoded Korean text; they are kept exactly as found because the
    # category value is the key used for colouring and filtering.
    # Confirm against the original file.

    # Priority 1: top-level category (most important).
    category = meta.get("category_depth1") or meta.get("main_category")
    if category:
        return category
    # Priority 2: knowledge source (QA data etc.), e.g. "<prefix>_QA".
    source = meta.get("source")
    if source:
        return f"μ§μ_{source}"
    # Priority 3: data type.
    dtype = meta.get("type")
    if dtype:
        return f"νμ _{dtype}"
    # Priority 4: last resort.
    return "κΈ°ν"


def _build_point(coords, meta, doc):
    """Build the JSON-ready dict for one background point."""
    # Records without metadata or document text are treated as empty
    # instead of raising on None.
    meta = meta or {}
    doc = doc or ""
    return {
        "x": float(coords[0]),
        "y": float(coords[1]),
        "z": float(coords[2]),
        "text": doc[:50] + "...",  # text preview
        "title": meta.get("title", "Info"),
        "category": _resolve_category(meta),  # key for colour and filtering
        "type": "database",
    }


def get_data(query: str = ""):
    """Return the background point cloud and, optionally, the query point.

    NOTE(review): no route decorator is visible on this function in this
    view of the file; confirm that it is registered on ``app``.

    Args:
        query: optional query text. When non-empty it is projected with
            ``viz.get_query_3d`` and returned as ``query_point``.

    Returns:
        A dict with ``"points"`` (one dict per background sample with keys
        ``x``, ``y``, ``z``, ``text``, ``title``, ``category``, ``type``)
        and ``"query_point"`` (a dict of the same shape, or ``None`` when
        ``query`` is empty).
    """
    # Background points.
    points = [
        _build_point(coords, meta, doc)
        for coords, meta, doc in zip(viz.base_3d, viz.base_metas, viz.base_docs)
    ]

    # Query point (only when a query was given).
    query_point = None
    if query:
        q_3d = viz.get_query_3d(query)
        query_point = {
            "x": q_3d[0],
            "y": q_3d[1],
            "z": q_3d[2],
            "text": query,
            "title": "Current Query",
            "category": "Query",
            "type": "query",
        }
    return {"points": points, "query_point": query_point}
# Script entry point: serve `app` with uvicorn on host 0.0.0.0, port 8001.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8001)