File size: 1,945 Bytes
ee646ee
 
 
 
 
 
 
 
 
 
ae67217
 
ee646ee
 
 
ae67217
ee646ee
 
 
 
 
 
 
 
 
 
 
 
 
 
f9c2194
 
 
ee646ee
 
 
 
 
 
 
 
 
 
 
 
 
ae67217
ee646ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import shutil

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import snapshot_download
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pydantic import BaseModel, Field

# FastAPI application object; the route decorators below register on it.
app = FastAPI()

# CORS: accept requests from any origin (Claude artifacts included), using
# any method and any header. Credentialed requests are not enabled.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=False,
)

# Local directory of the persisted Chroma store served by this API.
DB_LOCAL_PATH = "./chroma_db"

# When the store is not present yet, download the dataset snapshot from the
# Hugging Face Hub and install it at DB_LOCAL_PATH. HF_REPO_ID and HF_TOKEN
# are read with os.environ[...], so a missing variable raises KeyError here.
if not os.path.exists(DB_LOCAL_PATH):
    print("πŸ”„ HuggingFaceμ—μ„œ DB λ‹€μš΄λ‘œλ“œ 쀑...")
    snapshot_download(
        repo_id=os.environ["HF_REPO_ID"],
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
        local_dir="./hf_data",
    )
    # Use the "chroma_db" subfolder of the snapshot when there is one,
    # otherwise treat the snapshot root itself as the store.
    src = "./hf_data/chroma_db" if os.path.exists("./hf_data/chroma_db") else "./hf_data"
    # Copy into a staging directory and rename it into place only once the
    # copy has finished. Copying straight to DB_LOCAL_PATH would leave a
    # partial directory behind if the copy were interrupted, and the
    # existence check above would then skip the download on the next start.
    staging_dir = DB_LOCAL_PATH + ".partial"
    # Drop leftovers of an earlier attempt that did not complete.
    shutil.rmtree(staging_dir, ignore_errors=True)
    shutil.copytree(src, staging_dir)
    os.rename(staging_dir, DB_LOCAL_PATH)
    print("βœ… DB λ‹€μš΄λ‘œλ“œ μ™„λ£Œ")

# Load the embedding model and open the persisted store with it.
# NOTE(review): presumably this is the same model the store was built with;
# confirm against the indexing pipeline.
print("πŸ”„ μž„λ² λ”© λͺ¨λΈ λ‘œλ”© 쀑...")
embeddings = HuggingFaceEmbeddings(model_name="jhgan/ko-sroberta-multitask")
db = Chroma(persist_directory=DB_LOCAL_PATH, embedding_function=embeddings)
print(f"βœ… DB λ‘œλ“œ μ™„λ£Œ β€” 청크 수: {db._collection.count()}")

class QueryRequest(BaseModel):
    """Request body accepted by ``POST /retrieve``."""

    # Text to run the similarity search for.
    query: str
    # Number of documents to return. Defaults to 5 and must be at least 1, so
    # a zero or negative count is rejected during request validation instead
    # of being passed on to the similarity search.
    k: int = Field(default=5, ge=1)

@app.get("/")
def root():
    """Return an ``ok`` status plus the collection's entry count as ``chunks``."""
    chunk_total = db._collection.count()
    return {"status": "ok", "chunks": chunk_total}

@app.get("/health")
def health():
    """Health probe: ``ok`` status plus the collection's entry count as ``chunks``."""
    return dict(status="ok", chunks=db._collection.count())

def _serialize_hit(doc):
    """Map one search hit to the response entry: text, source and region."""
    # Placeholder returned when a metadata key is absent.
    placeholder = "μ•Œ 수 μ—†μŒ"
    return {
        "text": doc.page_content,
        "source": doc.metadata.get("source", placeholder),
        "region": doc.metadata.get("region", placeholder),
    }


@app.post("/retrieve")
def retrieve(req: QueryRequest):
    """Return the stored documents most similar to ``req.query``.

    Runs a similarity search on the vector store asking for ``req.k`` results
    and returns them under ``documents``, each reduced to its text and its
    ``source`` / ``region`` metadata values.
    """
    hits = db.similarity_search(req.query, k=req.k)
    return {"documents": [_serialize_hit(hit) for hit in hits]}