kwmin_probin / core /embedder.py
cksleigen's picture
add files
c2f0e66
# core/embedder.py
"""์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ"""
from openai import OpenAI
from typing import List, Dict
from config.settings import OPENAI_API_KEY, EMBEDDING_MODEL
import time
# Module-level OpenAI client, constructed once at import time with the API key
# loaded from config.settings.
client = OpenAI(api_key=OPENAI_API_KEY)
def embed_chunks(chunks: List[Dict], batch_size: int = 2048) -> List[Dict]:
    """
    Attach an embedding vector to every chunk.

    The ``"text"`` value of each chunk is sent to the embeddings endpoint of
    the module-level ``client`` using ``EMBEDDING_MODEL``, and the returned
    vector is stored under the chunk's ``"embedding"`` key. The chunk dicts
    are modified in place and the same list object is returned.

    Args:
        chunks: Chunk dicts; each one must contain a ``"text"`` key.
        batch_size: Maximum number of texts sent in a single API request.
            Longer lists are split into several requests.

    Returns:
        List[Dict]: The input chunks with embeddings added, e.g.
            [
                {
                    "chunk_id": "...",
                    "text": "...",
                    "embedding": [0.1, 0.2, ...],
                    ...
                },
                ...
            ]

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a response does not contain exactly one embedding
            per submitted text.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Nothing to embed: skip the request entirely instead of sending an
    # empty input list to the API.
    if not chunks:
        return chunks

    print(f"๐Ÿ”ข ์ž„๋ฒ ๋”ฉ ์‹œ์ž‘ ({len(chunks)}๊ฐœ ์ฒญํฌ)")
    start_time = time.time()

    # Batch processing: one request carries many texts, but the number of
    # inputs per request is capped, so long lists go out in slices.
    for offset in range(0, len(chunks), batch_size):
        batch = chunks[offset:offset + batch_size]
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=[chunk["text"] for chunk in batch],
        )

        if len(response.data) != len(batch):
            raise RuntimeError(
                f"expected {len(batch)} embeddings, got {len(response.data)}"
            )

        # Add embeddings, pairing each result with its source chunk through
        # the index reported by the API rather than through list position.
        for item in response.data:
            batch[item.index]["embedding"] = item.embedding

    elapsed = time.time() - start_time
    # A coarse clock can report zero elapsed time; avoid dividing by it.
    rate = len(chunks) / elapsed if elapsed > 0 else float("inf")
    print(f"โœ… ์ž„๋ฒ ๋”ฉ ์™„๋ฃŒ ({elapsed:.2f}์ดˆ)")
    print(f" - ์†๋„: {rate:.2f} chunks/sec")
    return chunks
if __name__ == "__main__":
    # Smoke test: embed a single sample chunk and report the vector length.
    sample_chunk = {"chunk_id": "chunk_0", "text": "ํ…Œ์ŠคํŠธ ๋ฌธ์„œ์ž…๋‹ˆ๋‹ค."}
    first_result = embed_chunks([sample_chunk])[0]
    dimension = len(first_result["embedding"])
    print(f"์ž„๋ฒ ๋”ฉ ์ฐจ์›: {dimension}")