# demo/app/modules/embedding.py
# Repository page header: tekville, "Initial commit" (ff72db3)
import os
import asyncio
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
async def _notify(websocket, message):
    """Send *message* over *websocket* if one was supplied; otherwise do nothing."""
    if websocket:
        await websocket.send_text(message)


def _build_and_save_index(docs, db_path):
    """Embed *docs*, build a FAISS index from them and save it at *db_path*.

    Kept synchronous so the whole step can be handed to an executor thread
    as a single unit of work.
    """
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vectors = FAISS.from_documents(docs, embeddings)
    vectors.save_local(db_path)


async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Turn an uploaded PDF into a FAISS index, reporting progress as it goes.

    Steps: load the PDF, split it into overlapping chunks, embed the chunks
    into a FAISS index saved at ``<upload_directory>/faiss_index``, and
    finally delete the source file.

    Args:
        file_path: Path of the PDF to process. The file is removed before
            this coroutine returns, whether or not processing succeeded.
        user_id: Accepted for the caller's convenience; not used here.
            NOTE(review): the index path does not depend on the user, so
            presumably each call replaces the previous index - confirm.
        websocket: Optional object exposing an awaitable ``send_text(str)``.
            When falsy, no progress or error messages are sent.
        upload_directory: Directory under which ``faiss_index`` is saved.

    Returns:
        None. Failures in a step are reported over ``websocket`` (when
        given) and end processing early instead of raising.

    Raises:
        Whatever ``websocket.send_text`` raises while an error or cleanup
        message is being reported.
    """
    # The loader, splitter and embedding calls are synchronous; running them
    # in the default executor keeps the event loop free for other tasks.
    loop = asyncio.get_running_loop()

    try:
        # 1. Load the PDF file
        try:
            await _notify(websocket, "1. PDF ํŒŒ์ผ ๋กœ๋“œ ์ค‘...")
            loader = PyPDFLoader(file_path)
            documents = await loop.run_in_executor(None, loader.load)
            await _notify(websocket, f"PDF ํŒŒ์ผ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)} ๋ฌธ์„œ")
        except Exception as e:
            await _notify(websocket, f"PDF ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
            return

        # 2. Split the text into chunks
        try:
            await _notify(websocket, "2. ํ…์ŠคํŠธ ๋ถ„ํ•  ์ค‘...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = await loop.run_in_executor(None, text_splitter.split_documents, documents)
            await _notify(websocket, f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์™„๋ฃŒ: {len(docs)} ์ฒญํฌ")
        except Exception as e:
            await _notify(websocket, f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์˜ค๋ฅ˜: {e}")
            return

        # 3. Create the embeddings and vectorize, then 4. save the vectors
        try:
            await _notify(websocket, "3. ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ๋ฐ ๋ฒกํ„ฐํ™” ์ค‘...")
            db_path = os.path.join(upload_directory, "faiss_index")
            await loop.run_in_executor(None, _build_and_save_index, docs, db_path)
            await _notify(websocket, f"FAISS ์ธ๋ฑ์Šค ์ €์žฅ ์™„๋ฃŒ: {db_path}")
        except Exception as e:
            await _notify(websocket, f"๋ฒกํ„ฐํ™” ์˜ค๋ฅ˜: {e}")
            return
    finally:
        # 5. Delete the file. This wraps every step so that a failed load or
        # split no longer leaves the uploaded file behind on disk.
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await _notify(websocket, f"ํŒŒ์ผ ์‚ญ์ œ ์™„๋ฃŒ: {file_path}")
        except Exception as e:
            await _notify(websocket, f"ํŒŒ์ผ ์‚ญ์ œ ์˜ค๋ฅ˜: {e}")