File size: 2,402 Bytes
ff72db3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os
import asyncio
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files") -> None:
    """Load a PDF, split it into chunks, embed them and save a FAISS index.

    The stages run in order: load the PDF with ``PyPDFLoader``, split the
    loaded documents with ``RecursiveCharacterTextSplitter`` (chunk size
    3000, overlap 500), build a FAISS store from the chunks using the
    ``paraphrase-multilingual-MiniLM-L12-v2`` sentence-transformers model,
    and save it to ``<upload_directory>/faiss_index``.

    Any exception raised by a stage is caught, reported over ``websocket``
    (when one is given) and stops the pipeline; it is not re-raised. The
    file at ``file_path`` is deleted afterwards whether the pipeline
    succeeded or failed at any stage.

    Args:
        file_path: Path of the PDF to process; removed before returning.
        user_id: Accepted for interface compatibility; not used by this
            function.
        websocket: Optional object exposing an awaitable ``send_text(str)``
            used for progress and error messages. When falsy, no messages
            are sent.
        upload_directory: Directory under which the ``faiss_index`` folder
            is written.

    Returns:
        None, on success and on failure alike.
    """

    async def notify(message):
        # Progress reporting is optional: do nothing when no websocket is given.
        if websocket:
            await websocket.send_text(message)

    # NOTE(review): when ``websocket`` is falsy, stage failures are swallowed
    # without any trace; callers cannot tell success from failure.

    # One try/finally spans every stage so that the uploaded file is removed
    # even when loading or splitting fails and the function returns early.
    try:
        # 1. Load the PDF file.
        try:
            await notify("1. PDF 파일 로드 중...")
            # NOTE(review): load() is a synchronous call inside a coroutine;
            # consider offloading it to an executor if it stalls the loop.
            documents = PyPDFLoader(file_path).load()
            await notify(f"PDF 파일 로드 완료: {len(documents)} 문서")
        except Exception as e:
            await notify(f"PDF 파일 로드 오류: {e}")
            return

        # 2. Split the text into overlapping chunks.
        try:
            await notify("2. 텍스트 분할 중...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = text_splitter.split_documents(documents)
            await notify(f"텍스트 분할 완료: {len(docs)} 청크")
        except Exception as e:
            await notify(f"텍스트 분할 오류: {e}")
            return

        # 3. Create embeddings and vectorize the chunks.
        try:
            await notify("3. 임베딩 생성 및 벡터화 중...")
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
            )
            vectors = FAISS.from_documents(docs, embeddings)

            # 4. Save the vectors.
            # NOTE(review): the index path does not include user_id, so every
            # call writes to the same folder - confirm this is intended.
            db_path = os.path.join(upload_directory, "faiss_index")
            vectors.save_local(db_path)
            await notify(f"FAISS 인덱스 저장 완료: {db_path}")
        except Exception as e:
            await notify(f"벡터화 오류: {e}")
            return
    finally:
        # 5. Delete the uploaded file (best effort; failures are only reported).
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await notify(f"파일 삭제 완료: {file_path}")
        except Exception as e:
            await notify(f"파일 삭제 오류: {e}")
|