File size: 2,402 Bytes
ff72db3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import asyncio
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

async def _notify(websocket, message):
    """Send ``message`` over ``websocket``; do nothing when no socket was given."""
    if websocket:
        await websocket.send_text(message)


def _load_pdf(file_path):
    """Load the PDF at ``file_path`` and return the loader's documents."""
    return PyPDFLoader(file_path).load()


def _split_documents(documents):
    """Split ``documents`` into chunks (chunk_size=3000, chunk_overlap=500)."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    return text_splitter.split_documents(documents)


def _build_and_save_index(docs, db_path):
    """Embed ``docs``, build a FAISS index from them and save it at ``db_path``."""
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vectors = FAISS.from_documents(docs, embeddings)
    vectors.save_local(db_path)


async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Index an uploaded PDF into a FAISS store, then delete the upload.

    The pipeline runs in stages: load the PDF, split it into chunks, embed
    the chunks into a FAISS index, and save that index to
    ``<upload_directory>/faiss_index``. If a stage fails, the error is
    reported over the websocket (when one is given) and the remaining
    stages are skipped. The file at ``file_path`` is removed in every case,
    including early failures, so rejected uploads do not accumulate on disk.

    The blocking stages run on the event loop's default executor so that
    other coroutines keep running while a document is processed.

    Args:
        file_path: Path of the PDF to process. The file is deleted before
            this coroutine returns.
        user_id: Currently unused. The index path does not depend on it, so
            every call writes to the same ``faiss_index`` directory.
        websocket: Optional object with an awaitable ``send_text(str)``
            method, used for progress and error messages.
        upload_directory: Directory under which ``faiss_index`` is saved.

    Returns:
        None, on success and on failure alike.

    Raises:
        Stage and cleanup errors are caught and reported rather than raised.
        Exceptions raised by ``websocket.send_text`` itself are not caught.

    NOTE(review): the status strings below look mis-encoded (probably UTF-8
    Korean decoded with the wrong codec). They are runtime text and are left
    untouched here; confirm the file's real encoding.
    """
    loop = asyncio.get_running_loop()

    # One outer try/finally guarantees the cleanup step runs for every exit
    # path, including the early returns taken when a stage fails.
    try:
        try:
            # 1. Load the PDF file
            await _notify(websocket, "1. PDF ํŒŒ์ผ ๋กœ๋“œ ์ค‘...")
            documents = await loop.run_in_executor(None, _load_pdf, file_path)
            await _notify(websocket, f"PDF ํŒŒ์ผ ๋กœ๋“œ ์™„๋ฃŒ: {len(documents)} ๋ฌธ์„œ")
        except Exception as e:
            await _notify(websocket, f"PDF ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
            return

        try:
            # 2. Split the text
            await _notify(websocket, "2. ํ…์ŠคํŠธ ๋ถ„ํ•  ์ค‘...")
            docs = await loop.run_in_executor(None, _split_documents, documents)
            await _notify(websocket, f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์™„๋ฃŒ: {len(docs)} ์ฒญํฌ")
        except Exception as e:
            await _notify(websocket, f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์˜ค๋ฅ˜: {e}")
            return

        try:
            # 3. Create embeddings and vectorize
            await _notify(websocket, "3. ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ๋ฐ ๋ฒกํ„ฐํ™” ์ค‘...")
            db_path = os.path.join(upload_directory, "faiss_index")

            # 4. Save the vectors
            await loop.run_in_executor(None, _build_and_save_index, docs, db_path)
            await _notify(websocket, f"FAISS ์ธ๋ฑ์Šค ์ €์žฅ ์™„๋ฃŒ: {db_path}")
        except Exception as e:
            await _notify(websocket, f"๋ฒกํ„ฐํ™” ์˜ค๋ฅ˜: {e}")
            return
    finally:
        # 5. Delete the uploaded file
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await _notify(websocket, f"ํŒŒ์ผ ์‚ญ์ œ ์™„๋ฃŒ: {file_path}")
        except Exception as e:
            await _notify(websocket, f"ํŒŒ์ผ ์‚ญ์ œ ์˜ค๋ฅ˜: {e}")