File size: 3,038 Bytes
234eac0
ebc19c5
 
 
234eac0
55e7097
ebc19c5
55e7097
ebc19c5
234eac0
 
 
 
ebc19c5
234eac0
 
55e7097
234eac0
 
 
 
 
 
 
 
 
 
 
 
 
 
ebc19c5
 
 
 
55e7097
ebc19c5
55e7097
 
 
 
ebc19c5
55e7097
 
234eac0
 
 
 
 
 
 
 
55e7097
 
 
234eac0
 
 
 
 
 
 
 
 
 
 
55e7097
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234eac0
55e7097
234eac0
 
 
 
 
 
 
55e7097
234eac0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from typing import List
import tempfile

import chainlit as cl
from chainlit.types import AskFileResponse
import fitz

from langchain_community.embeddings import OpenAIEmbeddings

from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
from aimakerspace.qa_pipeline import RerankedQAPipeline

text_splitter = CharacterTextSplitter()
embedding_model = OpenAIEmbeddings()

def process_text_file(file: AskFileResponse):

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
        temp_file_path = temp_file.name

    with open(temp_file_path, "wb") as f:
        f.write(file.content)

    text_loader = TextFileLoader(temp_file_path)
    documents = text_loader.load_documents()
    texts = text_splitter.split_texts(documents)
    return texts

def process_pdf(file: AskFileResponse) -> list[str]:
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf") as temp_file:
        temp_file_path = temp_file.name
        temp_file.write(file.content)
        temp_file.flush()

    text = ""
    with fitz.open(temp_file_path) as doc:
        for page in doc:
            text += page.get_text().strip()

    text_list = text_splitter.split_texts(text)  
    return text_list

@cl.on_chat_start
async def on_chat_start():
    files = None

    # Wait for the user to upload a file
    while files == None:
        files = await cl.AskFileMessage(
            content="Please upload a Text File file to begin!",
            accept=["text/plain"],
            max_size_mb=20,
            timeout=180,
        ).send()

    file = files[0]

    msg = cl.Message(
        content=f"Processing `{file.name}`...", disable_human_feedback=True
    )
    await msg.send()

    # load the file
    texts = process_text_file(file)

    if not texts:
        await cl.Message(content=f"Error: Could not extract any text from input file").send()
    else:
        print(f"Processing {len(texts)} text chunks")

        # Create a dict vector store
        vector_db = VectorDatabase()
        vector_db = await vector_db.abuild_from_list(texts)
        
        chat_openai = ChatOpenAI()

        # Create a chain
        retrieval_augmented_qa_pipeline = RerankedQAPipeline(
            vector_db_retriever=vector_db,
            llm=chat_openai,
        )
        
        # Let the user know that the system is ready
        msg.content = f"Processing `{file.name}` done. You can now ask questions!"
        await msg.update()

        cl.user_session.set("chain", retrieval_augmented_qa_pipeline)


@cl.on_message
async def main(message):
    chain = cl.user_session.get("chain")

    msg = cl.Message(content="")
    result = await chain.arun_pipeline(message.content,rerank=True)

    async for stream_resp in result["response"]:
        await msg.stream_token(stream_resp)

    await msg.send()