File size: 3,038 Bytes
234eac0 ebc19c5 234eac0 55e7097 ebc19c5 55e7097 ebc19c5 234eac0 ebc19c5 234eac0 55e7097 234eac0 ebc19c5 55e7097 ebc19c5 55e7097 ebc19c5 55e7097 234eac0 55e7097 234eac0 55e7097 234eac0 55e7097 234eac0 55e7097 234eac0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os
import tempfile
from typing import List

import chainlit as cl
import fitz
from chainlit.types import AskFileResponse
from langchain_community.embeddings import OpenAIEmbeddings

from aimakerspace.openai_utils.chatmodel import ChatOpenAI
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.qa_pipeline import RerankedQAPipeline
from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader
from aimakerspace.vectordatabase import VectorDatabase
# Module-level splitter shared by process_text_file and process_pdf to chunk
# extracted text.
text_splitter = CharacterTextSplitter()
# NOTE(review): this embeddings client is not referenced anywhere in the
# visible part of this file — confirm it is still needed before keeping the
# import-time construction.
embedding_model = OpenAIEmbeddings()
def process_text_file(file: AskFileResponse):
    """Split an uploaded plain-text file into chunks.

    The upload's bytes are staged in a temporary ``.txt`` file so that
    ``TextFileLoader`` can read them from a path. The temporary file is
    removed again once the documents have been loaded and split.

    Args:
        file: Upload response whose ``content`` attribute holds the raw bytes
            of the file.

    Returns:
        Whatever ``text_splitter.split_texts`` returns for the loaded
        documents (the caller treats it as a sized collection of chunks).
    """
    # Write the bytes through the handle we already hold rather than creating
    # an empty text-mode file and reopening it by name in binary mode.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".txt") as temp_file:
        temp_file_path = temp_file.name
        temp_file.write(file.content)

    try:
        documents = TextFileLoader(temp_file_path).load_documents()
        # Split while the file still exists, in case the loader reads lazily.
        return text_splitter.split_texts(documents)
    finally:
        # delete=False means nothing else cleans this up; without the explicit
        # removal every upload leaves a file behind in the temp directory.
        os.remove(temp_file_path)
def process_pdf(file: AskFileResponse) -> list[str]:
    """Extract the text of an uploaded PDF and split it into chunks.

    Args:
        file: Upload response whose ``content`` attribute holds the raw bytes
            of the PDF.

    Returns:
        The chunks produced by ``text_splitter.split_texts`` for the document
        text, or an empty list when no page yields any text.
    """
    # Open the document directly from the in-memory bytes. This avoids writing
    # a temporary file that was previously created with delete=False and never
    # removed.
    with fitz.open(stream=file.content, filetype="pdf") as doc:
        page_texts = [page.get_text().strip() for page in doc]

    # Join pages with a newline: concatenating the stripped page texts
    # back-to-back fuses the last word of one page with the first word of the
    # next. Pages without text are skipped so they add no blank lines.
    text = "\n".join(page_text for page_text in page_texts if page_text)
    if not text:
        return []

    # process_text_file hands split_texts a collection of documents, so wrap
    # the single string in a list to match.
    # NOTE(review): a bare str would presumably be iterated character by
    # character — confirm against CharacterTextSplitter.split_texts.
    return text_splitter.split_texts([text])
@cl.on_chat_start
async def on_chat_start():
    """Set up a question-answering pipeline for a newly started chat.

    Prompts the user for a plain-text file until one is received, splits it
    into chunks, builds a ``VectorDatabase`` from those chunks, and stores a
    ``RerankedQAPipeline`` in the user session under the key ``"chain"``.

    If no text can be extracted from the upload, an error message is sent and
    no pipeline is stored in the session.
    """
    files = None

    # Keep asking until the prompt yields a result; the loop re-prompts
    # whenever send() comes back with None.
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a Text File file to begin!",
            accept=["text/plain"],
            max_size_mb=20,
            timeout=180,
        ).send()

    file = files[0]

    msg = cl.Message(
        content=f"Processing `{file.name}`...", disable_human_feedback=True
    )
    await msg.send()

    # Load the file and split it into chunks.
    texts = process_text_file(file)
    if not texts:
        await cl.Message(content="Error: Could not extract any text from input file").send()
        # Stop here: building a vector store from nothing and then announcing
        # that the system is ready would contradict the error just reported.
        return

    print(f"Processing {len(texts)} text chunks")

    # Create a dict vector store
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(texts)

    chat_openai = ChatOpenAI()

    # Create a chain
    retrieval_augmented_qa_pipeline = RerankedQAPipeline(
        vector_db_retriever=vector_db,
        llm=chat_openai,
    )

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", retrieval_augmented_qa_pipeline)
@cl.on_message
async def main(message):
    """Answer an incoming chat message using the session's QA pipeline.

    Looks up the pipeline stored in the user session under ``"chain"``, runs
    it on the message text with reranking enabled, and streams the response
    tokens into a single reply message.

    Args:
        message: Incoming chat message; its ``content`` attribute is used as
            the query.
    """
    chain = cl.user_session.get("chain")
    if chain is None:
        # No pipeline has been stored for this session (for example, document
        # processing did not complete). Tell the user instead of failing with
        # an AttributeError on None.
        await cl.Message(
            content="No document has been processed yet. Please upload a file to begin."
        ).send()
        return

    msg = cl.Message(content="")
    result = await chain.arun_pipeline(message.content, rerank=True)

    # result["response"] is consumed as an async iterator of tokens.
    async for stream_resp in result["response"]:
        await msg.stream_token(stream_resp)

    await msg.send()