# Langchain / app.py
# sujeongim0402@gmail.com
# edit codes
# beefdef
import csv
import json
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
import tempfile # ์ž„์‹œ ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜๊ธฐ ์œ„ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ์ž…๋‹ˆ๋‹ค.
import os
# PDF ๋ฌธ์„œ๋กœ๋ถ€ํ„ฐ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
def get_pdf_text(pdf_docs):
    """Extract text from an uploaded PDF file.

    Args:
        pdf_docs: Streamlit UploadedFile holding the PDF bytes.

    Returns:
        List of langchain Documents produced by PyPDFLoader.load().
    """
    # PyPDFLoader needs a real filesystem path, so spill the uploaded bytes
    # into a temp file. The context manager guarantees the directory is
    # removed even on error (the original relied on GC finalization).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, pdf_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        pdf_loader = PyPDFLoader(temp_filepath)
        pdf_doc = pdf_loader.load()  # read while the temp file still exists
    return pdf_doc
# Assignment:
# implement the text-extraction functions below
def get_text_file(docs):
    """Extract text from an uploaded plain-text (.txt) file.

    Args:
        docs: Streamlit UploadedFile holding the text bytes.

    Returns:
        List with a single langchain Document containing the file's text.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # NOTE: the original passed a ``txt_args={...}`` keyword, but
        # TextLoader accepts no such parameter — that call raised
        # TypeError (which is why the author saw it "not working").
        # TextLoader simply reads the whole file as one document.
        txt_loader = TextLoader(file_path=temp_filepath)
        txt_data = txt_loader.load()
    return txt_data
def get_csv_file(docs):
    """Extract rows from an uploaded CSV file, one Document per row.

    Args:
        docs: Streamlit UploadedFile holding the CSV bytes.

    Returns:
        List of langchain Documents, one per CSV row.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # The original hard-coded fieldnames=["name","school","address","phone"],
        # which mislabels any CSV with a different schema and turns a real
        # header row into a data row. Omitting fieldnames lets csv.DictReader
        # take the column names from the file's own first row.
        csv_loader = CSVLoader(
            file_path=temp_filepath,
            csv_args={
                "delimiter": ",",    # comma-separated values
                "quotechar": '"',    # strings may be quoted with ""
            },
        )
        csv_data = csv_loader.load()
    return csv_data
def get_json_file(docs):
    """Extract chat-message contents from an uploaded JSON file.

    Args:
        docs: Streamlit UploadedFile holding the JSON bytes.

    Returns:
        List of langchain Documents, one per extracted message content.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        json_loader = JSONLoader(
            file_path=temp_filepath,
            # assumes the JSON is shaped {"messages": [{"content": ...}, ...]}
            # — other shapes yield no documents; confirm against expected input
            jq_schema='.messages[].content',
            text_content=False,  # extracted values need not be plain strings
        )
        json_data = json_loader.load()
    return json_data
# ๋ฌธ์„œ๋“ค์„ ์ฒ˜๋ฆฌํ•˜์—ฌ ํ…์ŠคํŠธ ์ฒญํฌ๋กœ ๋‚˜๋ˆ„๋Š” ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
def get_text_chunks(documents):
    """Split loaded documents into overlapping chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # maximum characters per chunk
        chunk_overlap=200,    # characters shared by adjacent chunks
        length_function=len,  # measure size in raw characters
    )
    return splitter.split_documents(documents)
# ํ…์ŠคํŠธ ์ฒญํฌ๋“ค๋กœ๋ถ€ํ„ฐ ๋ฒกํ„ฐ ์Šคํ† ์–ด๋ฅผ ์ƒ์„ฑํ•˜๋Š” ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
def get_vectorstore(text_chunks):
    """Embed the chunks with OpenAI (Embedding models - Ada v2) and index them in FAISS."""
    embedder = OpenAIEmbeddings()
    return FAISS.from_documents(text_chunks, embedder)
def get_conversation_chain(vectorstore):
    """Build a retrieval-augmented conversation chain over the vector store."""
    chat_llm = ChatOpenAI(model_name='gpt-3.5-turbo')  # gpt-3.5 backend
    # Buffer memory carries the running chat history into follow-up questions.
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )
# ์‚ฌ์šฉ์ž ์ž…๋ ฅ์„ ์ฒ˜๋ฆฌํ•˜๋Š” ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat."""
    result = st.session_state.conversation({'question': user_question})
    # Persist the updated history in session state.
    st.session_state.chat_history = result['chat_history']
    # Messages alternate user / assistant; pick the matching HTML template.
    for idx, msg in enumerate(st.session_state.chat_history):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", msg.content),
                 unsafe_allow_html=True)
def main():
    """Streamlit entry point: upload documents, index them, and chat over them."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple Files",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize per-session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple Files :")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
        if openai_key:
            os.environ["OPENAI_API_KEY"] = openai_key

        st.subheader("Your documents")
        docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Dispatch each upload to the loader for its MIME type.
                doc_list = []
                for file in docs:
                    print('file - type : ', file.type)
                    if file.type == 'text/plain':
                        doc_list.extend(get_text_file(file))
                    elif file.type in ['application/octet-stream',
                                       'application/pdf']:
                        doc_list.extend(get_pdf_text(file))
                    elif file.type == 'text/csv':
                        doc_list.extend(get_csv_file(file))
                    elif file.type == 'application/json':
                        doc_list.extend(get_json_file(file))
                    else:
                        # The original silently skipped unknown types;
                        # surface it so the user knows why nothing indexed.
                        st.warning(
                            f"Unsupported file type: {file.type} "
                            f"({file.name}) — skipped.")

                if not doc_list:
                    # FAISS.from_documents raises on an empty list; bail early
                    # instead of crashing the app.
                    st.warning("No supported documents were processed.")
                    return

                text_chunks = get_text_chunks(doc_list)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)


if __name__ == '__main__':
    main()