Spaces:

1MR
/

MYRAG

Sleeping

App Files Files Community

MYRAG / app.py

1MR

Update app.py

bdf523c verified about 1 year ago

raw

history blame contribute delete

14.9 kB

	import streamlit as st
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
	from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
	from langchain.vectorstores import FAISS, Chroma
	from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
	from langchain.chat_models import ChatOpenAI
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from htmlTemplates import css, bot_template, user_template
	from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
	from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
	import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
	import os


	# PDF 문서로부터 텍스트를 추출하는 함수입니다.
	def get_pdf_text(pdf_docs):
	    """Load one uploaded PDF through PyPDFLoader.

	    The upload is first written to a temporary file because the loader is
	    given a filesystem path.

	    Args:
	        pdf_docs: A single uploaded file exposing ``.name`` and
	            ``.getvalue()`` (the raw bytes of the upload).

	    Returns:
	        The result of ``PyPDFLoader(path).load()`` for the uploaded file.
	    """
	    # The context manager removes the directory deterministically instead of
	    # relying on garbage collection of a dangling TemporaryDirectory object.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        # basename() keeps a crafted upload name from pointing outside temp_dir.
	        temp_filepath = os.path.join(temp_dir, os.path.basename(pdf_docs.name))
	        with open(temp_filepath, "wb") as f:
	            f.write(pdf_docs.getvalue())
	        # load() must run while the temporary file still exists.
	        return PyPDFLoader(temp_filepath).load()

	# 과제
	# 아래 텍스트 추출 함수를 작성

	def get_text_file(text_docs):
	    """Load one uploaded plain-text file through TextLoader.

	    The upload is first written to a temporary file because the loader is
	    given a filesystem path.

	    Args:
	        text_docs: A single uploaded file exposing ``.name`` and
	            ``.getvalue()`` (the raw bytes of the upload).

	    Returns:
	        The result of ``TextLoader(path).load()`` for the uploaded file.
	    """
	    # The context manager removes the directory deterministically instead of
	    # relying on garbage collection of a dangling TemporaryDirectory object.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        # basename() keeps a crafted upload name from pointing outside temp_dir.
	        temp_filepath = os.path.join(temp_dir, os.path.basename(text_docs.name))
	        with open(temp_filepath, "wb") as f:
	            f.write(text_docs.getvalue())
	        # load() must run while the temporary file still exists.
	        return TextLoader(temp_filepath).load()


	def get_csv_file(csv_docs):
	    """Load one uploaded CSV file through CSVLoader.

	    The upload is first written to a temporary file because the loader is
	    given a filesystem path.

	    Args:
	        csv_docs: A single uploaded file exposing ``.name`` and
	            ``.getvalue()`` (the raw bytes of the upload).

	    Returns:
	        The result of ``CSVLoader(path).load()`` for the uploaded file.
	    """
	    # The context manager removes the directory deterministically instead of
	    # relying on garbage collection of a dangling TemporaryDirectory object.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        # basename() keeps a crafted upload name from pointing outside temp_dir.
	        temp_filepath = os.path.join(temp_dir, os.path.basename(csv_docs.name))
	        with open(temp_filepath, "wb") as f:
	            f.write(csv_docs.getvalue())
	        # load() must run while the temporary file still exists.
	        return CSVLoader(temp_filepath).load()

	def get_json_file(json_docs):
	    """Load one uploaded JSON file through JSONLoader.

	    The upload is first written to a temporary file because the loader is
	    given a filesystem path.

	    Args:
	        json_docs: A single uploaded file exposing ``.name`` and
	            ``.getvalue()`` (the raw bytes of the upload).

	    Returns:
	        The result of ``JSONLoader(...).load()`` for the uploaded file.
	    """
	    # The context manager removes the directory deterministically instead of
	    # relying on garbage collection of a dangling TemporaryDirectory object.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        # basename() keeps a crafted upload name from pointing outside temp_dir.
	        temp_filepath = os.path.join(temp_dir, os.path.basename(json_docs.name))
	        with open(temp_filepath, "wb") as f:
	            f.write(json_docs.getvalue())
	        # JSONLoader requires a jq_schema; "." selects the whole document.
	        # text_content=False lets non-string JSON values (objects, arrays)
	        # be accepted instead of being rejected as "not a string".
	        json_loader = JSONLoader(
	            file_path=temp_filepath,
	            jq_schema=".",
	            text_content=False,
	        )
	        # load() must run while the temporary file still exists.
	        return json_loader.load()


	# 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
	def get_text_chunks(documents):
	    """Split documents into overlapping chunks.

	    Args:
	        documents: The documents to hand to
	            ``RecursiveCharacterTextSplitter.split_documents``.

	    Returns:
	        The chunked documents produced by the splitter.
	    """
	    splitter_options = {
	        "chunk_size": 1000,      # maximum chunk size, measured by length_function
	        "chunk_overlap": 200,    # amount shared between neighbouring chunks
	        "length_function": len,  # measure text length in characters
	    }
	    splitter = RecursiveCharacterTextSplitter(**splitter_options)
	    return splitter.split_documents(documents)


	# 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.
	def get_vectorstore(text_chunks):
	    """Build a FAISS vector store from document chunks.

	    Embeddings come from the HuggingFace model
	    ``sentence-transformers/all-MiniLM-L6-v2``.

	    Args:
	        text_chunks: The chunked documents to index.

	    Returns:
	        The store returned by ``FAISS.from_documents``.

	    Raises:
	        ValueError: If ``text_chunks`` is empty, e.g. when the uploaded files
	            contained no extractable text.
	    """
	    # Fail early with an actionable message: an empty chunk list would
	    # otherwise surface as an opaque error from inside the index builder,
	    # and only after the embedding model has been loaded.
	    if not text_chunks:
	        raise ValueError(
	            "No text could be extracted from the uploaded documents, "
	            "so there is nothing to index."
	        )

	    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	    return FAISS.from_documents(text_chunks, embeddings)


	def get_conversation_chain(vectorstore):
	    """Create a ConversationalRetrievalChain on top of ``vectorstore``.

	    The chain uses the ``google/flan-t5-xxl`` model through HuggingFaceHub,
	    with the API token read from the ``TOKEN_API2`` environment variable,
	    and a ConversationBufferMemory stored under the ``chat_history`` key.

	    Args:
	        vectorstore: Object whose ``as_retriever()`` supplies the retriever.

	    Returns:
	        The chain built by ``ConversationalRetrievalChain.from_llm``.
	    """
	    hub_token = os.getenv("TOKEN_API2")
	    generation_settings = {"temperature":0.5, "max_length":512}
	    hub_llm = HuggingFaceHub(
	        repo_id="google/flan-t5-xxl",
	        model_kwargs=generation_settings,
	        huggingfacehub_api_token=hub_token,
	    )

	    # Memory object that keeps the running conversation for the chain.
	    chat_memory = ConversationBufferMemory(
	        memory_key='chat_history',
	        return_messages=True,
	    )

	    return ConversationalRetrievalChain.from_llm(
	        llm=hub_llm,
	        retriever=vectorstore.as_retriever(),
	        memory=chat_memory,
	    )

	# 사용자 입력을 처리하는 함수입니다.
	# def handle_userinput(user_question):
	# # 대화 체인을 사용하여 사용자 질문에 대한 응답을 생성합니다.
	# response = st.session_state.conversation({'question': user_question})
	# # 대화 기록을 저장합니다.
	# st.session_state.chat_history = response['chat_history']

	# for i, message in enumerate(st.session_state.chat_history):
	# if i % 2 == 0:
	# st.write(user_template.replace(
	# "{{MSG}}", message.content), unsafe_allow_html=True)
	# else:
	# st.write(bot_template.replace(
	# "{{MSG}}", message.content), unsafe_allow_html=True)


	def handle_userinput(user_question):
	    """Send a question to the conversation chain and render the chat history.

	    Shows an error and returns if no conversation chain has been stored in
	    ``st.session_state.conversation`` yet. Otherwise the chain is called,
	    its ``chat_history`` is saved in the session state, and every message is
	    written to the page: even-indexed entries with ``user_template``,
	    odd-indexed entries with ``bot_template``.

	    Args:
	        user_question: The question text entered by the user.

	    Returns:
	        None. Any exception raised while answering or rendering is reported
	        with ``st.error`` instead of propagating.
	    """
	    # Local import keeps this function self-contained.
	    import html

	    if not st.session_state.conversation:
	        st.error("Please upload and process your documents first.")
	        return

	    try:
	        response = st.session_state.conversation({'question': user_question})
	        st.session_state.chat_history = response['chat_history']

	        for i, message in enumerate(st.session_state.chat_history):
	            template = user_template if i % 2 == 0 else bot_template
	            # The templates are rendered as raw HTML, so the message text
	            # (typed by the user or produced from uploaded documents) must
	            # be escaped to avoid injecting markup or scripts into the page.
	            safe_text = html.escape(message.content)
	            st.write(template.replace("{{MSG}}", safe_text),
	                     unsafe_allow_html=True)
	    except Exception as e:
	        st.error(f"An error occurred: {e}")


	def main():
	    """Run the Streamlit page: question box, sidebar upload and processing.

	    Loads environment variables, configures the page, initialises the
	    ``conversation`` and ``chat_history`` session entries, and answers the
	    current question if one is entered. In the sidebar the user may paste an
	    OpenAI API key (exported as ``OPENAI_API_KEY``) and upload files. Pressing
	    "Process" loads each supported file, chunks the documents, builds the
	    vector store and stores a new conversation chain in the session state.
	    """
	    load_dotenv()
	    st.set_page_config(page_title="Chat with multiple Files",
	                       page_icon=":books:")
	    st.write(css, unsafe_allow_html=True)

	    if "conversation" not in st.session_state:
	        st.session_state.conversation = None
	    if "chat_history" not in st.session_state:
	        st.session_state.chat_history = None

	    st.header("Chat with multiple Files:")
	    user_question = st.text_input("Ask a question about your documents:")
	    if user_question:
	        handle_userinput(user_question)

	    # Upload MIME type -> loader function. 'application/octet-stream' is
	    # routed to the PDF loader, as in the original if/elif chain.
	    loaders_by_type = {
	        'text/plain': get_text_file,
	        'application/octet-stream': get_pdf_text,
	        'application/pdf': get_pdf_text,
	        'text/csv': get_csv_file,
	        'application/json': get_json_file,
	    }

	    with st.sidebar:
	        # Mask the key so it is not shown in plaintext on screen.
	        openai_key = st.text_input("Paste your OpenAI API key (sk-...)",
	                                   type="password")
	        if openai_key:
	            os.environ["OPENAI_API_KEY"] = openai_key

	        st.subheader("Your documents")
	        docs = st.file_uploader(
	            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
	        if st.button("Process"):
	            if not docs:
	                st.error("Please upload at least one document.")
	                return

	            with st.spinner("Processing..."):
	                try:
	                    doc_list = []
	                    skipped = []

	                    for file in docs:
	                        loader = loaders_by_type.get(file.type)
	                        if loader is None:
	                            # Remember unsupported uploads so they can be
	                            # reported instead of being dropped silently.
	                            skipped.append(file.name)
	                            continue
	                        doc_list.extend(loader(file))

	                    if skipped:
	                        st.warning(
	                            "Skipped unsupported file(s): " + ", ".join(skipped))

	                    if not doc_list:
	                        st.error("No valid documents processed. Please check your files.")
	                        return

	                    text_chunks = get_text_chunks(doc_list)

	                    vectorstore = get_vectorstore(text_chunks)

	                    st.session_state.conversation = get_conversation_chain(vectorstore)

	                    st.success("Documents processed successfully!")
	                except Exception as e:
	                    st.error(f"An error occurred during processing: {e}")


	# Entry point: call main() only when this module is run as the main program.
	if __name__ == "__main__":
	    main()
	# def main():
	# load_dotenv()
	# st.set_page_config(page_title="Chat with multiple Files",
	# page_icon=":books:")
	# st.write(css, unsafe_allow_html=True)

	# if "conversation" not in st.session_state:
	# st.session_state.conversation = None
	# if "chat_history" not in st.session_state:
	# st.session_state.chat_history = None

	# st.header("Chat with multiple Files :")
	# user_question = st.text_input("Ask a question about your documents:")
	# if user_question:
	# handle_userinput(user_question)

	# with st.sidebar:
	# openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
	# if openai_key:
	# os.environ["OPENAI_API_KEY"] = openai_key

	# st.subheader("Your documents")
	# docs = st.file_uploader(
	# "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
	# if st.button("Process"):
	# with st.spinner("Processing"):
	# # get pdf text
	# doc_list = []

	# for file in docs:
	# print('file - type : ', file.type)
	# if file.type == 'text/plain':
	# # file is .txt
	# doc_list.extend(get_text_file(file))
	# elif file.type in ['application/octet-stream', 'application/pdf']:
	# # file is .pdf
	# doc_list.extend(get_pdf_text(file))
	# elif file.type == 'text/csv':
	# # file is .csv
	# doc_list.extend(get_csv_file(file))
	# elif file.type == 'application/json':
	# # file is .json
	# doc_list.extend(get_json_file(file))

	# # get the text chunks
	# text_chunks = get_text_chunks(doc_list)

	# # create vector store
	# vectorstore = get_vectorstore(text_chunks)

	# # create conversation chain
	# st.session_state.conversation = get_conversation_chain(
	# vectorstore)


	# import streamlit as st
	# # from dotenv import load_dotenv
	# from PyPDF2 import PdfReader
	# from langchain.text_splitter import CharacterTextSplitter
	# from langchain_community.embeddings import HuggingFaceInstructEmbeddings
	# from langchain_community.vectorstores import FAISS
	# # from langchain.chat_models import ChatOpenAI
	# from langchain.memory import ConversationBufferMemory
	# from langchain.chains import ConversationalRetrievalChain
	# from htmlTemplates import css, bot_template, user_template
	# from langchain_community.llms import HuggingFaceHub
	# import os
	# # from sentence_transformers import SentenceTransformer
	# from langchain.embeddings import HuggingFaceEmbeddings


	# # from huggingface_hub import login

	# # Retrieve the Hugging Face token from environment variables
	# # token = os.getenv("HUGGINGFACEHUB_TOKEN")
	# import fitz # PyMuPDF

	# def get_pdf_text(pdf_docs):
	# text = ""
	# for pdf in pdf_docs:
	# try:
	# doc = fitz.open(stream=pdf.read(), filetype="pdf")
	# for page in doc:
	# text += page.get_text()
	# except Exception as e:
	# st.error(f"Could not read the file: {pdf.name}. Error: {e}")
	# return text
	# # def get_pdf_text(pdf_docs):
	# # text = ""
	# # for pdf in pdf_docs:
	# # pdf_reader = PdfReader(pdf)
	# # for page in pdf_reader.pages:
	# # text += page.extract_text()
	# # return text

	# def get_text_chunks(text):
	# text_splitter=CharacterTextSplitter(
	# separator="\n",
	# chunk_size=1000,
	# chunk_overlap=200,
	# length_function=len
	# )
	# chunks=text_splitter.split_text(text)
	# return chunks

	# # token="<REDACTED>"  (credential removed: never commit API tokens; revoke this one and load tokens from the environment)
	# # def get_vectorstore(text_chunks):
	# # # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",huggingfacehub_token=os.getenv("TOKEN_API2"))
	# # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
	# # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# # return vectorstore

	# # def get_vectorstore(text_chunks):
	# # # Load a SentenceTransformer model for embeddings
	# # embedding_model = SentenceTransformer("hkunlp/instructor-xl") # Replace with a model of your choice
	# # embeddings = [embedding_model.encode(chunk) for chunk in text_chunks]

	# # # Create a FAISS vectorstore
	# # vectorstore = FAISS.from_embeddings(embeddings=embeddings, texts=text_chunks)
	# # return vectorstore

	# def get_vectorstore(text_chunks):
	# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	# vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# return vectorstore

	# def get_conversation_chain(vectorstore):
	# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512},huggingfacehub_api_token=os.getenv("TOKEN_API2"))
	# memory=ConversationBufferMemory(
	# memory_key='chat_history',return_messages=True)
	# conversation_chain = ConversationalRetrievalChain.from_llm(
	# llm=llm,
	# retriever=vectorstore.as_retriever(),
	# memory=memory
	# )
	# return conversation_chain

	# def handle_userinput(user_question):
	# response = st.session_state.conversation({'question':user_question})
	# st.session_state.chat_history = response['chat_history']

	# for i, message in enumerate(st.session_state.chat_history):
	# if i % 2 == 0:
	# st.write(user_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)
	# else:
	# st.write(bot_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)

	# def main():
	# st.set_page_config(page_title="Chat with My RAG",
	# page_icon=":books:")
	# st.write(css,unsafe_allow_html=True)

	# if "conversation" not in st.session_state:
	# st.session_state.conversation = None
	# if "chat_history" not in st.session_state:
	# st.session_state.chat_history = None

	# st.header("Chat with My RAG :books:")
	# user_question=st.text_input("Ask a question about your documents:")
	# if user_question:
	# handle_userinput(user_question)

	# with st.sidebar:
	# st.subheader("Your Documents")
	# pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
	# if st.button("Process"):
	# with st.spinner("Processing"):
	# raw_text =get_pdf_text(pdf_docs)

	# text_chunks = get_text_chunks(raw_text)

	# vectorstore = get_vectorstore(text_chunks)

	# st.session_state.conversation = get_conversation_chain(vectorstore)


	# if __name__ == '__main__':
	# main()