NickNYU commited on
Commit
17b19d4
·
1 Parent(s): 4bd25fd

Try to leverage ChatGPT to answer questions

Browse files
Files changed (2) hide show
  1. app.py +84 -41
  2. requirements.txt +6 -13
app.py CHANGED
@@ -1,42 +1,85 @@
1
"""One-shot retrieval-augmented QA script (pre-Streamlit version).

Loads a local text file, splits it into overlapping chunks, embeds the
chunks into a FAISS vector store, retrieves the top-4 chunks for a fixed
query, and asks a local ChatGLM-6B model to answer using those chunks as
context.
"""
from transformers import AutoTokenizer, AutoModel
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Choose the embedding model (CPU-only, unnormalized embeddings)
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
eb = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# Read the source document from a local file
loader = TextLoader("./test.txt")
docs = loader.load()
# Split into overlapping character chunks for embedding
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
tl = []
for doc in docs:
    for t in text_splitter.split_text(doc.page_content):
        tl.append(t)
# Build the in-memory vector store and retrieve the 4 most similar chunks
vector_store = FAISS.from_texts(tl, eb)
query = '为什么client不直接从data server拉取数据'
docs = vector_store.similarity_search(query, k=4)

# Assemble the prompt: numbered context chunks followed by the question
context = ''
for i in range(len(docs)):
    context += f'{i+1}、{docs[i].page_content}\n'
prompt = f'已知:\n{context}\n问题:\n{query}'
print(prompt)

# Load the quantized ChatGLM model; trust_remote_code is required because
# the model repo ships custom modeling code.
# NOTE(review): trust_remote_code executes code from the model repo —
# only safe for trusted model sources.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).float()
model = model.eval()
response, history = model.chat(tokenizer, prompt, history=[])
print(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ import pickle
4
+ from PyPDF2 import PdfReader
5
+ from streamlit_extras.add_vertical_space import add_vertical_space
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.vectorstores import FAISS
9
+ from langchain.llms import OpenAI
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.callbacks import get_openai_callback
12
+ import os
13
+
14
# Sidebar contents: static "About" panel shown on every page load.
with st.sidebar:
    st.title('🤗💬 LLM Chat App')
    st.markdown('''
## About
This app is an LLM-powered chatbot built using:
- [Streamlit](https://streamlit.io/)
- [LangChain](https://python.langchain.com/)
- [OpenAI](https://platform.openai.com/docs/models) LLM model

''')
    add_vertical_space(5)
    st.write('Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)')

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
29
+
30
def main():
    """Streamlit entry point: chat with an uploaded PDF.

    Extracts the text of an uploaded PDF, splits it into overlapping
    chunks, embeds the chunks into a FAISS vector store (cached on disk
    as ``<pdf-name>.pkl`` so embeddings are only paid for once per file),
    then answers user questions with an OpenAI "stuff" QA chain over the
    top-3 most similar chunks.
    """
    st.header("Chat with PDF 💬")

    # upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')

    if pdf is not None:
        pdf_reader = PdfReader(pdf)

        # extract_text() returns None for pages with no text layer
        # (e.g. scanned images); coalesce to '' to avoid a TypeError
        # when concatenating.
        text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text=text)

        # Cache key: the PDF file name without its extension. splitext
        # handles any extension length (the original [:-4] slice assumed
        # exactly ".pdf").
        store_name = os.path.splitext(pdf.name)[0]
        st.write(f'{store_name}')

        if os.path.exists(f"{store_name}.pkl"):
            # NOTE(review): unpickling is only safe because this cache file
            # is written locally below — never load pickles from untrusted
            # sources.
            with open(f"{store_name}.pkl", "rb") as f:
                VectorStore = pickle.load(f)
        else:
            embeddings = OpenAIEmbeddings()
            VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
            with open(f"{store_name}.pkl", "wb") as f:
                pickle.dump(VectorStore, f)

        # Accept user questions/query
        query = st.text_input("Ask questions about your PDF file:")

        if query:
            docs = VectorStore.similarity_search(query=query, k=3)

            llm = OpenAI()
            chain = load_qa_chain(llm=llm, chain_type="stuff")
            # Track OpenAI token usage/cost for this call; logged to stdout.
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs, question=query)
                print(cb)
            st.write(response)

if __name__ == '__main__':
    main()
requirements.txt CHANGED
@@ -1,13 +1,6 @@
1
- protobuf
2
- transformers==4.27.1
3
- cpm_kernels
4
- torch>=1.10
5
- gradio
6
- mdtex2html
7
- sentencepiece
8
- accelerate
9
- langchain
10
- sentence_transformers
11
- unstructured
12
- pdf2image
13
- faiss-cpu
 
1
+ langchain==0.0.154
2
+ PyPDF2==3.0.1
3
+ python-dotenv==1.0.0
4
+ streamlit==1.18.1
5
+ faiss-cpu==1.7.4
6
+ streamlit-extras