santuchal committed on
Commit
46414bc
·
verified ·
1 Parent(s): 427610f

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +63 -0
  3. chat_workflow.py +95 -0
  4. mydocs/1.pdf +3 -0
  5. requirements.txt +102 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ mydocs/1.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
import streamlit as st
from chat_workflow import chain_workflow

# Custom image for the app icon and the assistant's avatar
assistant_logo = 'https://assets.website-files.com/5f902c64ef70f699f7a0c50d/64b7aa8bcb0b1ad4dd48b451_AI_icon_3.png'

# Configure Streamlit page
st.set_page_config(
    page_title="Budget-GPT 2024-2025",
    page_icon=assistant_logo
)

with st.sidebar:
    openai_api_key = st.text_input('Input your OpenAI API Key', value="sk-", type = 'password')
    "[View the source code](https://github.com/codysaint/streamlit-pdf-qa-langchain-app.git)"


# Initialize chat history, starting with a greeting from the assistant.
if 'messages' not in st.session_state:
    st.session_state['messages'] = [{"role": "assistant",
                                     "content": "Hi user! ask me questions about union budget 2024-2025"}]

# Replay the conversation so far (assistant messages get the custom avatar).
for message in st.session_state.messages:
    if message["role"] == 'assistant':
        with st.chat_message(message["role"], avatar=assistant_logo):
            st.markdown(message["content"])
    else:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

# Chat logic
if query := st.chat_input("Ask me about key highlights of recently announced union budget"):
    # Cheap sanity check that a key was entered (anything longer than "sk-").
    if len(openai_api_key) <= 3:
        st.sidebar.error("☝️ Put in your openapi key")
    else:
        # Add user message to chat history and echo it in the chat area.
        st.session_state.messages.append({"role": "user", "content": query})
        with st.chat_message("user"):
            st.markdown(query)

        with st.chat_message("assistant", avatar=assistant_logo):
            message_placeholder = st.empty()

            # BUG FIX: previously the chain was rebuilt on every message,
            # which threw away its ConversationBufferWindowMemory each turn
            # (and re-ran the vector-store setup). Build the chain once per
            # session and reuse it so follow-up questions keep chat history.
            if 'chain' not in st.session_state:
                st.session_state['chain'] = chain_workflow(openai_api_key=openai_api_key)
            chain = st.session_state['chain']

            # Send user's question to our chain.
            result = chain({"question": query})
            response = result['answer']

            # Simulate a token stream: reveal the answer word by word with a
            # blinking-cursor placeholder, then render the final text once.
            full_response = ""
            for chunk in response.split():
                full_response += chunk + " "
                time.sleep(0.05)
                message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)

        # Add assistant message to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})
chat_workflow.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.chains import ConversationalRetrievalChain
3
+ from langchain.memory import ConversationBufferWindowMemory
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.retrievers import ContextualCompressionRetriever
8
+ from langchain.retrievers.document_compressors import LLMChainExtractor
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
11
+ import os
12
+
13
+
14
def create_directory_if_not_exists(directory_path):
    """Ensure *directory_path* exists, creating parent directories as needed.

    Prints a short notice instead when the directory is already present.
    """
    if os.path.exists(directory_path):
        print(f" {directory_path} already exists")
    else:
        os.makedirs(directory_path, exist_ok=True)
19
+
20
# Chat model used by the ConversationalRetrievalChain built in chain_workflow.
llm_name = "gpt-3.5-turbo"

# Directory where the Chroma vector index is persisted between runs.
persist_directory = 'vector_index/'

# Make sure the persistence directory exists before Chroma writes into it.
create_directory_if_not_exists(persist_directory)

# Directory holding the source PDF document(s).
docs_dir = 'mydocs/'

# Marker file used by chain_workflow to decide whether the vector store was
# already built.
# NOTE(review): Chroma 0.4.x chooses its own sqlite file name inside the
# persist directory — confirm a file with this exact name is ever created;
# if not, the index is rebuilt on every call.
docs_sqlite_store_chroma = os.path.join(persist_directory, "chroma_vec_store.sqlite3")
32
# @st.cache_resource
def chain_workflow(openai_api_key):
    """Build a ConversationalRetrievalChain over the budget-highlights PDF.

    Creates (or reuses a persisted) Chroma vector store from
    ``mydocs/key_highlights.pdf``, wraps its MMR retriever in an LLM-based
    contextual compressor, and returns a chain with a 5-turn windowed
    chat memory.

    Parameters
    ----------
    openai_api_key : str
        OpenAI API key, used for both the embedding model and the chat models.

    Returns
    -------
    ConversationalRetrievalChain
        Callable as ``chain({"question": ...})``; the reply is under ``"answer"``.
    """

    # Load OpenAI embedding model
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)


    # Check if the vector-store marker file exists.
    # NOTE(review): `docs_sqlite_store_chroma` is a hand-picked filename; Chroma
    # 0.4.x names its persisted sqlite file itself, so confirm this path is ever
    # created — otherwise this branch re-embeds the PDF on every call.
    if not os.path.exists(docs_sqlite_store_chroma):
        # If it doesn't exist, create it

        # load multiple pdfs at once
        # loader = PyPDFDirectoryLoader(docs_dir)

        # load single document
        file = os.path.join(docs_dir, "key_highlights.pdf")
        loader = PyPDFLoader(file)

        documents = loader.load()

        # Split documents into overlapping chunks for embedding.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        splits = text_splitter.split_documents(documents)

        # persist_directory
        # persist_directory = 'vector_index/'

        # Embed the chunks and write the index under persist_directory.
        vectordb = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=persist_directory
        )


        vectordb.persist()
        print(f"Vectorstore created and saved successfully, The {docs_sqlite_store_chroma} file has been created.")
    else:
        # If the vectorstore already exists, just load it from disk.
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


    # Deterministic (temperature=0) chat model driving the compressor below.
    llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

    # Retriever: MMR search (k=3) with LLM-based contextual compression of the
    # retrieved splits before they reach the QA chain.
    compressor = LLMChainExtractor.from_llm(llm)
    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,base_retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 3}))


    # Windowed memory: keeps the last 5 exchanges under the 'chat_history' key.
    memory = ConversationBufferWindowMemory(k=5,memory_key="chat_history")

    # Create the chatbot chain; a separate, more creative (temperature=0.7)
    # model answers, while get_chat_history passes history through unchanged.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0.7, openai_api_key=openai_api_key),
        chain_type="stuff",
        retriever=compression_retriever,
        memory=memory,
        get_chat_history=lambda h : h,
        verbose=True
    )


    return qa
mydocs/1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78b1d0b8faa9b8a4219028726393154e35bb256404631e2d5566158c69a43ee5
3
+ size 1682781
requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiosignal==1.3.1
3
+ altair==5.1.1
4
+ annotated-types==0.5.0
5
+ anyio==3.7.1
6
+ async-timeout==4.0.3
7
+ attrs==23.1.0
8
+ backoff==2.2.1
9
+ bcrypt==4.0.1
10
+ blinker==1.6.2
11
+ cachetools==5.3.1
12
+ certifi==2023.7.22
13
+ charset-normalizer==3.3.0
14
+ chroma-hnswlib==0.7.3
15
+ chromadb==0.4.13
16
+ click==8.1.7
17
+ coloredlogs==15.0.1
18
+ dataclasses-json==0.6.1
19
+ fastapi==0.103.2
20
+ filelock==3.12.4
21
+ flatbuffers==23.5.26
22
+ frozenlist==1.4.0
23
+ fsspec==2023.9.2
24
+ gitdb==4.0.10
25
+ GitPython==3.1.37
26
+ greenlet==2.0.2
27
+ h11==0.14.0
28
+ httptools==0.6.0
29
+ huggingface-hub==0.16.4
30
+ humanfriendly==10.0
31
+ idna==3.4
32
+ importlib-metadata==6.8.0
33
+ importlib-resources==6.1.0
34
+ Jinja2==3.1.2
35
+ jsonpatch==1.33
36
+ jsonpointer==2.4
37
+ jsonschema==4.19.1
38
+ jsonschema-specifications==2023.7.1
39
+ langchain==0.0.305
40
+ langsmith==0.0.41
41
+ markdown-it-py==3.0.0
42
+ MarkupSafe==2.1.3
43
+ marshmallow==3.20.1
44
+ mdurl==0.1.2
45
+ monotonic==1.6
46
+ mpmath==1.3.0
47
+ multidict==6.0.4
48
+ mypy-extensions==1.0.0
49
+ numexpr==2.8.7
50
+ numpy==1.26.0
51
+ onnxruntime==1.16.0
52
+ openai==0.28.1
53
+ overrides==7.4.0
54
+ packaging==23.1
55
+ pandas==2.1.1
56
+ Pillow==10.0.1
57
+ posthog==3.0.2
58
+ protobuf==4.24.3
59
+ pulsar-client==3.3.0
60
+ pyarrow==13.0.0
61
+ pydantic==2.4.2
62
+ pydantic_core==2.10.1
63
+ pydeck==0.8.1b0
64
+ Pygments==2.16.1
65
+ pypdf==3.16.2
66
+ PyPika==0.48.9
67
+ python-dateutil==2.8.2
68
+ python-dotenv==1.0.0
69
+ pytz==2023.3.post1
70
+ PyYAML==6.0.1
71
+ referencing==0.30.2
72
+ regex==2023.8.8
73
+ requests==2.31.0
74
+ rich==13.6.0
75
+ rpds-py==0.10.3
76
+ six==1.16.0
77
+ smmap==5.0.1
78
+ sniffio==1.3.0
79
+ SQLAlchemy==2.0.21
80
+ starlette==0.27.0
81
+ streamlit==1.27.1
82
+ sympy==1.12
83
+ tenacity==8.2.3
84
+ tiktoken==0.5.1
85
+ tokenizers==0.14.0
86
+ toml==0.10.2
87
+ toolz==0.12.0
88
+ tornado==6.3.3
89
+ tqdm==4.66.1
90
+ typer==0.9.0
91
+ typing-inspect==0.9.0
92
+ typing_extensions==4.8.0
93
+ tzdata==2023.3
94
+ tzlocal==5.0.1
95
+ urllib3==2.0.5
96
+ uvicorn==0.23.2
97
+ validators==0.22.0
98
+ watchdog==3.0.0
99
+ watchfiles==0.20.0
100
+ websockets==11.0.3
101
+ yarl==1.9.2
102
+ zipp==3.17.0