davidfearne committed on
Commit
51fdff8
·
verified ·
1 Parent(s): 13516e9

Upload 5 files

Browse files
Files changed (5) hide show
  1. Chunks_Complete.pkl +3 -0
  2. app.py +133 -0
  3. azure_openai.py +75 -0
  4. requirements.txt +5 -0
  5. retriver.py +140 -0
Chunks_Complete.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fff9bbf8fac7520ef32da021690c1f6af7726d328898561d2577f2c834cf0bd
3
+ size 6332157
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_core.messages import AIMessage, HumanMessage
3
+ from langchain_openai import ChatOpenAI
4
+
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+
8
+ # from langchain.chat_models import AzureChatOpenAI
9
+ from langchain_openai import AzureChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
12
+ from azure_openai import qt
13
+ from retriver import search_and_reconstruct
14
+ # Initialize an instance of AzureOpenAI using the specified settings
15
+ import pandas as pd
16
+
17
+
18
# LLM Langchain Definition
# SECURITY NOTE(review): a live-looking Azure OpenAI API key is hard-coded and
# committed to the repository. It should be rotated immediately and loaded from
# st.secrets or an environment variable instead of being embedded here.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"  # Azure-hosted OpenAI endpoint (not openai.com)
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # passed as deployment_name — this is the Azure *deployment* name
# Initialize an instance of AzureOpenAI using the specified settings
28
+
29
def get_response(chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage):
    """Stream the final, knowledge-grounded answer from Azure OpenAI.

    :param chat_history: List of AIMessage/HumanMessage objects (conversation so far).
    :param qte: Optimised query produced by azure_openai.qt (currently unused here).
    :param knowledge: Retrieved knowledge entries from search_and_reconstruct.
    :param temp1: Query-designer temperature (currently unused here).
    :param temp2: Temperature for the answering LLM.
    :param tokens1: Query-designer token cap (currently unused here).
    :param tokens2: max_tokens for the answering LLM.
    :param persona2SystemMessage: Sidebar system prompt (currently unused here).
    :param persona2UserMessage: Sidebar user prompt (currently unused here).
    :return: A string-chunk generator suitable for st.write_stream.

    NOTE(review): qte, temp1, tokens1, persona2SystemMessage and
    persona2UserMessage are accepted but never used — the prompt below is
    hard-coded. azure_openai.get_res shows the presumably intended wiring of
    the persona2 messages; confirm before connecting them here.
    """
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,  # Name of the Azure deployment
        temperature=temp2,
        max_tokens=tokens2
    )

    # FIX: corrected the garbled system prompt ("your are a helpful ai").
    system_message_template = SystemMessagePromptTemplate.from_template("You are a helpful AI")
    human_message_template = HumanMessagePromptTemplate.from_template("try and answer the questions {history}, knowledge {knowledge}")

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

    # Prompt -> LLM -> plain string, streamed chunk by chunk.
    chain = prompt | llm | StrOutputParser()

    return chain.stream({
        "history": chat_history,
        "knowledge": knowledge
    })
55
+
56
# Default text shown in the sidebar prompt editors until the user supplies real prompts.
placeHolderPersona1 = "place holder"

# app config
st.set_page_config(page_title="Reg Intel Chatbot", page_icon="🤖")
st.title("Reg Intel Toolbox :toolbox:")

# Sidebar for inputting personas
st.sidebar.title("RAG System Designer")
# st.sidebar.subheader("Welcome Message")
# welcomeMessage = st.sidebar.text_area("Define Intake Persona", value=welcomeMessage, height=300)
st.sidebar.header("Query Designer Config")
# numberOfQuestions = st.sidebar.slider("Number of Questions", min_value=0, max_value=10, step=1, value=5, key='persona1_questions')
# Settings for the query-transform step (azure_openai.qt): system prompt, sampling, token cap.
persona1SystemMessage = st.sidebar.text_area("Query Designer System Message", value=placeHolderPersona1, height=300)
temp1 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona1_temp')
tokens1 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona1_tokens')
st.sidebar.subheader("Number of Search Results")
# k = number of vector-search hits passed to search_and_reconstruct.
k = st.sidebar.slider("Returned Docs", min_value=1, max_value=10, step=1, value=3, key='k')

st.sidebar.header("Engineered Prompt Config")
# Settings for the answering step (get_response). NOTE(review): get_response
# currently ignores the persona2 messages and temp1/tokens1 — see its docs.
persona2SystemMessage = st.sidebar.text_area("Answer Creation System Message", value=placeHolderPersona1, height=300)
persona2UserMessage = st.sidebar.text_area("Answer Creation User Message", value=placeHolderPersona1, height=300)
temp2 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona2_temp')
tokens2 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona2_tokens')


# session state — chat history survives Streamlit reruns; seeded with a greeting.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        AIMessage(content="Hello, I am the GSK Reg Intel Assistant. How can I help you?"),
    ]


# conversation — replay the full history on every rerun.
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        with st.chat_message("AI"):
            st.write(message.content)
    elif isinstance(message, HumanMessage):
        with st.chat_message("Human"):
            st.write(message.content)


# user input
user_query = st.chat_input("Type your message here...")
if user_query is not None and user_query != "":
    st.session_state.chat_history.append(HumanMessage(content=user_query))

    with st.chat_message("Human"):
        st.markdown(user_query)

    with st.chat_message("AI"):
        # RAG pipeline: 1) rewrite history into a search query, 2) retrieve and
        # reconstruct grounding text, 3) stream the answer into the chat bubble.
        qte = qt(persona1SystemMessage, st.session_state.chat_history, temp1, tokens1)
        knowledge = search_and_reconstruct(qte, k)
        response = st.write_stream(get_response(st.session_state.chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage))

    st.session_state.chat_history.append(AIMessage(content=response))
    # Debug panel: show the optimised query and the retrieval hits.
    st.sidebar.header("QTE and Knowledge Results")
    st.sidebar.header("QTE")
    st.sidebar.text(qte)

    if knowledge:
        # Prepare the data for the table; .get('Score', ...) tolerates entries
        # that carry no score.
        table_data = {
            "Title": [entry['Title'] for entry in knowledge],
            "Score": [entry.get('Score', 'N/A') for entry in knowledge],
            "Page Number": [entry['PageNumber'] for entry in knowledge],
            # "Grounding Text": [entry['ReconstructedText'] for entry in knowledge]
        }

        # Create a dataframe for displaying as a table

        df = pd.DataFrame(table_data)

        # Display the table in the sidebar
        st.sidebar.write("### Knowledge Base Results")
        st.sidebar.dataframe(df)  # Adjust height as needed
    else:
        st.sidebar.write("No relevant knowledge base results found.")
azure_openai.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # openai
2
+ import os
3
+
4
+ # from langchain.chat_models import AzureChatOpenAI
5
+ from langchain_openai import AzureChatOpenAI
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
8
+
9
+ # Initialize an instance of AzureOpenAI using the specified settings
10
+
11
+
12
# LLM Langchain Definition
# SECURITY NOTE(review): this Azure OpenAI key is hard-coded (and duplicated in
# app.py / retriver.py). Rotate it and load from environment / secrets instead.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"  # Azure-hosted OpenAI endpoint
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # Azure *deployment* name, passed as deployment_name
# Initialize an instance of AzureOpenAI using the specified settings
22
+
23
def get_res(user_query, history, knowledge, systemMessgae, userMessage, temp, tokens):
    """Stream a knowledge-grounded answer from Azure OpenAI.

    :param user_query: The user's latest question (exposed to the template as
        ``{user_question}``).
    :param history: Conversation so far (template variable ``{history}``).
    :param knowledge: Retrieved grounding text (template variable ``{knowledge}``).
    :param systemMessgae: System-prompt template text (parameter name typo kept
        for caller compatibility).
    :param userMessage: Human-prompt template text.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: A string-chunk generator (``chain.stream``) for incremental display.
    """
    model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens  # Name of the deployment for identification
    )

    # Assemble the prompt from the caller-supplied system and human templates,
    # then pipe it through the model and down to a plain-string parser.
    pipeline = (
        ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(systemMessgae),
            HumanMessagePromptTemplate.from_template(userMessage),
        ])
        | model
        | StrOutputParser()
    )

    return pipeline.stream({
        "user_question": user_query,
        "history": history,
        "knowledge": knowledge,
    })
48
+
49
def qt(systemMessgae, history, temp, tokens):
    """Query-transform: condense the conversation history into one optimised search query.

    :param systemMessgae: System-prompt text (sidebar "Query Designer System
        Message"; parameter name typo kept for caller compatibility).
    :param history: Conversation messages to condense.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: The optimised query string (synchronous ``invoke``, not a stream).
    """
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens  # Name of the deployment for identification
    )
    # FIX: corrected the "Converstaion" typo in the prompt sent to the model.
    userMessage = """ Conversation to date: {history}
    Please create your optimised query


    """
    system_message_template = SystemMessagePromptTemplate.from_template(systemMessgae)
    human_message_template = HumanMessagePromptTemplate.from_template(userMessage)

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

    chain = prompt | llm | StrOutputParser()

    return chain.invoke({
        "history": history,
    })
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
pinecone
langchain
langchain_core
langchain-openai
streamlit
pandas
retriver.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Retriever function

from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os

# Initialize Pinecone client
# SECURITY NOTE(review): live-looking Pinecone and Azure OpenAI keys are
# hard-coded below; the commented-out st.secrets lines suggest they were meant
# to come from Streamlit secrets. Rotate both keys and restore the st.secrets
# lookups.
# pc = Pinecone(api_key=st.secrets["PC_API_KEY"])
pc = Pinecone(api_key="567aca04-6fb0-40a0-ba92-a5ed30be190b")
index = pc.Index("openai-serverless")

# Azure OpenAI configuration
# os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["api_key"]
os.environ["AZURE_OPENAI_API_KEY"] = "86b631a9c0294e9698e327c59ff5ac2c"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"

# Model configuration — embedding model used to vectorise search queries.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Pre-chunked corpus loaded once at import time. Expected columns (per the
# function docstrings below — confirm against the pickle):
# ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
30
def process_search_results(search_results):
    """
    Flatten Pinecone match records into plain result dictionaries.

    :param search_results: List of search result matches from Pinecone.
    :return: A list of dicts carrying the match id, similarity score, and the
             Title / ChunkText / PageNumber / Chunk metadata fields (an empty
             string stands in for any absent metadata field).
    """
    def _flatten(match):
        meta = match['metadata']
        return {
            "id": match['id'],
            "score": match['score'],
            "Title": meta.get('Title', ''),
            "ChunkText": meta.get('ChunkText', ''),
            "PageNumber": meta.get('PageNumber', ''),
            "Chunk": meta.get('Chunk', ''),
        }

    return [_flatten(match) for match in search_results]
50
+
51
def reconstruct_text_from_chunks(df_chunks):
    """
    Stitch chunk texts back together into one string, in chunk order.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :return: A single space-separated string of all chunk texts, ordered by 'Chunk'.
    """
    ordered_texts = df_chunks.sort_values(by=['Chunk'])['ChunkText']
    return " ".join(ordered_texts.tolist())
59
+
60
def lookup_related_chunks(df_chunks, chunk_id):
    """
    Collect every chunk from the same document on the target chunk's page and
    its immediate neighbours (previous and next page), clamped to the pages
    that actually exist for that document.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :return: DataFrame with all chunks matching the title and page range of the specified chunk ID.
    :raises ValueError: If chunk_id does not appear in df_chunks.
    """
    match = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if match.empty:
        raise ValueError("Chunk ID not found")

    anchor = match.iloc[0]
    title = anchor['Title']
    page = anchor['PageNumber']

    # Clamp the prev/current/next window to the document's real page span.
    doc_pages = df_chunks.loc[df_chunks['Title'] == title, 'PageNumber']
    lo, hi = doc_pages.min(), doc_pages.max()
    wanted_pages = [p for p in (page - 1, page, page + 1) if lo <= p <= hi]

    same_title = df_chunks['Title'] == title
    in_window = df_chunks['PageNumber'].isin(wanted_pages)
    return df_chunks[same_title & in_window]
84
+
85
def qt(systemMessgae, history, temp, tokens):
    """Generate an optimised search query from the chat history.

    BUG FIX: the original body called this very function (`qt`) with identical
    arguments, so any call recursed unboundedly and raised RecursionError.
    Given that app.py imports `qt` from azure_openai, the evident intent was
    to delegate to azure_openai.qt — do that instead.

    :param systemMessgae: System-prompt text for the query designer.
    :param history: Conversation messages to condense into a query.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: The optimised query string from azure_openai.qt.
    """
    from azure_openai import qt as _azure_qt  # local import avoids a module-level import cycle
    return _azure_qt(systemMessgae, history, temp, tokens)
88
+
89
def search_and_reconstruct(query, k):
    """
    Combines search, lookup of related chunks, and text reconstruction.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :return: A list of dicts with document title, similarity score, page
             number, and reconstructed grounding text.
    """
    search_results = search_knowledgebase(query, k)
    processed_results = process_search_results(search_results)

    reconstructed_results = []

    for result in processed_results:
        # Expand each hit to its page neighbourhood, then stitch the text back together.
        related_chunks = lookup_related_chunks(df_chunks, result['id'])
        reconstructed_text = reconstruct_text_from_chunks(related_chunks)

        reconstructed_results.append({
            "Title": result['Title'],
            # FIX: propagate the similarity score — app.py reads
            # entry.get('Score', 'N/A') and previously always showed 'N/A'
            # because this key was never emitted.
            "Score": result['score'],
            "PageNumber": result['PageNumber'],
            "ReconstructedText": reconstructed_text
        })

    return reconstructed_results
116
+
117
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    FIX: the original placed this docstring *after* the function's first
    statement, making it a discarded string expression rather than the
    function's docstring. The hard-coded namespace is now an overridable
    keyword parameter whose default preserves the original behaviour.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within.
    :return: List of search results with metadata and scores; an empty list on
             failure (best-effort — callers render "no results" in that case).
    """
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)

        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)

        return results.matches

    except Exception as e:
        # Deliberate best-effort: report and degrade to no results rather than
        # crashing the Streamlit app.
        print(f"Error during search: {e}")
        return []