Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- Chunks_Complete.pkl +3 -0
- app.py +133 -0
- azure_openai.py +75 -0
- requirements.txt +5 -0
- retriver.py +140 -0
Chunks_Complete.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fff9bbf8fac7520ef32da021690c1f6af7726d328898561d2577f2c834cf0bd
|
| 3 |
+
size 6332157
|
app.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from langchain_core.messages import AIMessage, HumanMessage
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
|
| 5 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 6 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 7 |
+
|
| 8 |
+
# from langchain.chat_models import AzureChatOpenAI
|
| 9 |
+
from langchain_openai import AzureChatOpenAI
|
| 10 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 11 |
+
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
|
| 12 |
+
from azure_openai import qt
|
| 13 |
+
from retriver import search_and_reconstruct
|
| 14 |
+
# Initialize an instance of AzureOpenAI using the specified settings
|
| 15 |
+
import pandas as pd
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# LLM Langchain Definition
# SECURITY NOTE(review): this API key is committed in plain text (and is
# duplicated in azure_openai.py and retriver.py). Rotate the key and load it
# from st.secrets / environment variables instead of hard-coding it here.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"  # Azure OpenAI REST API version
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # Azure *deployment* name, passed as deployment_name below
# Initialize an instance of AzureOpenAI using the specified settings
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_response(chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage):
    """Stream a grounded answer for the current conversation.

    :param chat_history: LangChain messages bound to the {history} slot.
    :param qte: optimised query text (accepted but currently unused here).
    :param knowledge: retrieval results bound to the {knowledge} slot.
    :param temp1: persona-1 temperature (accepted but unused here).
    :param temp2: sampling temperature for the answer model.
    :param tokens1: persona-1 token cap (accepted but unused here).
    :param tokens2: max_tokens cap for the answer model.
    :param persona2SystemMessage: sidebar system prompt (accepted but unused —
        the templates below are hard-coded; NOTE(review) for a future fix).
    :param persona2UserMessage: sidebar user prompt (accepted but unused).
    :return: a generator of streamed response chunks from chain.stream().
    """
    answer_llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp2,
        max_tokens=tokens2,
    )

    # Hard-coded prompt pair; the sidebar persona-2 prompts are not wired in.
    message_templates = [
        SystemMessagePromptTemplate.from_template("your are a helpful ai"),
        HumanMessagePromptTemplate.from_template("try and answer the questions {history}, knowledge {knowledge}"),
    ]
    answer_chain = ChatPromptTemplate.from_messages(message_templates) | answer_llm | StrOutputParser()

    return answer_chain.stream({"history": chat_history, "knowledge": knowledge})
|
| 55 |
+
|
| 56 |
+
# Default placeholder text shown in every sidebar prompt text area.
placeHolderPersona1 = "place holder"

# app config
st.set_page_config(page_title="Reg Intel Chatbot", page_icon="🤖")
st.title("Reg Intel Toolbox :toolbox:")

# Sidebar for inputting personas
st.sidebar.title("RAG System Designer")
# st.sidebar.subheader("Welcome Message")
# welcomeMessage = st.sidebar.text_area("Define Intake Persona", value=welcomeMessage, height=300)
st.sidebar.header("Query Designer Config")
# numberOfQuestions = st.sidebar.slider("Number of Questions", min_value=0, max_value=10, step=1, value=5, key='persona1_questions')
# Persona 1 = the "query designer" that rewrites chat history into a search query.
persona1SystemMessage = st.sidebar.text_area("Query Designer System Message", value=placeHolderPersona1, height=300)
temp1 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona1_temp')
tokens1 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona1_tokens')
st.sidebar.subheader("Number of Search Results")
# k = number of documents retrieved from the vector store per query.
k = st.sidebar.slider("Returned Docs", min_value=1, max_value=10, step=1, value=3, key='k')

st.sidebar.header("Engineered Prompt Config")
# Persona 2 = the "answer creator". NOTE(review): these two prompts are passed
# to get_response() but that function does not currently use them.
persona2SystemMessage = st.sidebar.text_area("Answer Creation System Message", value=placeHolderPersona1, height=300)
persona2UserMessage = st.sidebar.text_area("Answer Creation User Message", value=placeHolderPersona1, height=300)
temp2 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona2_temp')
tokens2 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona2_tokens')


# session state — seed the conversation once per browser session.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        AIMessage(content="Hello, I am the GSK Reg Intel Assistant. How can I help you?"),
    ]


# conversation — replay the stored history into the chat pane on every rerun.
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        with st.chat_message("AI"):
            st.write(message.content)
    elif isinstance(message, HumanMessage):
        with st.chat_message("Human"):
            st.write(message.content)


# user input
user_query = st.chat_input("Type your message here...")
if user_query is not None and user_query != "":
    st.session_state.chat_history.append(HumanMessage(content=user_query))

    with st.chat_message("Human"):
        st.markdown(user_query)

    with st.chat_message("AI"):
        # Pipeline: 1) qt() rewrites the conversation into an optimised query,
        # 2) search_and_reconstruct() retrieves grounding text for it,
        # 3) get_response() streams the final answer into the chat pane.
        qte = qt(persona1SystemMessage, st.session_state.chat_history, temp1, tokens1)
        knowledge = search_and_reconstruct(qte, k)
        response = st.write_stream(get_response(st.session_state.chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage))

    st.session_state.chat_history.append(AIMessage(content=response))
    # Debug panel: surface the intermediate query and retrieval hits.
    st.sidebar.header("QTE and Knowledge Results")
    st.sidebar.header("QTE")
    st.sidebar.text(qte)

    if knowledge:
        # Prepare the data for the table
        table_data = {
            "Title": [entry['Title'] for entry in knowledge],
            # NOTE(review): falls back to 'N/A' unless the retriever supplies
            # a 'Score' key in each knowledge entry — verify against retriver.py.
            "Score": [entry.get('Score', 'N/A') for entry in knowledge],
            "Page Number": [entry['PageNumber'] for entry in knowledge],
            # "Grounding Text": [entry['ReconstructedText'] for entry in knowledge]
        }

        # Create a dataframe for displaying as a table

        df = pd.DataFrame(table_data)

        # Display the table in the sidebar
        st.sidebar.write("### Knowledge Base Results")
        st.sidebar.dataframe(df)  # Adjust height as needed
    else:
        st.sidebar.write("No relevant knowledge base results found.")
|
azure_openai.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# openai
import os

# from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

# Initialize an instance of AzureOpenAI using the specified settings


# LLM Langchain Definition
# SECURITY NOTE(review): this API key is committed in plain text and is
# duplicated in app.py and retriver.py. Rotate it and load it from
# st.secrets / environment variables instead of hard-coding it.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"  # Azure OpenAI REST API version
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # Azure *deployment* name, passed as deployment_name
# Initialize an instance of AzureOpenAI using the specified settings
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_res(user_query, history, knowledge, systemMessgae, userMessage, temp, tokens):
    """Stream an answer built from caller-supplied prompt templates.

    :param user_query: latest user question, bound to {user_question}.
    :param history: conversation so far, bound to {history}.
    :param knowledge: retrieved grounding text, bound to {knowledge}.
    :param systemMessgae: system prompt template text ([sic] spelling is part
        of the public signature, so it is kept).
    :param userMessage: human prompt template text.
    :param temp: sampling temperature for the completion.
    :param tokens: max_tokens cap for the completion.
    :return: a generator of streamed response chunks.
    """
    model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens,
    )

    # Both prompt halves come from the caller (sidebar text areas in app.py).
    chat_prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(systemMessgae),
        HumanMessagePromptTemplate.from_template(userMessage),
    ])

    pipeline = chat_prompt | model | StrOutputParser()
    inputs = {
        "user_question": user_query,
        "history": history,
        "knowledge": knowledge,
    }
    return pipeline.stream(inputs)
|
| 48 |
+
|
| 49 |
+
def qt(systemMessgae, history, temp, tokens):
    """Query-transform ("qt"): rewrite the chat history into a single
    optimised retrieval query.

    :param systemMessgae: system prompt text ([sic] spelling is part of the
        public signature).
    :param history: conversation so far, injected into the {history} slot.
    :param temp: sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: the optimised query string (synchronous invoke, not streamed).
    """
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens # Name of the deployment for identification
    )
    # Runtime prompt template — left byte-for-byte as authored (including the
    # "Converstaion" typo), since changing it would change model behavior.
    userMessage = """ Converstaion to date: {history}
    Please create your optimised query


    """
    system_message_template = SystemMessagePromptTemplate.from_template(systemMessgae)
    human_message_template = HumanMessagePromptTemplate.from_template(userMessage)

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

    chain = prompt | llm | StrOutputParser()

    return chain.invoke({
        "history": history,
    })
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pinecone
langchain
langchain_core
langchain-openai
streamlit
pandas
|
retriver.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retriever function

from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os
# Initialize Pinecone client
# pc = Pinecone(api_key=st.secrets["PC_API_KEY"])
# SECURITY NOTE(review): Pinecone API key committed in plain text below —
# rotate it and restore the st.secrets lookup commented out above.
pc = Pinecone(api_key="567aca04-6fb0-40a0-ba92-a5ed30be190b")
index = pc.Index("openai-serverless")

# Azure OpenAI configuration
# os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["api_key"]
# SECURITY NOTE(review): same Azure key hard-coded here as in app.py and
# azure_openai.py — rotate and load from st.secrets / the environment.
os.environ["AZURE_OPENAI_API_KEY"] = "86b631a9c0294e9698e327c59ff5ac2c"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"

# Model configuration
# Embedding model used to vectorise queries before searching Pinecone.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Chunk corpus loaded once at import time; expected columns (per the helpers
# below): Title, Chunk, ChunkText, TokenCount, PageNumber, ChunkID.
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
|
| 29 |
+
|
| 30 |
+
def process_search_results(search_results):
    """Flatten Pinecone matches into plain dicts of ids, scores and metadata.

    :param search_results: list of search result matches from Pinecone.
    :return: list of dicts with keys id, score, Title, ChunkText,
        PageNumber and Chunk (metadata fields default to '' when absent).
    """
    META_FIELDS = ('Title', 'ChunkText', 'PageNumber', 'Chunk')

    flattened = []
    for match in search_results:
        record = {"id": match['id'], "score": match['score']}
        meta = match['metadata']
        for field in META_FIELDS:
            record[field] = meta.get(field, '')
        flattened.append(record)
    return flattened
|
| 50 |
+
|
| 51 |
+
def reconstruct_text_from_chunks(df_chunks):
    """Join a chunk DataFrame back into one continuous string of text.

    :param df_chunks: DataFrame with (at least) 'Chunk' and 'ChunkText'
        columns, as produced by lookup_related_chunks().
    :return: the chunk texts concatenated with single spaces, in 'Chunk' order.
    """
    ordered = df_chunks.sort_values(by=['Chunk'])
    return " ".join(ordered['ChunkText'].tolist())
|
| 59 |
+
|
| 60 |
+
def lookup_related_chunks(df_chunks, chunk_id):
    """Return every chunk of the same document on the hit page and its
    immediate neighbours (previous/next page), clamped to pages that exist.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText',
        'TokenCount', 'PageNumber', 'ChunkID'].
    :param chunk_id: unique ID of the chunk to look up.
    :return: DataFrame of all chunks for that title within the page window.
    :raises ValueError: if *chunk_id* is not present in *df_chunks*.
    """
    hits = df_chunks.loc[df_chunks['ChunkID'] == chunk_id]
    if hits.empty:
        raise ValueError("Chunk ID not found")

    anchor = hits.iloc[0]
    title = anchor['Title']
    page = anchor['PageNumber']

    # Clamp the 3-page window to the document's actual page range.
    same_doc = df_chunks['Title'] == title
    doc_pages = df_chunks.loc[same_doc, 'PageNumber']
    lo, hi = doc_pages.min(), doc_pages.max()
    window = [p for p in (page - 1, page, page + 1) if lo <= p <= hi]

    return df_chunks.loc[same_doc & df_chunks['PageNumber'].isin(window)]
|
| 84 |
+
|
| 85 |
+
def qt(systemMessgae, history, temp, tokens):
    """Delegate query-transformation to azure_openai.qt.

    BUG FIX: the original body was ``res = qt(systemMessgae, history, temp,
    tokens)`` — it called *itself*, so any invocation recursed without bound
    and raised RecursionError. It was evidently meant to forward to the qt()
    defined in azure_openai.py (the one app.py imports directly), so this
    now delegates there.

    :param systemMessgae: system prompt text ([sic] spelling kept — it is
        part of the public signature).
    :param history: chat history injected into the {history} template slot.
    :param temp: sampling temperature for the LLM call.
    :param tokens: max_tokens cap for the LLM call.
    :return: the optimised query string produced by azure_openai.qt.
    """
    # Deferred import keeps module load order flexible and avoids any cycle.
    from azure_openai import qt as _azure_qt
    res = _azure_qt(systemMessgae, history, temp, tokens)
    return res
|
| 88 |
+
|
| 89 |
+
def search_and_reconstruct(query, k):
    """Search the vector DB, expand each hit to its page neighbourhood, and
    reconstruct readable grounding text.

    :param query: the query string to embed and search for.
    :param k: number of top search results to retrieve.
    :return: list of dicts with keys Title, Score, PageNumber and
        ReconstructedText (one entry per search hit).
    """
    search_results = search_knowledgebase(query, k)
    processed_results = process_search_results(search_results)

    reconstructed_results = []
    for result in processed_results:
        # Pull the hit's page plus its neighbours and stitch the text back together.
        related_chunks = lookup_related_chunks(df_chunks, result['id'])
        reconstructed_text = reconstruct_text_from_chunks(related_chunks)

        reconstructed_results.append({
            "Title": result['Title'],
            # FIX: app.py's sidebar table reads entry.get('Score', 'N/A');
            # the original omitted the score, so the column always showed
            # 'N/A'. Surfacing the Pinecone match score here is a
            # backward-compatible key addition.
            "Score": result['score'],
            "PageNumber": result['PageNumber'],
            "ReconstructedText": reconstructed_text,
        })

    return reconstructed_results
|
| 116 |
+
|
| 117 |
+
def search_knowledgebase(query, k):
    """Embed *query* and search the Pinecone index for similar entries.

    FIX: the docstring was previously placed after the first statement, making
    it a no-op string expression rather than the function's docstring; it is
    now in the correct position. Behavior is otherwise unchanged.

    :param query: the string to embed and search for.
    :param k: number of top results to retrieve.
    :return: list of Pinecone matches (with metadata and scores), or [] on error.
    """
    # All GSK reg-intel vectors live under this Pinecone namespace.
    namespace = "gskRegIntel"

    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)

        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)

        return results.matches

    except Exception as e:
        # Deliberate best-effort: report and return an empty hit list so the
        # Streamlit UI degrades gracefully instead of crashing.
        print(f"Error during search: {e}")
        return []
|