davidfearne committed on
Commit
51fdff8
·
verified ·
1 Parent(s): 13516e9

Upload 5 files

Browse files
Files changed (5) hide show
  1. Chunks_Complete.pkl +3 -0
  2. app.py +133 -0
  3. azure_openai.py +75 -0
  4. requirements.txt +5 -0
  5. retriver.py +140 -0
Chunks_Complete.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fff9bbf8fac7520ef32da021690c1f6af7726d328898561d2577f2c834cf0bd
3
+ size 6332157
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_core.messages import AIMessage, HumanMessage
3
+ from langchain_openai import ChatOpenAI
4
+
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+
8
+ # from langchain.chat_models import AzureChatOpenAI
9
+ from langchain_openai import AzureChatOpenAI
10
+ from langchain.schema import HumanMessage, SystemMessage
11
+ from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
12
+ from azure_openai import qt
13
+ from retriver import search_and_reconstruct
14
+ # Initialize an instance of AzureOpenAI using the specified settings
15
+ import pandas as pd
16
+
17
+
18
# LLM Langchain Definition
# SECURITY NOTE(review): a live-looking Azure OpenAI API key is hard-coded and
# committed to the repository. It should be rotated immediately and loaded from
# st.secrets or an environment variable instead of being embedded here.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"  # Azure-hosted OpenAI endpoint (not openai.com)
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # passed as deployment_name — this is the Azure *deployment* name
# Initialize an instance of AzureOpenAI using the specified settings
28
+
29
def get_response(chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage):
    """Stream the final, knowledge-grounded answer from Azure OpenAI.

    :param chat_history: List of AIMessage/HumanMessage objects (conversation so far).
    :param qte: Optimised query produced by azure_openai.qt (currently unused here).
    :param knowledge: Retrieved knowledge entries from search_and_reconstruct.
    :param temp1: Query-designer temperature (currently unused here).
    :param temp2: Temperature for the answering LLM.
    :param tokens1: Query-designer token cap (currently unused here).
    :param tokens2: max_tokens for the answering LLM.
    :param persona2SystemMessage: Sidebar system prompt (currently unused here).
    :param persona2UserMessage: Sidebar user prompt (currently unused here).
    :return: A string-chunk generator suitable for st.write_stream.

    NOTE(review): qte, temp1, tokens1, persona2SystemMessage and
    persona2UserMessage are accepted but never used — the prompt below is
    hard-coded. azure_openai.get_res shows the presumably intended wiring of
    the persona2 messages; confirm before connecting them here.
    """
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,  # Name of the Azure deployment
        temperature=temp2,
        max_tokens=tokens2
    )

    # FIX: corrected the garbled system prompt ("your are a helpful ai").
    system_message_template = SystemMessagePromptTemplate.from_template("You are a helpful AI")
    human_message_template = HumanMessagePromptTemplate.from_template("try and answer the questions {history}, knowledge {knowledge}")

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

    # Prompt -> LLM -> plain string, streamed chunk by chunk.
    chain = prompt | llm | StrOutputParser()

    return chain.stream({
        "history": chat_history,
        "knowledge": knowledge
    })
55
+
56
# Default text shown in the sidebar prompt editors until the user supplies real prompts.
placeHolderPersona1 = "place holder"

# app config
st.set_page_config(page_title="Reg Intel Chatbot", page_icon="🤖")
st.title("Reg Intel Toolbox :toolbox:")

# Sidebar for inputting personas
st.sidebar.title("RAG System Designer")
# st.sidebar.subheader("Welcome Message")
# welcomeMessage = st.sidebar.text_area("Define Intake Persona", value=welcomeMessage, height=300)
st.sidebar.header("Query Designer Config")
# numberOfQuestions = st.sidebar.slider("Number of Questions", min_value=0, max_value=10, step=1, value=5, key='persona1_questions')
# Settings for the query-transform step (azure_openai.qt): system prompt, sampling, token cap.
persona1SystemMessage = st.sidebar.text_area("Query Designer System Message", value=placeHolderPersona1, height=300)
temp1 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona1_temp')
tokens1 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona1_tokens')
st.sidebar.subheader("Number of Search Results")
# k = number of vector-search hits passed to search_and_reconstruct.
k = st.sidebar.slider("Returned Docs", min_value=1, max_value=10, step=1, value=3, key='k')

st.sidebar.header("Engineered Prompt Config")
# Settings for the answering step (get_response). NOTE(review): get_response
# currently ignores the persona2 messages and temp1/tokens1 — see its docs.
persona2SystemMessage = st.sidebar.text_area("Answer Creation System Message", value=placeHolderPersona1, height=300)
persona2UserMessage = st.sidebar.text_area("Answer Creation User Message", value=placeHolderPersona1, height=300)
temp2 = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, value=0.6, key='persona2_temp')
tokens2 = st.sidebar.slider("Tokens", min_value=0, max_value=4000, step=100, value=500, key='persona2_tokens')


# session state — chat history survives Streamlit reruns; seeded with a greeting.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        AIMessage(content="Hello, I am the GSK Reg Intel Assistant. How can I help you?"),
    ]


# conversation — replay the full history on every rerun.
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        with st.chat_message("AI"):
            st.write(message.content)
    elif isinstance(message, HumanMessage):
        with st.chat_message("Human"):
            st.write(message.content)


# user input
user_query = st.chat_input("Type your message here...")
if user_query is not None and user_query != "":
    st.session_state.chat_history.append(HumanMessage(content=user_query))

    with st.chat_message("Human"):
        st.markdown(user_query)

    with st.chat_message("AI"):
        # RAG pipeline: 1) rewrite history into a search query, 2) retrieve and
        # reconstruct grounding text, 3) stream the answer into the chat bubble.
        qte = qt(persona1SystemMessage, st.session_state.chat_history, temp1, tokens1)
        knowledge = search_and_reconstruct(qte, k)
        response = st.write_stream(get_response(st.session_state.chat_history, qte, knowledge, temp1, temp2, tokens1, tokens2, persona2SystemMessage, persona2UserMessage))

    st.session_state.chat_history.append(AIMessage(content=response))
    # Debug panel: show the optimised query and the retrieval hits.
    st.sidebar.header("QTE and Knowledge Results")
    st.sidebar.header("QTE")
    st.sidebar.text(qte)

    if knowledge:
        # Prepare the data for the table; .get('Score', ...) tolerates entries
        # that carry no score.
        table_data = {
            "Title": [entry['Title'] for entry in knowledge],
            "Score": [entry.get('Score', 'N/A') for entry in knowledge],
            "Page Number": [entry['PageNumber'] for entry in knowledge],
            # "Grounding Text": [entry['ReconstructedText'] for entry in knowledge]
        }

        # Create a dataframe for displaying as a table

        df = pd.DataFrame(table_data)

        # Display the table in the sidebar
        st.sidebar.write("### Knowledge Base Results")
        st.sidebar.dataframe(df)  # Adjust height as needed
    else:
        st.sidebar.write("No relevant knowledge base results found.")
azure_openai.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # openai
2
+ import os
3
+
4
+ # from langchain.chat_models import AzureChatOpenAI
5
+ from langchain_openai import AzureChatOpenAI
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
8
+
9
+ # Initialize an instance of AzureOpenAI using the specified settings
10
+
11
+
12
# LLM Langchain Definition
# SECURITY NOTE(review): this Azure OpenAI key is hard-coded (and duplicated in
# app.py / retriver.py). Rotate it and load from environment / secrets instead.
OPENAI_API_KEY = "86b631a9c0294e9698e327c59ff5ac2c"
OPENAI_API_TYPE = "azure"  # Azure-hosted OpenAI endpoint
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
# OPENAI_API_VERSION = "2024-02-01"
OPENAI_API_VERSION = "2024-08-01-preview"
# OPENAI_MODEL = "gpt4-turbo-1106"
OPENAI_MODEL = "gpt-4o"  # Azure *deployment* name, passed as deployment_name
# Initialize an instance of AzureOpenAI using the specified settings
22
+
23
def get_res(user_query, history, knowledge, systemMessgae, userMessage, temp, tokens):
    """Stream a knowledge-grounded answer from Azure OpenAI.

    :param user_query: The user's latest question (exposed to the template as
        ``{user_question}``).
    :param history: Conversation so far (template variable ``{history}``).
    :param knowledge: Retrieved grounding text (template variable ``{knowledge}``).
    :param systemMessgae: System-prompt template text (parameter name typo kept
        for caller compatibility).
    :param userMessage: Human-prompt template text.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: A string-chunk generator (``chain.stream``) for incremental display.
    """
    model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens  # Name of the deployment for identification
    )

    # Assemble the prompt from the caller-supplied system and human templates,
    # then pipe it through the model and down to a plain-string parser.
    pipeline = (
        ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(systemMessgae),
            HumanMessagePromptTemplate.from_template(userMessage),
        ])
        | model
        | StrOutputParser()
    )

    return pipeline.stream({
        "user_question": user_query,
        "history": history,
        "knowledge": knowledge,
    })
48
+
49
def qt(systemMessgae, history, temp, tokens):
    """Query-transform: condense the conversation history into one optimised search query.

    :param systemMessgae: System-prompt text (sidebar "Query Designer System
        Message"; parameter name typo kept for caller compatibility).
    :param history: Conversation messages to condense.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: The optimised query string (synchronous ``invoke``, not a stream).
    """
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=temp,
        max_tokens=tokens  # Name of the deployment for identification
    )
    # FIX: corrected the "Converstaion" typo in the prompt sent to the model.
    userMessage = """ Conversation to date: {history}
    Please create your optimised query


    """
    system_message_template = SystemMessagePromptTemplate.from_template(systemMessgae)
    human_message_template = HumanMessagePromptTemplate.from_template(userMessage)

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

    chain = prompt | llm | StrOutputParser()

    return chain.invoke({
        "history": history,
    })
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
pinecone
langchain
langchain_core
langchain-openai
streamlit
pandas
retriver.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Retriever function

from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings
import uuid
import pandas as pd
import streamlit as st
import os

# Initialize Pinecone client
# SECURITY NOTE(review): live-looking Pinecone and Azure OpenAI keys are
# hard-coded below; the commented-out st.secrets lines suggest they were meant
# to come from Streamlit secrets. Rotate both keys and restore the st.secrets
# lookups.
# pc = Pinecone(api_key=st.secrets["PC_API_KEY"])
pc = Pinecone(api_key="567aca04-6fb0-40a0-ba92-a5ed30be190b")
index = pc.Index("openai-serverless")

# Azure OpenAI configuration
# os.environ["AZURE_OPENAI_API_KEY"] = st.secrets["api_key"]
os.environ["AZURE_OPENAI_API_KEY"] = "86b631a9c0294e9698e327c59ff5ac2c"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://davidfearn-gpt4.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-08-01-preview"

# Model configuration — embedding model used to vectorise search queries.
embeddings_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Pre-chunked corpus loaded once at import time. Expected columns (per the
# function docstrings below — confirm against the pickle):
# ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
df_chunks = pd.read_pickle('Chunks_Complete.pkl')
30
def process_search_results(search_results):
    """
    Flatten Pinecone match records into plain result dictionaries.

    :param search_results: List of search result matches from Pinecone.
    :return: A list of dicts carrying the match id, similarity score, and the
             Title / ChunkText / PageNumber / Chunk metadata fields (an empty
             string stands in for any absent metadata field).
    """
    def _flatten(match):
        meta = match['metadata']
        return {
            "id": match['id'],
            "score": match['score'],
            "Title": meta.get('Title', ''),
            "ChunkText": meta.get('ChunkText', ''),
            "PageNumber": meta.get('PageNumber', ''),
            "Chunk": meta.get('Chunk', ''),
        }

    return [_flatten(match) for match in search_results]
50
+
51
def reconstruct_text_from_chunks(df_chunks):
    """
    Stitch chunk texts back together into one string, in chunk order.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :return: A single space-separated string of all chunk texts, ordered by 'Chunk'.
    """
    ordered_texts = df_chunks.sort_values(by=['Chunk'])['ChunkText']
    return " ".join(ordered_texts.tolist())
59
+
60
def lookup_related_chunks(df_chunks, chunk_id):
    """
    Collect every chunk from the same document on the target chunk's page and
    its immediate neighbours (previous and next page), clamped to the pages
    that actually exist for that document.

    :param df_chunks: DataFrame with columns ['Title', 'Chunk', 'ChunkText', 'TokenCount', 'PageNumber', 'ChunkID']
    :param chunk_id: The unique ID of the chunk to look up.
    :return: DataFrame with all chunks matching the title and page range of the specified chunk ID.
    :raises ValueError: If chunk_id does not appear in df_chunks.
    """
    match = df_chunks[df_chunks['ChunkID'] == chunk_id]
    if match.empty:
        raise ValueError("Chunk ID not found")

    anchor = match.iloc[0]
    title = anchor['Title']
    page = anchor['PageNumber']

    # Clamp the prev/current/next window to the document's real page span.
    doc_pages = df_chunks.loc[df_chunks['Title'] == title, 'PageNumber']
    lo, hi = doc_pages.min(), doc_pages.max()
    wanted_pages = [p for p in (page - 1, page, page + 1) if lo <= p <= hi]

    same_title = df_chunks['Title'] == title
    in_window = df_chunks['PageNumber'].isin(wanted_pages)
    return df_chunks[same_title & in_window]
84
+
85
def qt(systemMessgae, history, temp, tokens):
    """Generate an optimised search query from the chat history.

    BUG FIX: the original body called this very function (`qt`) with identical
    arguments, so any call recursed unboundedly and raised RecursionError.
    Given that app.py imports `qt` from azure_openai, the evident intent was
    to delegate to azure_openai.qt — do that instead.

    :param systemMessgae: System-prompt text for the query designer.
    :param history: Conversation messages to condense into a query.
    :param temp: Sampling temperature.
    :param tokens: max_tokens cap for the completion.
    :return: The optimised query string from azure_openai.qt.
    """
    from azure_openai import qt as _azure_qt  # local import avoids a module-level import cycle
    return _azure_qt(systemMessgae, history, temp, tokens)
88
+
89
def search_and_reconstruct(query, k):
    """
    Combines search, lookup of related chunks, and text reconstruction.

    :param query: The query string to search for.
    :param k: Number of top search results to retrieve.
    :return: A list of dicts with document title, similarity score, page
             number, and reconstructed grounding text.
    """
    search_results = search_knowledgebase(query, k)
    processed_results = process_search_results(search_results)

    reconstructed_results = []

    for result in processed_results:
        # Expand each hit to its page neighbourhood, then stitch the text back together.
        related_chunks = lookup_related_chunks(df_chunks, result['id'])
        reconstructed_text = reconstruct_text_from_chunks(related_chunks)

        reconstructed_results.append({
            "Title": result['Title'],
            # FIX: propagate the similarity score — app.py reads
            # entry.get('Score', 'N/A') and previously always showed 'N/A'
            # because this key was never emitted.
            "Score": result['score'],
            "PageNumber": result['PageNumber'],
            "ReconstructedText": reconstructed_text
        })

    return reconstructed_results
116
+
117
def search_knowledgebase(query, k, namespace="gskRegIntel"):
    """
    Embeds a query string and searches the vector database for similar entries.

    FIX: the original placed this docstring *after* the function's first
    statement, making it a discarded string expression rather than the
    function's docstring. The hard-coded namespace is now an overridable
    keyword parameter whose default preserves the original behaviour.

    :param query: The string to embed and search for.
    :param k: Number of top results to retrieve.
    :param namespace: Pinecone namespace to search within.
    :return: List of search results with metadata and scores; an empty list on
             failure (best-effort — callers render "no results" in that case).
    """
    try:
        # Generate embedding for the query
        query_embedding = embeddings_model.embed_query(query)

        # Perform search in Pinecone
        results = index.query(vector=query_embedding, top_k=k, namespace=namespace, include_metadata=True)

        return results.matches

    except Exception as e:
        # Deliberate best-effort: report and degrade to no results rather than
        # crashing the Streamlit app.
        print(f"Error during search: {e}")
        return []