# Source: Hugging Face Space IIIACSIC/CongresoRAG
# (viewer header captured with this file: "Build error"; "File size: 5,068 Bytes")

# Import
import urllib.parse
import streamlit as st
from RAG_public import RAG
from congreso import congreso as c
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, AIMessage


# Separate page_content and metadata
def get_pagecontent_metadata(data):
    """
    Separates page content and metadata of the given document.

    The input dict is not modified: all normalization happens on a copy, so
    calling this function repeatedly on the same record always yields the same
    result (the "pdf_url" prefix is never applied twice).

    Parameters
    ---------
        data: dict
            Document that has various features such as "id", "mensaje" and "texto"...
    Returns
    -------
        pagecontent_metadata: dict
            Key/value pairs for page content and metadata.
            "page_content" holds the value of "texto" (empty string if it is
            None or missing); "metadata" holds every other key of `data`, with
            None values replaced by empty strings. A non-empty "pdf_url" is
            percent-encoded and prefixed with "https://www.congreso.es".
    """

    search_base_url = "https://www.congreso.es"

    # Normalized copy: None becomes "", every other value is kept as is.
    # Building a new dict keeps the caller's record untouched.
    cleaned = {key: ("" if value is None else value) for key, value in data.items()}

    # Only a non-empty path is turned into an absolute URL; a missing or empty
    # "pdf_url" is left alone instead of raising KeyError.
    pdf_path = cleaned.get("pdf_url", "")
    if pdf_path != "":
        cleaned["pdf_url"] = search_base_url + urllib.parse.quote(pdf_path)

    # Defines pagecontent and metadata information
    pagecontent_metadata = {
        "metadata": {key: value for key, value in cleaned.items() if key != "texto"},
        "page_content": cleaned.get("texto", ""),
    }
    return pagecontent_metadata

# Load data
def read_data():
    """
    Returns list of documents after reading each document. Uses get_pagecontent_metadata function
    to separate content from metadata.

    The list starts with two documents built from the README text files under
    "About_CongresoRAG/", followed by at most the first 100 records that
    `c.load_jsons` returns for term "XV". If fewer than 100 records are
    available, all of them are used.

    Returns
    ----------
        docs: list
            langchain `Document` objects
    """

    term = "XV"
    max_documents = 100

    # Reads Readme txt files to get information about Congreso RAG and Dataset.
    # The encoding is given explicitly so decoding does not depend on the
    # platform's default locale.
    # NOTE(review): newlines are removed without inserting a space, so words at
    # line ends are joined together - confirm this is intended.
    with open("About_CongresoRAG/CongresoRAG-README.txt", encoding="utf-8") as file:
        CongresoRAG_readme = file.read().replace("\n", "")

    with open("About_CongresoRAG/Dataset-README.txt", encoding="utf-8") as file:
        Dataset_readme = file.read().replace("\n", "")

    # Put page_content and metadata of these txt file into Document format
    doc_CongresoRAG = Document(page_content=CongresoRAG_readme, metadata={"pdf_url":"https://huggingface.co/spaces/IIIACSIC/CongresoRAG/blob/main/About_CongresoRAG/CongresoRAG-README.txt"})
    doc_Dataset = Document(page_content=Dataset_readme, metadata={"pdf_url":"https://zenodo.org/records/11195944"})

    # Creates docs list to store each documents
    docs = [doc_CongresoRAG, doc_Dataset]

    # assumes load_jsons returns a mapping of term -> list of record dicts - TODO confirm
    records = c.load_jsons([term])[term]

    # Slicing (instead of indexing 0..99) avoids IndexError on short datasets.
    for record in records[:max_documents]:
        pagecontent_metadata = get_pagecontent_metadata(record)
        docs.append(
            Document(
                page_content=pagecontent_metadata["page_content"],
                metadata=pagecontent_metadata["metadata"],
            )
        )
    return docs

# UI (User Interface)
def main():
    """
    Renders the CongresoRAG chat page.

    Steps, in order:
      1. Sets page configuration, title and the credit line.
      2. Fills `st.session_state` on first use: the documents (via read_data),
         the RAG object built from them, and an empty chat history.
      3. Shows the chat input box and replays the stored chat history.
      4. If the user submitted a non-empty query, shows it, asks the RAG object
         for a response, shows the response, and stores both in the history.
    """

    # Page configuration
    st.set_page_config(page_title="CongresoRAG", page_icon="shark")
    st.title("CongresoRAG")
    st.markdown("<small><i style='color: grey;'>Designed by IIIA-CSIC</i></small>", unsafe_allow_html=True)

    state = st.session_state

    # One-time setup; each value is created only if it is not stored yet
    if "documents" not in state:
        state.documents = read_data()

    if "rag" not in state:
        state.rag = RAG(document=state.documents)
        state.rag.model()

    if "chat_history" not in state:
        state.chat_history = []

    # Input box; yields the submitted text for this run, if any
    user_query = st.chat_input("Message CongresoRAG")

    # Replay earlier messages: HumanMessage entries as "human", everything else as "ai"
    for past_message in state.chat_history:
        role = "human" if isinstance(past_message, HumanMessage) else "ai"
        with st.chat_message(role):
            st.markdown(past_message.content)

    # Nothing submitted (None or empty string): nothing more to do
    if not user_query:
        return

    state.chat_history.append(HumanMessage(user_query))
    with st.chat_message("human"):
        st.markdown(user_query)

    with st.chat_message("ai"):
        # The chain returns two parts: a text, and an iterable of strings that
        # are appended below it, separated by blank lines
        reply_head, reply_extras = state.rag.conversational_rag_chain(user_query)
        reply_text = "\n\n".join([reply_head + "\n", *reply_extras]) if False else reply_head + "\n\n" + "\n\n".join(reply_extras)
        state.chat_history.append(AIMessage(reply_text))
        st.markdown(reply_text)
            
# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()