Spaces:

AptusAI
/

Chat-EUR-Lex

Build error

App Files Files Community

MattiaSangermano committed on May 17, 2024

Commit

823b9f5

1 Parent(s): f3a325b

Added app code

Browse files

Files changed (5) hide show

EurLexChat.py +272 -0
app.py +144 -0
chat_utils.py +100 -0
config.yaml +40 -0
output/OaaEABb.json +0 -0

EurLexChat.py ADDED Viewed

	@@ -0,0 +1,272 @@

+from langchain_community.vectorstores import Qdrant
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.runnables.base import RunnableLambda
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.tools import StructuredTool
+from langchain_core.utils.function_calling import convert_to_openai_tool
+from langchain_core.messages import AIMessage
+from typing import List
+from chat_utils import get_init_modules, SYSTEM_PROMPT, SYSTEM_PROMPT_LOOP, ContextInput, Answer
+from langchain_core.documents.base import Document
+class EurLexChat:
+    def __init__(self, config: dict):
+        self.config = config
+        self.max_history_messages = self.config["max_history_messages"]
+        self.use_functions = (
+            'use_context_function' in config["llm"] and
+            config["llm"]["use_context_function"] and
+            config["llm"]["class"] == "ChatOpenAI")
+        self.embedder, self.llm, self.chatDB_class, self.retriever = get_init_modules(config)
+        self.max_context_size = config["llm"]["max_context_size"]
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", SYSTEM_PROMPT),
+            MessagesPlaceholder(variable_name="history"),
+            ("human", "{question}"),
+        ])
+        self.prompt_loop = ChatPromptTemplate.from_messages([
+            ("system", SYSTEM_PROMPT_LOOP),
+            ("human", "History: {history}. Message:"),
+        ])
+        self.chain_loop_answer = ( self.prompt_loop | self.llm )
+        if self.use_functions:
+            GET_CONTEXT_TOOL = StructuredTool.from_function(
+                func=self.get_context,
+                name="get_context",
+                description="To be used whenever the provided context is empty or the user changes the topic of the conversation and you need the context for the topic. " +
+                "This function must be called only when is strictly necessary. " +
+                "This function must not be called if you already have the information to answer the user. ",
+                args_schema=ContextInput
+            )
+            # self.llm = self.llm.bind(tools=[convert_to_openai_tool(GET_CONTEXT_TOOL)])
+            self.llm_with_functions = self.llm.bind(tools=[convert_to_openai_tool(GET_CONTEXT_TOOL)])
+            chain = self.prompt | RunnableLambda(self._resize_history) | self.llm_with_functions
+        else:
+            chain = self.prompt | RunnableLambda(self._resize_history) | self.llm
+        self.chain_with_history = RunnableWithMessageHistory(
+            chain,
+            self.get_chat_history,
+            input_messages_key="question",
+            history_messages_key="history",
+        )
+        self.relevant_documents_pipeline = ( self.retriever | self._parse_documents )
+    def _resize_history(self, input_dict):
+        """
+        Resize the message history.
+        Args:
+            input_dict: The llm input containing the message history.
+        Returns:
+            dict: The resized version of the input_dict.
+        """
+        messages = input_dict.messages
+        if (len(messages) - 2) > self.max_history_messages:
+            messages = [messages[0]] + messages[-(self.max_history_messages +1):]
+            input_dict.messages = messages
+        return input_dict
+    def get_chat_history(self, session_id: str):
+        """
+        Retrieve chat history instance for a specific session ID.
+        Args:
+            session_id (str): The unique identifier for the session.
+        Returns:
+            Chat history object: An instance of the appropriate chat history class.
+        """
+        kwargs = self.config["chatDB"]["kwargs"]
+        if self.config["chatDB"]["class"] == 'FileChatMessageHistory':
+            file_path = f"{kwargs['output_path']}/{session_id}.json"
+            return self.chatDB_class(file_path=file_path)
+        else:
+            return self.chatDB_class(session_id=session_id, **kwargs)
+    def _parse_documents(self, docs: List[Document]) -> List[dict]:
+        """
+        Parse a list of documents into a standardized format.
+        Args:
+            docs (List[Document]): A list of documents to parse.
+        Returns:
+            List[dict]: A list of dictionaries, each containing parsed information from the input documents.
+        """
+        parsed_documents = []
+        for doc in docs:
+            parsed_documents.append({
+                'text': doc.page_content,
+                'source': doc.metadata["source"],
+                '_id': doc.metadata["_id"]
+            })
+        return parsed_documents
+    def _format_context_docs(self, context_docs: List[dict]) -> str:
+        """
+        Format a list of documents into a single string.
+        Args:
+            context_docs (List[dict]): A list of dictionaries containing text from context documents.
+        Returns:
+            str: A string containing the concatenated text from all context documents.
+        """
+        context_str = ''
+        for doc in context_docs:
+            context_str += doc['text'] + "\n\n"
+        return context_str
+    def get_relevant_docs(self, question:str) -> List[dict]:
+        """
+        Retrieve relevant documents based on a given question.
+        Args:
+            question (str): The question for which relevant documents are retrieved.
+        Returns:
+            List[dict]: A list of relevant documents.
+        """
+        docs = self.relevant_documents_pipeline.invoke(question)
+        return docs
+    def get_context(self, text:str) -> str:
+        """
+        Retrieve context for a given text.
+        Args:
+            text (str): The text for which context is retrieved.
+        Returns:
+            str: A formatted string containing the relevant documents texts.
+        """
+        docs = self.get_relevant_docs(text)
+        return self._format_context_docs(docs)
+    def _remove_last_messages(self, session_id:str, n:int) -> None:
+        """
+        Remove last n messages from the chat history of a specific session.
+        Args:
+            session_id (str): The session ID for which messages are removed.
+            n (int): The number of last messages to remove.
+        """
+        chat_history = self.get_chat_history(session_id=session_id)
+        message_history = chat_history.messages
+        chat_history.clear()
+        message_history = message_history[:-n]
+        for message in message_history:
+            chat_history.add_message(message)
+    def _format_history(self, session_id:str) -> str:
+        """
+        Format chat history for a specific session into a string.
+        Args:
+            session_id (str): The session ID for which the chat history is formatted.
+        Returns:
+            str: A formatted string containing the chat history for the specified session.
+        """
+        chat_history = self.get_chat_history(session_id).messages
+        formatted_history = ""
+        for message in chat_history:
+            formatted_history += f"{message.type}: {message.content}\n\n"
+        return formatted_history
+    def _resize_context(self, context_docs:List[dict]) -> List[dict]:
+        """
+        Resize the dimension of the context in terms of number of tokens.
+        If the concatenation of document text exceeds max_context_size,
+        the document text is cut off to meet the limit.
+        Args:
+            context_docs (List[dict]): List of formatted documents.
+        Returns:
+            List[dict]: Returns the list of resized documents.
+        """
+        lengths = [self.llm.get_num_tokens(doc['text']) for doc in context_docs]
+        resized_contexts = []
+        total_len = 0
+        for i, l in enumerate(lengths):
+            if l + total_len <= self.max_context_size:
+                resized_contexts.append(context_docs[i])
+                total_len += l
+        return resized_contexts
+    def get_answer(self, session_id:str, question:str, context_docs:List[dict], from_tool:bool=False) -> Answer:
+        """
+        Get an answer to a question of a specific session, considering context documents and history messages.
+        Args:
+            session_id (str): The session ID for which the answer is retrieved.
+            question (str): The new user message.
+            context_docs (List[dict]): A list of documents used as context to answer the user message.
+            from_tool (bool, optional): Whether the question originates from a tool. Defaults to False.
+        Returns:
+            Answer: An object containing the answer along with a new list of context documents
+                if those provided are insufficient to answer the question.
+        """
+        resized_docs = self._resize_context(context_docs)
+        context = self._format_context_docs(resized_docs)
+        result = self.chain_with_history.invoke(
+            {"context": context, "question": question},
+            config={"configurable": {"session_id": session_id}}
+        )
+        if self.use_functions and len(result.additional_kwargs) > 0:
+            if from_tool:
+                self._remove_last_messages(session_id=session_id, n=1)
+                history = self._format_history(session_id)
+                result = self.chain_loop_answer.invoke({'history': history})
+                self.get_chat_history(session_id=session_id).add_message(AIMessage(result.content))
+                return Answer(answer=result.content, status=-1)
+            text = eval(result.additional_kwargs['tool_calls'][0]['function']['arguments'])['text']
+            new_docs = self.get_relevant_docs(text)
+            self._remove_last_messages(session_id=session_id, n=2)
+            result = self.get_answer(
+                session_id=session_id,
+                question=question,
+                context_docs=new_docs,
+                from_tool=True
+            )
+            if result.status == 1:
+                return Answer(answer=result.answer, new_documents=new_docs)
+            else:
+                return Answer(answer=result.answer)
+        return Answer(answer=result.content)

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import gradio as gr
+from EurLexChat import EurLexChat
+import yaml
+import random
+import string
+import argparse
+import os
+openai_org_key = os.getenv("OPENAI_ORG_KEY")
+openai_key = os.getenv("OPENAI_KEY")
+ui_pwd = os.getenv("pwd")
+ui_user = os.getenv("user")
+qdrant_url=os.getenv("url")
+qdrant_key=os.getenv("qdrant_key")
+def generate_random_string(length):
+    # Generate a random string of the specified length
+    # using letters and numbers
+    characters = string.ascii_letters + string.digits
+    random_string = ''.join(random.choice(characters) for _ in range(length))
+    return random_string
+class Documents():
+    def __init__(self) -> None:
+        self.documents = []
+parser = argparse.ArgumentParser(description="Chat-eur-lex ui")
+parser.add_argument('--config_path',
+                    dest='config_path',
+                    metavar='config_path',
+                    type=str,
+                    help='The path to the config file that contains all the settings for the chat engine' ,
+                    default='config.yaml')
+args = parser.parse_args()
+# Read config file
+with open(args.config_path, 'r') as file:
+    config = yaml.safe_load(file)
+config["embeddings"]["kwargs"]["openai_api_key"] = openai_key
+config["embeddings"]["kwargs"]["openai_organization"] = openai_org_key
+config["llm"]["kwargs"]["openai_api_key"] = openai_key
+config["llm"]["kwargs"]["openai_organization"] = openai_org_key
+config["vectorDB"]["kwargs"]["url"] = qdrant_url
+config["vectorDB"]["kwargs"]["api_key"] = qdrant_key
+chat = EurLexChat(config=config)
+docs = Documents()
+def remove_doc(btn):
+    docs.documents.pop(btn)
+    new_accordions, new_texts = set_new_docs_ui(docs.documents)
+    return [*new_accordions, *new_texts]
+def get_answer(message, history, session_id):
+    s = session_id
+    if len(history) == 0:
+        docs.documents = chat.get_relevant_docs(question=message)
+        s = generate_random_string(7)
+    result = chat.get_answer(s, message, docs.documents)
+    history.append((message, result.answer))
+    if result.new_documents:
+        docs.documents = result.new_documents
+    accordions, list_texts = set_new_docs_ui(docs.documents)
+    return ['', history, gr.Column(scale=1, visible=True), *accordions, *list_texts, s]
+def set_new_docs_ui(documents):
+    new_accordions = []
+    new_texts = []
+    for i in range(len(accordions)):
+        if i < len(documents):
+            new_accordions.append(gr.update(accordions[i].elem_id, label=f"{documents[i]['text'][:45]}...", visible=True, open=False))
+            new_texts.append(gr.update(list_texts[i].elem_id, value=f"{documents[i]['text']}...", visible=True))
+        else:
+            new_accordions.append(gr.update(accordions[i].elem_id, label="", visible=False))
+            new_texts.append(gr.update(list_texts[i].elem_id, value="", visible=False))
+    return new_accordions, new_texts
+def clean_page():
+    docs.documents = []
+    accordions, list_texts = set_new_docs_ui(docs.documents)
+    return ["", [], None, *accordions, *list_texts]
+# Widgets created once per document slot; filled in the layout loop below and
+# read by set_new_docs_ui and by the event wiring at the end of the file.
+list_texts = []
+accordions = []
+states = []
+delete_buttons = []
+block = gr.Blocks()
+with block:
+    gr.Markdown("""
+        <h1><center>Chat-EUR-Lex prototype - Alpha version</center></h1>
+    """)
+    # Holds the session id returned by get_answer (None until the first message and after "Clear")
+    state = gr.State(value=None)
+    with gr.Row():
+        # Left column: the chat, the message box and the Send / Clear buttons
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot()
+            with gr.Row():
+                message = gr.Textbox(scale=10)
+                submit = gr.Button("Send", scale=1)
+                clear = gr.Button("Clear", scale=1)
+        # Right column: the context documents; created hidden, get_answer returns a visible column for it
+        with gr.Column(scale=1, visible=False) as col:
+            gr.Markdown("""<h3><center>Context documents</center></h3>""")
+            # One accordion slot for each of the k documents the retriever is configured to return
+            for i in range(config['vectorDB']['retriever_args']['search_kwargs']['k']):
+                with gr.Accordion(label="", elem_id=f'accordion_{i}', open=False) as acc:
+                    list_texts.append(gr.Textbox("", elem_id=f'text_{i}', show_label=False, lines=10))
+                    btn = gr.Button(f"Remove document")
+                    delete_buttons.append(btn)
+                    # The slot index is stored in a State so it can be passed to remove_doc
+                    states.append(gr.State(i))
+                accordions.append(acc)
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.HTML("""""")
+            gr.HTML("""<div>
+                    <h3>Disclaimer</h3>
+                    <p><a href="https://github.com/Aptus-AI/chat-eur-lex/">Chat-EUR-Lex prototype</a> is a limited risk AI system realized by the
+                    <a href="https://www.igsg.cnr.it/en/">Institute of Legal Informatics and Judicial Systems (IGSG-CNR)</a> and <a href="https://www.aptus.ai/">Aptus.AI</a>.
+                    The prototype is an AI chatbot, therefore you are interacting with a machine, not with a human person. The prototype uses OpenAI GPT-4 language model. </p>
+                    <p><a href="https://github.com/Aptus-AI/chat-eur-lex/">Chat-EUR-Lex project</a> is funded by the European Union within the framework of the NGI Search project under grant agreement No 101069364.
+                    Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or European Commission.
+                    Contact us: <a href="mailto:chat-eur-lex@igsg.cnr.it">chat-eur-lex@igsg.cnr.it</a>.</p>
+                    </div>""")
+    # Event wiring: the outputs lists must match, in order, the values returned by the callbacks
+    clear.click(clean_page, outputs=[message, chatbot, state, *accordions, *list_texts])
+    # Pressing Enter in the textbox and clicking "Send" trigger the same callback
+    message.submit(get_answer, inputs=[message, chatbot, state], outputs=[message, chatbot, col, *accordions, *list_texts, state])
+    submit.click(get_answer, inputs=[message, chatbot, state], outputs=[message, chatbot, col, *accordions, *list_texts, state])
+    # Each "Remove document" button passes the index stored in its own State to remove_doc
+    for i, b in enumerate(delete_buttons):
+        b.click(remove_doc, inputs=states[i], outputs=[*accordions, *list_texts])
+# The UI login is the (user, pwd) pair read from the environment at the top of the file
+block.launch(debug=True, auth=(ui_user, ui_pwd))

chat_utils.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from dataclasses import dataclass
+from typing import Optional, List
+from langchain.pydantic_v1 import BaseModel, Field
+SYSTEM_PROMPT = (
+    "You are an assistant  specialized in the legal and compliance field who must answer and converse with the user using the context provided. " +
+    "When you answer the user, if it is relevant, cite the laws and articles you are referring to. NEVER mention the use of context in your answers. "
+    "If you believe the question cannot be answered from the given context, do not make up an answer. Answer in the same language the user is speaking.\n\n ### Context:\n {context}"
+)
+SYSTEM_PROMPT_LOOP = (
+    "You are an assistant who must inform the user that you do not have enough information to answer and ask if the user can provide you with additional information. " +
+    "This answer, must be adapted to the conversation that occurred with the user that is provided to you. Just write down the answer "
+)
+@dataclass
+class Answer():
+    answer: str
+    new_documents: Optional[List] = None
+    status: Optional[int] = 1
+# Argument schema of the "get_context" tool: a single free-text field describing
+# what the user is asking for.
+# NOTE(review): documented with comments instead of a class docstring on purpose;
+# a model docstring presumably ends up in the generated schema, so confirm before adding one.
+class ContextInput(BaseModel):
+    text: str = Field(
+        title="Text",
+        description="Self-explanatory summary describing what the user is asking for"
+        )
+def get_instance_dynamic_class(lib_path:str, class_name:str, **kwargs):
+    """
+    Instantiate a dynamically imported class from a given library path and class name.
+    Args:
+        lib_path (str): The path to the library/module containing the class.
+        class_name (str): The name of the class to instantiate.
+        **kwargs: Additional keyword arguments to pass to the class constructor.
+    Returns:
+        An instance of the dynamically imported class initialized with the provided arguments.
+    """
+    mod = __import__(lib_path, fromlist=[class_name])
+    dynamic_class = getattr(mod, class_name)
+    return dynamic_class(**kwargs)
+def get_init_modules(config):
+    embedder = get_instance_dynamic_class(
+        lib_path='langchain_community.embeddings',
+        class_name=config["embeddings"]["class"],
+        **config["embeddings"]["kwargs"]
+    )
+    llm = get_instance_dynamic_class(
+        lib_path='langchain_community.chat_models',
+        class_name=config["llm"]["class"],
+        **config["llm"]["kwargs"]
+    )
+    mod_chat = __import__("langchain_community.chat_message_histories",
+                          fromlist=[config["chatDB"]["class"]])
+    chatDB_class = getattr(mod_chat, config["chatDB"]["class"])
+    retriever = get_vectorDB_module(config['vectorDB'], embedder)
+    return embedder, llm, chatDB_class, retriever
+def get_vectorDB_module(db_config, embedder):
+    mod_chat = __import__("langchain_community.vectorstores",
+                          fromlist=[db_config["class"]])
+    vectorDB_class = getattr(mod_chat, db_config["class"])
+    if db_config["class"] == 'Qdrant':
+        from qdrant_client import QdrantClient
+        import inspect
+        # Get QdrantClient init parameters name from signature
+        signature_params = inspect.signature(QdrantClient.__init__).parameters.values()
+        params_to_exclude = ['self', 'kwargs']
+        client_args = [el.name for el in list(signature_params) if el.name not in params_to_exclude]
+        client_kwargs = {k: v for k,
+                         v in db_config['kwargs'].items() if k in client_args}
+        db_kwargs = {
+            k: v for k, v in db_config['kwargs'].items() if k not in client_kwargs}
+        client = QdrantClient(**client_kwargs)
+        retriever = vectorDB_class(
+            client, embeddings=embedder, **db_kwargs).as_retriever(
+                search_type=db_config["retriever_args"]["search_type"],
+                search_kwargs=db_config["retriever_args"]["search_kwargs"]
+        )
+    else:
+        retriever = vectorDB_class(embeddings=embedder, **db_config["kwargs"]).as_retriever(
+            search_type=db_config["retriever_args"]["search_type"],
+            search_kwargs=db_config["retriever_args"]["search_kwargs"]
+        )
+    return retriever

config.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+# Vector store holding the documents; "class" is looked up in langchain_community.vectorstores
+vectorDB:
+    class: Qdrant
+    kwargs:
+        # url and api_key are overwritten by app.py with the "url" and "qdrant_key" environment variables
+        url: ""
+        api_key: ""
+        collection_name: chat-eur-lex
+    retriever_args:
+        search_type: mmr
+        search_kwargs:
+            # k is also the number of document slots created in the UI
+            k: 15
+            fetch_k: 300
+            score_threshold: 0.0
+            lambda_mult: 0.8
+# Embedding model; "class" is looked up in langchain_community.embeddings
+embeddings:
+    class: OpenAIEmbeddings
+    kwargs:
+        # Overwritten by app.py with the OPENAI_KEY and OPENAI_ORG_KEY environment variables
+        openai_api_key: ""
+        openai_organization: ""
+        model: text-embedding-ada-002
+# Chat model; "class" is looked up in langchain_community.chat_models
+llm:
+    class: ChatOpenAI
+    # Enables the "get_context" tool; only taken into account when class is ChatOpenAI
+    use_context_function: True
+    # Token budget for the context documents placed in the prompt
+    max_context_size: 6000
+    kwargs:
+        # Overwritten by app.py with the OPENAI_ORG_KEY and OPENAI_KEY environment variables
+        openai_organization: ""
+        openai_api_key: ""
+        model_name: gpt-4
+        temperature: 0.8
+# Chat history storage; "class" is looked up in langchain_community.chat_message_histories
+chatDB:
+    class: FileChatMessageHistory
+    kwargs:
+        # Directory of the per-session "<session_id>.json" history files
+        output_path: ./output
+# Number of history messages kept in the prompt sent to the model
+max_history_messages: 5

output/OaaEABb.json ADDED Viewed

File without changes