langchain-ynp-test

Runtime error

ilia_khristoforov committed on Jun 28, 2023

Commit

304e51f

1 Parent(s): 852b083

На ветке pr/5

новый файл: utils/__init__.py
новый файл: utils/bot.py
новый файл: utils/functions.py
изменено: app.py
изменено: requirements.txt

Files changed (3) hide show

utils/__init__.py +3 -0
utils/bot.py +203 -0
utils/functions.py +72 -0

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .bot import Bot
2	+ from .functions import make_documents, make_descriptions
3	+

utils/bot.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import langchain
+from langchain.agents import AgentType
+from langchain.agents import create_csv_agent
+from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.schema import HumanMessage
+from langchain.tools import StructuredTool
+from langchain.vectorstores import Chroma
+from PIL import Image
+
+from utils.functions import Matcha_model
+class Bot:
+    """Agent that answers questions from text documents plus one table or chart.
+
+    On every call the bot asks the LLM which of the described files (if any)
+    can help with the question, builds the matching tool (a CSV agent for
+    ``.csv`` files, the chart QA model otherwise) and runs a structured-chat
+    agent that owns that tool together with the text-search tool.
+    """
+
+    # Answer the LLM is instructed to give when no described file is relevant.
+    NO_FILE_ANSWER = "None of the files"
+
+    def __init__(
+            self,
+            openai_api_key: str,
+            file_descriptions: List[Dict[str, Any]],
+            text_documents: List[langchain.schema.Document],
+            verbose: bool = False
+    ):
+        """Create the LLM, the text retriever and the chart model.
+
+        Args:
+            openai_api_key: key handed to ``ChatOpenAI``.
+            file_descriptions: one dict per file. This class reads the keys
+                ``'number'``, ``'path'`` and ``'title'``, plus ``'columns'``
+                for CSV files.
+            text_documents: documents indexed into the Chroma vector store.
+            verbose: forwarded to the agents; also enables debug printing of
+                the file-selection prompt and answer.
+        """
+        self.verbose = verbose
+        self.file_descriptions = file_descriptions
+        self.llm = ChatOpenAI(
+            openai_api_key=openai_api_key,
+            temperature=0,
+            model_name="gpt-3.5-turbo"
+        )
+        embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+        vector_store = Chroma.from_documents(text_documents, embedding_function)
+        self.text_retriever = langchain.chains.RetrievalQAWithSourcesChain.from_chain_type(
+            llm=self.llm,
+            chain_type='stuff',
+            retriever=vector_store.as_retriever()
+        )
+        self.text_search_tool = langchain.agents.Tool(
+            func=self._text_search,
+            description="Use this tool when searching for text information",
+            name="search text information"
+        )
+        self.chart_model = Matcha_model()
+
+    def __call__(
+            self,
+            question: str
+    ):
+        """Answer ``question`` with the agent and return the agent's response.
+
+        The tool list is rebuilt on every call: text search is always
+        available; a tabular or chart tool is added only when the LLM picked
+        a file that exists in ``file_descriptions``.
+        """
+        self.tools = [self.text_search_tool]
+        file = self._define_appropriate_file(question)
+        file_description = self._find_file_description(file)
+        if file_description is not None:
+            file_path = file_description['path']
+            # The extension has to come from the file's path: ``file`` is the
+            # LLM's answer (e.g. "File №2") and never carries a suffix.
+            if Path(file_path).suffix.lower() == '.csv':
+                self.csv_agent = create_csv_agent(
+                    llm=self.llm,
+                    path=file_path,
+                    verbose=self.verbose
+                )
+                self._init_tabular_search_tool(file_description)
+                self.tools.append(self.tabular_search_tool)
+            else:
+                self._init_chart_search_tool(file_description)
+                self.tools.append(self.chart_search_tool)
+        self._init_chatbot()
+        response = self.agent(question)
+        return response
+
+    def _find_file_description(
+            self,
+            answer: str
+    ) -> Optional[Dict[str, Any]]:
+        """Map the LLM's file choice to its description dict.
+
+        Returns ``None`` when the answer says no file applies, contains no
+        "№<digits>" reference, or names a number that is not described, so the
+        caller can fall back to text search instead of crashing.
+        """
+        if self.NO_FILE_ANSWER in answer:
+            return None
+        match = re.search(r'№\s*(\d+)', answer)
+        if match is None:
+            return None
+        number = int(match.group(1))
+        return next(
+            (x for x in self.file_descriptions if x['number'] == number),
+            None
+        )
+
+    def _init_chatbot(self):
+        """Create the structured-chat agent over ``self.tools`` and install the system prompt."""
+        conversational_memory = ConversationBufferWindowMemory(
+            memory_key='chat_history',
+            k=5,
+            return_messages=True
+        )
+        self.agent = langchain.agents.initialize_agent(
+            agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
+            tools=self.tools,
+            llm=self.llm,
+            verbose=self.verbose,
+            max_iterations=5,
+            early_stopping_method='generate',
+            memory=conversational_memory
+        )
+        # Every fragment ends with a space so adjacent sentences do not run
+        # together once the literals are concatenated.
+        sys_msg = (
+            "You are an expert summarizer and deliverer of information. "
+            "Yet, the reason you are so intelligent is that you make complex "
+            "information incredibly simple to understand. It's actually rather incredible. "
+            "When users ask information you refer to the relevant tools. "
+            "if one of the tools helped you with only a part of the necessary information, you must "
+            "try to find the missing information using another tool. "
+            "if you can't find the information using the provided tools, you MUST "
+            "say 'I don't know'. Don't try to make up an answer."
+        )
+        prompt = self.agent.agent.create_prompt(
+            tools=self.tools,
+            prefix=sys_msg
+        )
+        self.agent.agent.llm_chain.prompt = prompt
+
+    def _text_search(
+            self,
+            query: str
+    ) -> str:
+        """Run ``query`` through the retrieval chain and return its ``'answer'`` field."""
+        query = self.text_retriever.prep_inputs(query)
+        res = self.text_retriever(query)['answer']
+        return res
+
+    def _tabular_search(
+            self,
+            query: str
+    ) -> str:
+        """Run ``query`` through the CSV agent created in ``__call__``."""
+        res = self.csv_agent.run(query)
+        return res
+
+    def _chart_search(
+        self,
+        image,
+        query: str
+    ) -> str:
+        """Open ``image`` with PIL and ask the chart model ``query`` about it."""
+        # The context manager closes the underlying file once the model is done.
+        with Image.open(image) as chart:
+            res = self.chart_model.chart_qa(chart, query)
+        return res
+
+    def _init_chart_search_tool(
+        self,
+        file_: Dict[str, Any]
+    ) -> None:
+        """Build ``self.chart_search_tool`` for the chart described by ``file_``.
+
+        Args:
+            file_: description dict; the ``'title'`` and ``'path'`` keys are read.
+        """
+        title = file_['title']
+        image_path = file_['path']
+
+        def _ask_chart(query: str) -> str:
+            # The path is bound here so the agent only has to supply a question.
+            return self._chart_search(image_path, query)
+
+        description = f"""
+            Use this tool when searching for information on charts.
+            With this tool you can answer the question about related chart.
+            You should ask simple question about a chart, then the tool will give you number.
+            This chart is called {title}.
+        """
+        self.chart_search_tool = langchain.agents.Tool(
+            func=_ask_chart,
+            description=description,
+            name="Ask over charts"
+        )
+
+    def _init_tabular_search_tool(
+            self,
+            file_: Dict[str, Any]
+    ) -> None:
+        """Build ``self.tabular_search_tool`` for the table described by ``file_``.
+
+        Args:
+            file_: description dict; the ``'title'`` and ``'columns'`` keys are read.
+        """
+        title = file_['title']
+        columns = file_['columns']
+        description = f"""
+            Use this tool when searching for tabular information.
+            With this tool you could get access to table.
+            This table title is "{title}" and the names of the columns in this table: {columns}
+        """
+        self.tabular_search_tool = langchain.agents.Tool(
+            func=self._tabular_search,
+            description=description,
+            name="search tabular information"
+        )
+
+    def _define_appropriate_file(
+            self,
+            question: str
+    ) -> str:
+        """Ask the LLM which described file may contain the answer to ``question``.
+
+        Returns:
+            The LLM's answer with surrounding whitespace and trailing "!" / "."
+            removed; the prompt asks for "File №<number>" or "None of the files".
+        """
+        parts = ['I have list of descriptions: \n']
+        for k, description in enumerate(self.file_descriptions, start=1):
+            parts.append(f"""  {k}) description for File №{description['number']}: """)
+            parts.extend(f"{key!s} : {value!s}\n" for key, value in description.items())
+        instruction = f""" How do you think, which file can help answer the question: "{question}" .
+        Your answer MUST be specific,
+        for example if you think that File №2 can help answer the question, you MUST just write  "File №2!".
+        If you think that none of the files can help answer the question just write "None of the files!"
+        Don't include to answer information about your thinking.
+        """
+        parts.append(instruction)
+        message = ''.join(parts)
+        res = self.llm([HumanMessage(content=message)])
+        # Strip only the punctuation the prompt asks for, instead of blindly
+        # dropping the last character (which corrupts answers without a "!").
+        answer = res.content.strip().rstrip('!.').strip()
+        if self.verbose:
+            print(message)
+            print(res.content)
+            print(answer)
+        return answer

utils/functions.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import pandas as pd
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+import torch
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+from pathlib import Path
+def make_descriptions(file, title):
+    """Build the list of file-description dicts for ``file``.
+
+    Args:
+        file: path of the file to describe. A ``.csv`` suffix (any letter
+            case) selects the table branch; every other file gets the generic
+            single-entry description.
+        title: human-readable title stored in the description.
+
+    Returns:
+        For a CSV file, a two-element list: a hard-coded placeholder table
+        (number 1) followed by the real file (number 2) with its column
+        names. For any other file, a one-element list with number 1.
+    """
+    if Path(file).suffix.lower() == '.csv':
+        # Only the header row is needed, so skip parsing the data rows.
+        columns = list(pd.read_csv(file, nrows=0).columns)
+        # NOTE(review): this placeholder entry does not correspond to a real
+        # file ('random' is not a path). It is kept because callers may rely
+        # on the real table being number 2 -- confirm and remove if unused.
+        table_description0 = {
+            'path': 'random',
+            'number': 1,
+            'columns': ["clothes", "animals", "students"],
+            'title': "fashionable student clothes"
+        }
+        table_description1 = {
+            'path': file,
+            'number': 2,
+            'columns': columns,
+            'title': title
+        }
+        return [table_description0, table_description1]
+    file_description = {
+        'path': file,
+        'number': 1,
+        'title': title
+    }
+    return [file_description]
+def make_documents(pdf):
+    """Load ``pdf`` with ``PyPDFLoader`` and split it into chunks.
+
+    The loaded documents are split on newlines with a chunk size of 500 and
+    no overlap; the resulting list of documents is returned.
+    """
+    pages = PyPDFLoader(pdf).load()
+    splitter = CharacterTextSplitter(
+        separator='\n',
+        chunk_size=500,
+        chunk_overlap=0,
+    )
+    return splitter.split_documents(pages)
+class Matcha_model:
+    """Chart question answering backed by the ``google/matcha-chartqa`` checkpoint."""
+
+    def __init__(self) -> None:
+        """Load the model and its processor, then move the model to the selected device."""
+        self.model_name = "google/matcha-chartqa"
+        self.model = Pix2StructForConditionalGeneration.from_pretrained(self.model_name)
+        self.processor = Pix2StructProcessor.from_pretrained(self.model_name)
+        # Use CUDA when torch reports it as available, otherwise the CPU.
+        if torch.cuda.is_available():
+            device_name = "cuda"
+        else:
+            device_name = "cpu"
+        self.device = torch.device(device_name)
+        self.model.to(self.device)
+
+    def _filter_output(self, output):
+        """Return ``output`` with every literal ``<0x0A>`` marker removed."""
+        # Presumably the marker is the model's newline token -- TODO confirm.
+        return "".join(output.split("<0x0A>"))
+
+    def chart_qa(self, image, question: str) -> str:
+        """Answer ``question`` about ``image`` and return the cleaned decoded text.
+
+        Generation is capped at 512 new tokens and special tokens are skipped
+        when decoding the first generated sequence.
+        """
+        encoded = self.processor(images=image, text=question, return_tensors="pt")
+        encoded = encoded.to(self.device)
+        generated = self.model.generate(**encoded, max_new_tokens=512)
+        decoded = self.processor.decode(generated[0], skip_special_tokens=True)
+        return self._filter_output(decoded)