# -*- coding: utf-8 -*-
"""Doc_chat_vegleges_like.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV

# Setting up environment
"""

from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Get your API key from OpenAI; you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os

# Check that the key is set without printing the secret itself.
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is not set"

"""# Preprocessing document"""

# Location of the PDF file/files.
reader = PdfReader('The_Little_Prince.pdf')

# Read the text from every page of the file.
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

# Split the text into smaller chunks so that we don't hit the token size
# limits during information retrieval.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=150,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

len(texts)  # number of chunks produced

"""## Setting up doc search"""

embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)

"""# Setting up chatbot"""

from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

template = """You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else. If you cannot find the answer, say "The document does not contain that information."

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"],
    template=template,
)
# Keep only the last k=3 exchanges in the prompt's chat history.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history", input_key="human_input", k=3
)
chain = load_qa_chain(OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)

"""# Demo

## Setting up methods
"""

def chat(query, history):
    # `history` is required by gr.ChatInterface but unused here:
    # the chain keeps its own ConversationBufferWindowMemory.
    docs = doc_search.similarity_search(query)
    return chain(
        {"input_documents": docs, "human_input": query},
        return_only_outputs=True,
    )['output_text']

"""## Setting up UI with gradio"""

import gradio as gr
from huggingface_hub import HfFileSystem

fs = HfFileSystem(token=os.environ.get('DATASET_ACCES'))

def write_to_file(file_name, content):
    # Append a line to a log file stored in a Hugging Face dataset repo.
    file_path = "datasets/mgreg555/Little_Prince/" + file_name
    content_old = ""
    if fs.exists(file_path):  # the file may not exist before the first vote
        with fs.open(file_path, "r") as file_old:
            content_old = file_old.read()
    with fs.open(file_path, "w") as file:
        file.write(f"{content_old}\n" + content)

def vote(tmp, index_state, data: gr.LikeData):
    # Log the liked/disliked answer together with the question that produced it.
    value_new = data.value
    file_name = 'good.txt' if data.liked else 'bad.txt'
    question = find_previous_question(value_new) or ""
    write_to_file(file_name, value_new + ';' + question)

def find_previous_question(answer_string):
    # Walk the chain's memory buffer and return the Human question that
    # immediately preceded the given AI answer.
    lines = chain.memory.buffer.split('\n')
    current_question = None
    for line in lines:
        if line.startswith('Human:'):
            current_question = line[len('Human:'):].strip()
        elif line.startswith('AI:') and line[len('AI:'):].strip() == answer_string:
            return current_question
    return None

chatbot = gr.Chatbot(height=600, likeable=True)

# Use gr.Blocks to create a context for the components and event listeners.
with gr.Blocks() as demo:
    index_state = gr.State(value=[])
    tmp = gr.Textbox(visible=False, value="")
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about The Little Prince!",
        theme="soft",
        examples=["Who is the Little Prince?", "What is the capital of France?"],
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # `vote` only writes the log; it produces no outputs for the UI.
    chatbot.like(vote, [tmp, index_state], None)

demo.launch()
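
"""## Quick programmatic check

A minimal smoke test of the chain without going through the UI: a sketch that
assumes the cells above have already run (PDF indexed, `OPENAI_API_KEY` set).
The second question is off-document, so the fallback answer from the prompt is
the expected output.
"""

print(chat("Who is the Little Prince?", history=[]))       # answered from the document
print(chat("What is the capital of France?", history=[]))  # expect "The document does not contain that information."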
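
"""## Inspecting the collected feedback

A minimal sketch for reading the vote logs back out of the dataset repo. It
assumes the same `fs` handle and the `good.txt` / `bad.txt` files that
`write_to_file` above creates; nothing here is required by the demo itself.
"""

for name in ("good.txt", "bad.txt"):
    path = "datasets/mgreg555/Little_Prince/" + name
    if fs.exists(path):  # a file only exists once at least one vote was logged
        with fs.open(path, "r") as f:
            print(f"--- {name} ---")
            print(f.read())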