| |
| """Doc_chat_vegleges_like.ipynb |
| |
| Automatically generated by Colaboratory. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV |
| |
| # Setting up environment |
| """ |
|
|
| from PyPDF2 import PdfReader |
| from langchain.embeddings.openai import OpenAIEmbeddings |
| from langchain.text_splitter import CharacterTextSplitter |
| from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS |
|
|
| |
| |
| import os |
|
|
|
|
| """# Preprocessing document""" |
|
|
| |
# Load the source PDF from the working directory; pages are read lazily
# via reader.pages below.
reader = PdfReader('samu-en-567.pdf')
| |
| |
| |
|
|
| |
# Concatenate the text of every PDF page. extract_text() can return
# None/"" for image-only pages, so falsy results are skipped — same
# filter as the original `if text:` check.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)
# Join once instead of repeated `raw_text += text`, which is quadratic
# in the worst case on long documents.
raw_text = ''.join(page_texts)
|
|
| |
|
|
# Split the raw document into overlapping ~1000-char chunks: each chunk
# stays small enough for the model context while the 200-char overlap
# preserves continuity across chunk boundaries.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
|
|
| """## Setting up doc search""" |
|
|
# Embed each chunk with OpenAI embeddings (requires OPENAI_API_KEY in the
# environment) and index them in an in-memory FAISS store for
# similarity search at question time.
embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)
|
|
| """# Setting up chatbot""" |
|
|
| from langchain.chains.question_answering import load_qa_chain |
| from langchain.memory import ConversationBufferWindowMemory |
| from langchain.prompts import PromptTemplate |
| from langchain_openai import OpenAI |
|
|
| template = """You are a chatbot having a conversation with a human. |
| |
| Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else. |
| Any questions outside of the document is irrelevant and you certanly dont know! If You cannot find the answer say "The document does not contain that information." |
| |
| {context} |
| |
| {chat_history} |
| Human: {human_input} |
| Chatbot:""" |
|
|
# Wire the template to its variables: the chain fills chat_history from
# memory, human_input from the user, and context from retrieved chunks.
prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "human_input", "context"],
)
|
|
# Windowed memory keeps only the last k=3 conversational turns in the
# prompt's {chat_history}; the "stuff" chain type pastes all retrieved
# documents into a single prompt alongside it.
memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)
|
|
| """# Demo |
| |
| ## Setting up methods |
| """ |
|
|
def chat(query, history):
    """Answer *query* from the indexed document via the QA chain.

    ``history`` is accepted to satisfy the gradio ChatInterface callback
    signature but is unused here; conversation state lives in the
    chain's own memory.
    """
    relevant_docs = doc_search.similarity_search(query)
    result = chain(
        {"input_documents": relevant_docs, "human_input": query},
        return_only_outputs=True,
    )
    return result['output_text']
|
|
| """## Setting up UI with gradio""" |
|
|
| import gradio as gr |
| from huggingface_hub import HfFileSystem |
|
|
| fs = HfFileSystem(token=os.environ["acces_token"]) |
|
|
def write_to_file(file_name, content):
    """Append *content* to ``spaces/mgreg555/docs_chat/<file_name>``.

    Opens the file in append mode ("a") so repeated feedback entries
    accumulate instead of overwriting previous ones.
    """
    # Single f-string instead of the original f-string-with-no-placeholder
    # concatenated with `+`; resulting path is byte-identical.
    file_path = f"spaces/mgreg555/docs_chat/{file_name}"
    with fs.open(file_path, "a") as file:
        file.write(content)
|
|
| |
| |
|
|
def vote(tmp, index_state, data: gr.LikeData):
    """Gradio like-handler: log the (dis)liked answer to a feedback file.

    ``tmp`` and ``index_state`` are pass-through components required by
    the event wiring and are not used here.
    """
    # Liked answers accumulate in good.txt, disliked ones in bad.txt.
    target_file = 'good.txt' if data.liked else 'bad.txt'
    write_to_file(target_file, data.value)
|
|
def find_previous_question(answer_string, buffer=None):
    """Return the human question that preceded a given AI answer.

    Scans a conversation transcript of alternating ``Human:`` / ``AI:``
    lines for an AI line whose stripped text equals *answer_string* and
    returns the most recently seen human question before it.

    Args:
        answer_string: Exact (stripped) AI answer text to locate.
        buffer: Optional transcript to search. Defaults to the live
            ``chain.memory.buffer`` so existing callers are unaffected.

    Returns:
        The matching question string, or ``None`` if no AI line matches.
    """
    if buffer is None:
        buffer = chain.memory.buffer

    human_tag = 'Human:'
    ai_tag = 'AI:'
    current_question = None
    for line in buffer.split('\n'):
        if line.startswith(human_tag):
            # Slice by tag length (the original hard-coded line[7:] dropped
            # the first character when no space followed the colon);
            # strip() removes any separating whitespace either way.
            current_question = line[len(human_tag):].strip()
        elif line.startswith(ai_tag) and line[len(ai_tag):].strip() == answer_string:
            return current_question
    return None
|
|
# Shared Chatbot component; likeable=True adds the thumbs-up/down buttons
# whose events are routed to `vote` below.
chatbot = gr.Chatbot(height=600, likeable=True)
|
|
| |
# Assemble the Gradio UI: a chat interface backed by `chat`, plus a
# like/dislike handler that persists feedback via the HF filesystem.
with gr.Blocks() as demo:
    index_state = gr.State(value=[])  # state slot passed through the like event (currently unused by vote)
    tmp = gr.Textbox(visible=False, value="")  # hidden placeholder component for the like-event wiring
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about the constitution!",
        theme="soft",
        examples=["Who wrote the constitution?","What is the capital of France?"],
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # Route thumbs-up/down events on the chatbot to the `vote` handler.
    chatbot.like(vote, [tmp, index_state], [tmp, index_state])

# Blocking launch; on HF Spaces this serves the app.
demo.launch()