File size: 4,653 Bytes
fd9ca75 837390a d9eb9d8 fd9ca75 09e12a6 ccd1325 fd9ca75 800d5f0 fd9ca75 71a8ec9 fd9ca75 28f8ec9 a53bc7d 277e13c a53bc7d 28f8ec9 8a087e4 277e13c 03676b1 277e13c cdf1ed1 28f8ec9 a53bc7d fd9ca75 ac49d87 fd9ca75 fe5933c fd9ca75 fe5933c fd9ca75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# -*- coding: utf-8 -*-
"""Doc_chat_vegleges_like.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV
# Setting up environment
"""
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import ElasticVectorSearch, Pinecone, Weaviate
from langchain_community.vectorstores import FAISS
# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os

# Fail loudly-but-safely if required secrets are missing. Never echo the
# values themselves: printing os.environ["OPENAI_API_KEY"] (as this script
# originally did) leaks the key into build/run logs.
for _var in ("OPENAI_API_KEY", "DATASET_ACCES"):
    print(f"{_var} is {'set' if os.environ.get(_var) else 'MISSING'}")
"""# Preprocessing document"""
# location of the pdf file/files.
reader = PdfReader('samu-en-567.pdf')
#reader = PdfReader('/content/WOW.pdf')
#reader = PdfReader('/content/the_little_prince.pdf')
#reader = PdfReader('/content/constitution.pdf')
# read data from the file
raw_text = ''
for i, page in enumerate(reader.pages):
text = page.extract_text()
if text:
raw_text += text
# We need to split the text that we read into smaller chunks so that during
# information retrieval we don't hit the token size limits. Chunks overlap
# by 150 characters so answers spanning a boundary are still retrievable.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=150,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
"""## Setting up doc search"""
embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)
"""# Setting up chatbot"""
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
template = """You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
Any questions outside of the document is irrelevant and you certanly dont know! If You cannot find the answer say "The document does not contain that information."
{context}
{chat_history}
Human: {human_input}
Chatbot:"""
prompt = PromptTemplate(
input_variables=["chat_history", "human_input", "context"], template=template
)
memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)
"""# Demo
## Setting up methods
"""
def chat(query, history):
    """Answer *query* from the indexed document.

    Retrieves the chunks most similar to the query from the FAISS store and
    runs them through the QA chain. *history* is supplied by the gradio
    ChatInterface but unused here — the chain keeps its own window memory.
    """
    relevant_docs = doc_search.similarity_search(query)
    result = chain(
        {"input_documents": relevant_docs, "human_input": query},
        return_only_outputs=True,
    )
    return result['output_text']
"""## Setting up UI with gradio"""
import gradio as gr
from huggingface_hub import HfFileSystem
# Filesystem view of the Hugging Face Hub, authenticated with the dataset
# token; used by write_to_file() to persist like/dislike feedback.
fs = HfFileSystem(token=os.environ.get('DATASET_ACCES'))
def write_to_file(file_name, content):
    """Append *content* as a new line to a file in the HF feedback dataset.

    HfFileSystem has no append mode, so the file is read in full and then
    rewritten with the new line added. A missing file (first feedback entry
    ever logged to it) is treated as empty instead of raising.
    """
    file_path = "datasets/mgreg555/samu_reference_book/" + file_name
    try:
        with fs.open(file_path, "r") as existing:
            content_old = existing.read()
    except FileNotFoundError:
        content_old = ""  # first entry for this file
    with fs.open(file_path, "w") as file:
        file.write(f"{content_old}\n" + content)
# Gradio like/dislike callback.
def vote(tmp, index_state, data: gr.LikeData):
    """Log a liked/disliked answer together with the question that caused it.

    *tmp* and *index_state* are dummy components required by the
    chatbot.like() wiring; only *data* is used.
    """
    answer = data.value
    file_name = 'good.txt' if data.liked else 'bad.txt'
    # The lookup can return None (the answer may have rolled out of the
    # k=3 memory window); fall back to '' instead of raising TypeError.
    question = find_previous_question(answer) or ''
    write_to_file(file_name, answer + ';' + question)
def find_previous_question(answer_string, buffer=None):
    """Return the 'Human:' question that immediately preceded a given answer.

    Scans a conversation transcript line by line, remembering the most
    recent human question, and returns it as soon as an 'AI:' line whose
    text equals *answer_string* is seen.

    Args:
        answer_string: The AI answer to look up (compared after stripping).
        buffer: Optional transcript to search. Defaults to the live chain
            memory buffer, preserving the original call signature.

    Returns:
        The matching question string, or None if the answer is not found
        (e.g. it has already rolled out of the k=3 memory window).
    """
    if buffer is None:
        buffer = chain.memory.buffer
    current_question = None
    for line in buffer.split('\n'):
        if line.startswith('Human:'):
            # Slice off exactly the prefix, then strip — unlike a hard-coded
            # [7:], this does not eat a character when no space follows ':'.
            current_question = line[len('Human:'):].strip()
        elif line.startswith('AI:') and line[len('AI:'):].strip() == answer_string:
            return current_question
    return None
# likeable=True adds the thumbs up/down buttons that fire the .like event.
chatbot = gr.Chatbot(height=600, likeable=True)
# Use gradio.Blocks to create a context for your components and event listeners
with gr.Blocks() as demo:
    # Hidden placeholder components: chatbot.like() needs input/output
    # component lists, but vote() only uses the LikeData argument.
    index_state = gr.State(value=[])
    tmp = gr.Textbox(visible=False, value="")
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about SAMU!",
        theme="soft",
        examples=["What is SAMU?","What is the capital of France?"],
        # NOTE(review): cache_examples=True runs chat() on every example at
        # startup, which calls the OpenAI API — confirm this cost is intended.
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # Route like/dislike events to vote() so feedback is logged to the Hub.
    chatbot.like(vote, [tmp, index_state], [tmp, index_state])
demo.launch()