mgreg555 committed on
Commit
fd9ca75
·
verified ·
1 Parent(s): 5bc88f0

Upload doc_chat_vegleges_like.py

Browse files
Files changed (1) hide show
  1. doc_chat_vegleges_like.py +138 -0
doc_chat_vegleges_like.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Doc_chat_vegleges_like.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV
8
+
9
+ # Setting up environment
10
+ """
11
+
12
+ from PyPDF2 import PdfReader
13
+ from langchain.embeddings.openai import OpenAIEmbeddings
14
+ from langchain.text_splitter import CharacterTextSplitter
15
+ from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
16
+
17
+ # Get your API keys from openai, you will need to create an account.
18
+ # Here is the link to get the keys: https://platform.openai.com/account/billing/overview
19
+ import os
20
+
21
+
22
"""# Preprocessing document"""

# Location of the pdf file/files.
reader = PdfReader('samu-en-567.pdf')
#reader = PdfReader('/content/WOW.pdf')
#reader = PdfReader('/content/the_little_prince.pdf')
#reader = PdfReader('/content/constitution.pdf')

# Read data from the file.
# Fix: the original accumulated with `raw_text += text`, which is quadratic
# on other interpreters; a single join is linear. Pages whose extraction
# yields an empty/None result are skipped, exactly as before.
raw_text = ''.join(
    text
    for page in reader.pages
    if (text := page.extract_text())
)

# We need to split the text that we read into smaller chunks so that during
# information retrieval we don't hit the token size limits.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

len(texts)  # notebook artifact: displayed the chunk count in Colab; no effect as a script
48
+
49
+ """## Setting up doc search"""
50
+
51
+ embeddings = OpenAIEmbeddings()
52
+ doc_search = FAISS.from_texts(texts, embeddings)
53
+
54
+ """# Setting up chatbot"""
55
+
56
+ from langchain.chains.question_answering import load_qa_chain
57
+ from langchain.memory import ConversationBufferWindowMemory
58
+ from langchain.prompts import PromptTemplate
59
+ from langchain_openai import OpenAI
60
+
61
+ template = """You are a chatbot having a conversation with a human.
62
+
63
+ Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
64
+ Any questions outside of the document is irrelevant and you certanly dont know! If You cannot find the answer say "The document does not contain that information."
65
+
66
+ {context}
67
+
68
+ {chat_history}
69
+ Human: {human_input}
70
+ Chatbot:"""
71
+
72
+ prompt = PromptTemplate(
73
+ input_variables=["chat_history", "human_input", "context"], template=template
74
+ )
75
+
76
+ memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
77
+ chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)
78
+
79
+ """# Demo
80
+
81
+ ## Setting up methods
82
+ """
83
+
84
+ def chat(query,history):
85
+ docs = doc_search.similarity_search(query)
86
+ return chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)['output_text']
87
+
88
+ """## Setting up UI with gradio"""
89
+
90
+ import gradio as gr
91
+
92
+ def write_to_file(file_name, value):
93
+ with open(file_name, 'a', encoding='utf-8') as file:
94
+ file.write(find_previous_question(value) + ';' + str(value) + '\n')
95
+
96
+ def vote(tmp, index_state, data: gr.LikeData):
97
+ value_new = data.value
98
+ index_new = data.index
99
+ file_name = 'good.txt' if data.liked else 'bad.txt'
100
+ write_to_file(file_name, value_new)
101
+
102
+ def find_previous_question(answer_string):
103
+ # Split the chat string into lines
104
+ lines = chain.memory.buffer.split('\n')
105
+
106
+ # Initialize variables to keep track of the last question and the current question
107
+ last_question = None
108
+ current_question = None
109
+
110
+ for line in lines:
111
+ if line.startswith('Human:'):
112
+ current_question = line[7:].strip() # Extract the question by removing the 'Human:' prefix
113
+ elif line.startswith('AI:') and line[3:].strip() == answer_string:
114
+ return current_question # Return the previous question when the answer is found
115
+
116
+ return None
117
+
118
+ chatbot = gr.Chatbot(height=600, likeable=True)
119
+
120
+ # Use gradio.Blocks to create a context for your components and event listeners
121
+ with gr.Blocks() as demo:
122
+ index_state = gr.State(value=[])
123
+ tmp = gr.Textbox(visible=False, value="")
124
+ gr.ChatInterface(
125
+ chat,
126
+ chatbot=chatbot,
127
+ title="Doc-chat",
128
+ description="Ask about the constitution!",
129
+ theme="soft",
130
+ examples=["Who wrote the constitution?","What is the capital of France?"],
131
+ cache_examples=True,
132
+ retry_btn=None,
133
+ undo_btn="Delete Previous",
134
+ clear_btn="Clear",
135
+ )
136
+ chatbot.like(vote, [tmp, index_state], [tmp, index_state])
137
+
138
+ demo.launch()