mgreg555 commited on
Commit
73635cb
·
verified ·
1 Parent(s): 97fde7d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Doc_chat_vegleges_like.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV
8
+
9
+ # Setting up environment
10
+ """
11
+
12
+ from PyPDF2 import PdfReader
13
+ from langchain.embeddings.openai import OpenAIEmbeddings
14
+ from langchain.text_splitter import CharacterTextSplitter
15
+ from langchain_community.vectorstores import ElasticVectorSearch, Pinecone, Weaviate
16
+ from langchain_community.vectorstores import FAISS
17
+
18
+ # Get your API keys from openai, you will need to create an account.
19
+ # Here is the link to get the keys: https://platform.openai.com/account/billing/overview
20
+ import os
21
+
22
# Verify the OpenAI key is configured without leaking the secret itself:
# the original printed os.environ["OPENAI_API_KEY"] verbatim, which writes
# the credential into Space logs. Report presence only.
print("OPENAI_API_KEY configured:", "OPENAI_API_KEY" in os.environ)
23
+
24
+ """# Preprocessing document"""
25
+
26
+ # location of the pdf file/files.
27
+ reader = PdfReader('little_prince.pdf')
28
+
29
+ # read data from the file
30
+ raw_text = ''
31
+ for i, page in enumerate(reader.pages):
32
+ text = page.extract_text()
33
+ if text:
34
+ raw_text += text
35
+
36
+ # We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits.
37
+
38
+ text_splitter = CharacterTextSplitter(
39
+ separator = "\n",
40
+ chunk_size = 800,
41
+ chunk_overlap = 150,
42
+ length_function = len,
43
+ )
44
+ texts = text_splitter.split_text(raw_text)
45
+
46
+ len(texts)
47
+
48
+ """## Setting up doc search"""
49
+
50
+ embeddings = OpenAIEmbeddings()
51
+ doc_search = FAISS.from_texts(texts, embeddings)
52
+
53
+ """# Setting up chatbot"""
54
+
55
+ from langchain.chains.question_answering import load_qa_chain
56
+ from langchain.memory import ConversationBufferWindowMemory
57
+ from langchain.prompts import PromptTemplate
58
+ from langchain_openai import OpenAI
59
+
60
+ template = """You are a chatbot having a conversation with a human.
61
+
62
+ Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
63
+ Any questions outside of the document is irrelevant and you certanly dont know! If You cannot find the answer say "The document does not contain that information."
64
+
65
+ {context}
66
+
67
+ {chat_history}
68
+ Human: {human_input}
69
+ Chatbot:"""
70
+
71
+ prompt = PromptTemplate(
72
+ input_variables=["chat_history", "human_input", "context"], template=template
73
+ )
74
+
75
+ memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
76
+ chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)
77
+
78
+ """# Demo
79
+
80
+ ## Setting up methods
81
+ """
82
+
83
def chat(query, history):
    """Answer *query* from the indexed document.

    *history* is supplied by gr.ChatInterface but is not used here — the
    chain keeps its own conversation memory.
    """
    relevant_docs = doc_search.similarity_search(query)
    result = chain(
        {"input_documents": relevant_docs, "human_input": query},
        return_only_outputs=True,
    )
    return result['output_text']
86
+
87
+ """## Setting up UI with gradio"""
88
+
89
+ import gradio as gr
90
+ from huggingface_hub import HfFileSystem
91
+
92
+ fs = HfFileSystem(token=os.environ.get('DATASET_ACCES'))
93
+
94
def write_to_file(file_name, content):
    """Append *content* as a new line to *file_name* in the dataset repo.

    Implemented as read-modify-write (the remote FS is rewritten whole),
    so it is not safe under concurrent voters.

    Fixes vs. original: a missing file no longer crashes the callback
    (first vote for a file starts from empty), the debug print that
    dumped the entire file on every vote is gone, and the pointless
    f-prefix on a placeholder-free path string is dropped.
    """
    file_path = "datasets/mgreg555/samu_reference_book/" + file_name
    try:
        with fs.open(file_path, "r") as file_old:
            content_old = file_old.read()
    except FileNotFoundError:
        # First vote recorded for this file.
        content_old = ""

    with fs.open(file_path, "w") as file:
        file.write(f"{content_old}\n" + content)
102
+
103
+ # Example usage
104
+
105
+
106
def vote(tmp, index_state, data: gr.LikeData):
    """gr.Chatbot ``.like`` callback: log the voted answer with its question.

    *tmp* and *index_state* are dummy components required by the ``.like``
    wiring and are ignored. The answer goes to good.txt or bad.txt in the
    dataset repo depending on thumbs up/down.
    """
    answer = data.value
    file_name = 'good.txt' if data.liked else 'bad.txt'
    # find_previous_question may return None (answer fell out of the
    # windowed memory) — the original crashed with TypeError on str + None.
    question = find_previous_question(answer)
    write_to_file(file_name, answer + ';' + (question or ''))
111
+
112
def find_previous_question(answer_string, buffer=None):
    """Return the 'Human:' question preceding the 'AI:' answer *answer_string*.

    Args:
        answer_string: exact (whitespace-stripped) answer text to look for.
        buffer: transcript to search, one utterance per line; defaults to
            the chain's conversation memory buffer.

    Returns:
        The matching question string, or None if the answer is not found.
    """
    if buffer is None:
        buffer = chain.memory.buffer
    current_question = None
    for line in buffer.split('\n'):
        if line.startswith('Human:'):
            # 'Human:' is 6 characters; the original sliced [7:], which
            # silently dropped the first character of the question whenever
            # no space followed the colon. strip() handles the usual space.
            current_question = line[6:].strip()
        elif line.startswith('AI:') and line[3:].strip() == answer_string:
            # Return the question that immediately preceded this answer.
            return current_question
    return None
124
+
125
# Chat display; likeable=True adds the thumbs up/down buttons that drive
# the vote() callback wired below.
chatbot = gr.Chatbot(height=600, likeable=True)

# Use gradio.Blocks to create a context for your components and event listeners
with gr.Blocks() as demo:
    # Dummy state/textbox: .like() requires declared inputs/outputs even
    # though vote() ignores them.
    index_state = gr.State(value=[])
    tmp = gr.Textbox(visible=False, value="")
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about The little Prince!",
        theme="soft",
        examples=["Who is the little Prince?","What is the capital of France?"],
        # NOTE(review): cache_examples=True runs chat() on the examples at
        # startup — this hits the OpenAI API at launch; confirm intended.
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # Log thumbs-up/down feedback to the dataset repo.
    chatbot.like(vote, [tmp, index_state], [tmp, index_state])

# Start the Gradio server (blocking).
demo.launch()