File size: 4,653 Bytes
fd9ca75
 
 
 
 
 
 
 
 
 
 
 
 
 
837390a
d9eb9d8
fd9ca75
 
 
 
 
09e12a6
ccd1325
fd9ca75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
800d5f0
 
fd9ca75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a8ec9
fd9ca75
 
 
 
 
 
 
 
 
 
 
 
 
28f8ec9
a53bc7d
277e13c
a53bc7d
28f8ec9
8a087e4
277e13c
 
 
03676b1
277e13c
 
cdf1ed1
28f8ec9
a53bc7d
fd9ca75
 
 
 
 
ac49d87
fd9ca75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe5933c
fd9ca75
fe5933c
fd9ca75
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
"""Doc_chat_vegleges_like.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV

# Setting up environment
"""

from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import ElasticVectorSearch, Pinecone, Weaviate
from langchain_community.vectorstores import FAISS

# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os

# SECURITY: the original code printed the raw API key and dataset token to
# stdout, leaking secrets into logs. Verify presence only and fail fast with
# a clear message instead.
for _required_env in ("OPENAI_API_KEY", "DATASET_ACCES"):
    if not os.environ.get(_required_env):
        raise RuntimeError(f"Missing required environment variable: {_required_env}")
    print(f"{_required_env} is set ({len(os.environ[_required_env])} chars)")
"""# Preprocessing document"""

# Location of the pdf file/files.
reader = PdfReader('samu-en-567.pdf')
#reader = PdfReader('/content/WOW.pdf')
#reader = PdfReader('/content/the_little_prince.pdf')
#reader = PdfReader('/content/constitution.pdf')

# Read data from the file. Collect per-page text in a list and join once:
# repeated `raw_text += text` re-copies the whole accumulated string on
# every page and is quadratic for large documents.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:  # extract_text() can return None/'' for image-only pages
        page_texts.append(text)
raw_text = ''.join(page_texts)

# Split the text into smaller chunks so that during information retrieval
# we don't hit the token size limits.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=150,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

len(texts)  # notebook artifact: shows the chunk count in Colab; no effect as a script

"""## Setting up doc search"""

embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)

"""# Setting up chatbot"""

from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
Any questions outside of the document is irrelevant and you certanly dont know! If You cannot find the answer say "The document does not contain that information."

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)

memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)

"""# Demo

## Setting up methods
"""

def chat(query, history):
  """Answer `query` from the indexed document.

  `history` is supplied by gr.ChatInterface but intentionally unused: the
  LangChain memory attached to `chain` tracks the conversation instead.
  """
  relevant_docs = doc_search.similarity_search(query)
  result = chain(
      {"input_documents": relevant_docs, "human_input": query},
      return_only_outputs=True,
  )
  return result['output_text']

"""## Setting up UI with gradio"""

import gradio as gr
from huggingface_hub import HfFileSystem

fs = HfFileSystem(token=os.environ.get('DATASET_ACCES'))

def write_to_file(file_name, content):
    """Append `content` as a new line to `file_name` in the HF feedback dataset.

    HfFileSystem writes replace the whole file, so we read the existing
    contents first and rewrite everything plus the new line.

    Fixes: the original crashed with FileNotFoundError on the first-ever
    feedback write, because it opened the file for reading before it existed.
    """
    file_path = "datasets/mgreg555/samu_reference_book/" + file_name
    try:
        with fs.open(file_path, "r") as file_old:
            content_old = file_old.read()
    except FileNotFoundError:
        # First piece of feedback for this file: start from empty.
        content_old = ""

    with fs.open(file_path, "w") as file:
        file.write(f"{content_old}\n" + content)

# Example usage
        

def vote(tmp, index_state, data: gr.LikeData):
    """Persist like/dislike feedback for a chatbot answer.

    Writes "answer;question" to good.txt (liked) or bad.txt (disliked)
    in the HF feedback dataset.
    """
    answer = data.value
    file_name = 'good.txt' if data.liked else 'bad.txt'
    question = find_previous_question(answer)
    # Guard against None: the answer may have rotated out of the windowed
    # memory buffer, and `answer + ';' + None` would raise TypeError.
    write_to_file(file_name, answer + ';' + (question or ''))

def find_previous_question(answer_string):
    """Return the Human question immediately preceding `answer_string` in the
    chain's conversation memory, or None if it is not found.

    The memory buffer is formatted as alternating "Human: ..." / "AI: ..."
    lines (see ConversationBufferWindowMemory above).
    """
    current_question = None
    for line in chain.memory.buffer.split('\n'):
        if line.startswith('Human:'):
            # Slice by prefix length, not a magic [7:]: the old code silently
            # dropped the question's first character when no space followed
            # the colon. strip() removes the separating space either way.
            current_question = line[len('Human:'):].strip()
        elif line.startswith('AI:') and line[len('AI:'):].strip() == answer_string:
            return current_question  # question that preceded the matched answer
    return None

# Chat display area; likeable=True adds thumbs-up/down icons that fire the
# .like event wired to `vote` below.
chatbot = gr.Chatbot(height=600, likeable=True)

# Use gradio.Blocks to create a context for your components and event listeners
with gr.Blocks() as demo:
    # Hidden helper components: the .like handler needs input/output
    # components even though vote() only reads the gr.LikeData payload.
    index_state = gr.State(value=[])
    tmp = gr.Textbox(visible=False, value="")
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about SAMU!",
        theme="soft",
        examples=["What is SAMU?","What is the capital of France?"],
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # Route thumbs-up/down events to the feedback writer.
    chatbot.like(vote, [tmp, index_state], [tmp, index_state])

demo.launch()