File size: 4,395 Bytes
73635cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793fe1b
73635cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aef8e61
73635cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ccd64f2
73635cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793fe1b
73635cb
793fe1b
73635cb
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
"""Doc_chat_vegleges_like.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Igjhvd8GhC8qJf7syPEa2x0KKjroy7KV

# Setting up environment
"""

from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import ElasticVectorSearch, Pinecone, Weaviate
from langchain_community.vectorstores import FAISS

# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os

# Fail fast if the key is missing, but never echo the secret itself:
# the original printed the raw API key to stdout, leaking it into logs.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

"""# Preprocessing document"""

# location of the pdf file/files.
reader = PdfReader('The_Little_Prince.pdf')

# read data from the file
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

# We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits.

text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 150,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

len(texts)

"""## Setting up doc search"""

embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)

"""# Setting up chatbot"""

from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
If You cannot find the answer say "The document does not contain that information."

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)

memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input",k=3)
chain = load_qa_chain( OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)

"""# Demo

## Setting up methods
"""

def chat(query, history):
    """Answer *query* from the indexed document only.

    *history* is supplied by gr.ChatInterface but is unused here: the QA
    chain maintains its own windowed conversation memory.
    """
    relevant_docs = doc_search.similarity_search(query)
    result = chain(
        {"input_documents": relevant_docs, "human_input": query},
        return_only_outputs=True,
    )
    return result['output_text']

"""## Setting up UI with gradio"""

import gradio as gr
from huggingface_hub import HfFileSystem

fs = HfFileSystem(token=os.environ.get('DATASET_ACCES'))

def write_to_file(file_name, content):
    """Append *content* as a new line to *file_name* in the HF dataset repo.

    HfFileSystem has no true append mode, so the remote file is read in
    full, the new line appended, and the whole file written back.
    """
    file_path = "datasets/mgreg555/Little_Prince/" + file_name
    try:
        with fs.open(file_path, "r") as file_old:
            content_old = file_old.read()
    except FileNotFoundError:
        # First vote for this file: start from empty instead of crashing
        # (the original required the file to already exist).
        content_old = ""

    with fs.open(file_path, "w") as file:
        file.write(f"{content_old}\n" + content)

def vote(tmp, index_state, data: gr.LikeData):
    """Handle a thumbs-up/down click on a chatbot message.

    Persists the rated answer together with the question that produced it,
    as an 'answer;question' line in good.txt (liked) or bad.txt (disliked).
    *tmp* and *index_state* are placeholder components required by the
    .like() wiring; they are not used here.
    """
    answer = data.value
    file_name = 'good.txt' if data.liked else 'bad.txt'
    # find_previous_question may return None when the answer is not found
    # in memory (e.g. cached example replies); the original crashed with a
    # TypeError on str + None in that case.
    question = find_previous_question(answer) or ''
    write_to_file(file_name, answer + ';' + question)

def find_previous_question(answer_string):
    """Return the Human question that preceded the AI answer *answer_string*
    in the chain's conversation memory, or None if the answer is not found.

    The memory buffer is a plain-text transcript of alternating
    'Human: ...' and 'AI: ...' lines.
    """
    current_question = None

    for line in chain.memory.buffer.split('\n'):
        if line.startswith('Human:'):
            # Strip exactly the 'Human:' prefix, then whitespace. The
            # original sliced line[7:], which silently ate the first
            # character of the question whenever no space followed the
            # colon. (Also dropped: an unused 'last_question' local.)
            current_question = line[len('Human:'):].strip()
        elif line.startswith('AI:') and line[len('AI:'):].strip() == answer_string:
            return current_question  # the question immediately before this answer

    return None

# Chat display area; likeable=True adds thumbs-up/down buttons to every bot
# message so users can rate individual answers.
chatbot = gr.Chatbot(height=600, likeable=True)

# Use gradio.Blocks to create a context for your components and event listeners
with gr.Blocks() as demo:
    # Hidden placeholder components required by the .like() wiring below;
    # vote() ignores their values.
    index_state = gr.State(value=[])
    tmp = gr.Textbox(visible=False, value="")
    gr.ChatInterface(
        chat,
        chatbot=chatbot,
        title="Doc-chat",
        description="Ask about The Little Prince!",
        theme="soft",
        examples=["Who is the Little Prince?","What is the capital of France?"],
        cache_examples=True,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    # Route thumb clicks to vote(), which logs the rated answer and the
    # question that produced it to the HF dataset.
    chatbot.like(vote, [tmp, index_state], [tmp, index_state])

demo.launch()