"""Gradio chatbot that answers questions about documents.

Documents are split into chunks, embedded into a FAISS index, and queried
through a LangChain conversational retrieval chain.
"""
import gradio as gr
import os
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.document_loaders import PyPDFLoader

# Working directory at import time; the bundled PDF paths used further down
# are built relative to it.
# Per-session state (vector store, QA chain) is held in gr.State objects
# created inside main(), so no module-level `global` declarations are needed
# (at module scope they are no-ops anyway).
cwd = os.getcwd()
def load_doc(list_file_path):
    """Load the given documents and split them into overlapping text chunks.

    Args:
        list_file_path: Iterable of document paths. Paths ending in ``.txt``
            (case-insensitive) are read as UTF-8 text; every other path is
            handed to ``PyPDFLoader``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` with a
        chunk size of 1024 and an overlap of 64.
    """
    # Imported locally so this function stays self-contained; the package is
    # already a dependency of this file (PyPDFLoader comes from it too).
    from langchain_community.document_loaders import TextLoader

    pages = []
    for path in list_file_path:
        # The upload widget accepts .txt as well as .pdf, and a PDF parser
        # cannot read plain text, so pick the loader by file extension.
        if str(path).lower().endswith(".txt"):
            loader = TextLoader(path, encoding="utf-8")
        else:
            loader = PyPDFLoader(path)
        pages.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=64
    )
    return text_splitter.split_documents(pages)

def create_db(splits):
    """Embed the document chunks and return a FAISS vector store built from them.

    Args:
        splits: Document chunks to index.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(splits, embedder)

def initialize_chatbot(vector_db):
    """Create a conversational retrieval chain on top of *vector_db*.

    Args:
        vector_db: Vector store; its ``as_retriever()`` supplies the retriever.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``, wired to
        a ``ConversationBufferMemory`` stored under ``"chat_history"``.
    """
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    doc_retriever = vector_db.as_retriever()
    # The API token is read from the environment; .get() yields None if unset.
    endpoint = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN"),
        temperature=0.5,
        max_new_tokens=512,
        task="text-generation",  # task type is stated explicitly
    )
    return ConversationalRetrievalChain.from_llm(
        llm=endpoint,
        retriever=doc_retriever,
        memory=chat_memory,
        verbose=False,
    )

def process_and_initialize(files):
    """Build the vector store and QA chain from the uploaded documents.

    Args:
        files: Uploaded files. Each entry may be a path string (what the
            upload widget configured with ``type="filepath"`` is expected to
            deliver) or an object exposing the path as ``.name``. ``None``
            entries are skipped.

    Returns:
        A ``(vector_db, qa_chain, status_message)`` tuple. The first two
        items are ``None`` when nothing usable was uploaded or when
        processing failed; the message then explains why.
    """
    if not files:
        return None, None, "Please upload a file first."

    # Accept both plain path strings and file-like objects with a .name
    # attribute, and actually use what the user uploaded.
    list_file_path = [
        str(getattr(file, "name", file)) for file in files if file is not None
    ]
    if not list_file_path:
        return None, None, "Please upload a file first."

    try:
        print(list_file_path)
        doc_splits = load_doc(list_file_path)
        db = create_db(doc_splits)
        qa = initialize_chatbot(db)
        return db, qa, "Database created! Ready for questions."
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, None, f"Processing error: {str(e)}"
        
def process_dokumen():
    """Build the vector store and QA chain from the two bundled product PDFs.

    Returns:
        A ``(vector_db, qa_chain, status_message)`` tuple; the first two
        items are ``None`` and the message carries the error text if any
        step raises.
    """
    try:
        pdf_paths = [
            f"{cwd}/produk-aaa-00-intro.pdf",
            f"{cwd}/produk-aaa-01.pdf",
        ]
        print(pdf_paths)
        vector_store = create_db(load_doc(pdf_paths))
        chain = initialize_chatbot(vector_store)
    except Exception as err:
        return None, None, f"Processing error: {err!s}"
    return vector_store, chain, "Database created! Ready for questions."
        
def user_query_typing_effect(query, qa_chain, chatbot):
    """Answer *query* with the QA chain and stream the reply one character at a time.

    Args:
        query: The user's question.
        qa_chain: Object with an ``invoke`` method returning a mapping that
            has an ``"answer"`` key, or ``None`` if no documents have been
            processed yet.
        chatbot: Existing chat history as a list of
            ``{"role": ..., "content": ...}`` dicts, or ``None``.

    Yields:
        ``(history, "")`` tuples: the updated history and an empty string
        for the query textbox output.
    """
    # Work on a copy so the caller's list is never mutated.
    history = list(chatbot or [])
    # Record the question up front so it stays visible even if answering fails
    # (the textbox is cleared on every yield).
    history.append({"role": "user", "content": query})

    if qa_chain is None:
        history.append({
            "role": "assistant",
            "content": "Error: Please process the documents before asking a question.",
        })
        yield history, ""
        return

    try:
        # NOTE(review): an empty chat_history is passed on every call;
        # presumably the memory attached to the chain supplies the real
        # history - confirm against the chain implementation.
        response = qa_chain.invoke({"question": query, "chat_history": []})
        assistant_response = response["answer"]
    except Exception as e:
        history.append({"role": "assistant", "content": f"Error: {str(e)}"})
        yield history, ""
        return

    history.append({"role": "assistant", "content": ""})
    # Yield once before streaming so the UI is updated even for an empty answer.
    yield history, ""
    for char in assistant_response:
        history[-1]["content"] += char
        yield history, ""
        time.sleep(0.03)  # pacing for the typing effect

def main():
    """Build the Gradio interface, wire up its event handlers, and launch it.

    The layout has an upload/status column and a chat column. The vector
    store and QA chain are kept in ``gr.State`` objects and handed to the
    handlers on each event.
    """
    custom_css = """
    body {
        background-color: #FF8C00;
        font-family: Arial, sans-serif;
    }
    .gradio-container {
        border-radius: 15px;
        box-shadow: 0px 4px 20px rgba(0, 0, 0, 0.3);
        padding: 20px;
    }
    footer {
        /* visibility: hidden; */
    }
    .chatbot {
        border: 2px solid #000;
        border-radius: 10px;
        background-color: #FFF5E1;
    }
    """
    with gr.Blocks(css=custom_css) as app:
        # State holders filled by the "Process Documents" handler.
        vector_db = gr.State()
        qa_chain = gr.State()
        gr.Markdown("### 🌟 **PDF & TXT Chatbot** 🌟")
        gr.Markdown("#### Upload your document and ask questions interactively!")
        with gr.Row():
            with gr.Column(scale=1):
                txt_file = gr.Files(
                    label="📁 Upload Documents",
                    file_types=[".txt", ".pdf"],
                    type="filepath"
                )
                analyze_btn = gr.Button("🚀 Process Documents")
                status = gr.Textbox(
                    label="📊 Status",
                    placeholder="Status updates will appear here...",
                    interactive=False
                )
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(
                    label="🤖 Chat with your data",
                    height=600,
                    bubble_full_width=False,
                    show_label=False,
                    render_markdown=True,
                    type="messages",
                    elem_classes=["chatbot"]
                )
                query_input = gr.Textbox(
                    label="Ask a question",
                    placeholder="Ask about the document...",
                    show_label=False,
                    container=False
                )
                query_btn = gr.Button("Ask")
        # Processing fills both state holders and the status box.
        analyze_btn.click(
            fn=process_and_initialize,
            inputs=[txt_file],
            outputs=[vector_db, qa_chain, status],
            show_progress="minimal",
            api_name="satu"
        )
        # The button and the Enter key trigger the same streaming handler.
        query_btn.click(
            fn=user_query_typing_effect,
            inputs=[query_input, qa_chain, chatbot],
            outputs=[chatbot, query_input],
            show_progress="minimal",
            api_name="dua"
        )
        query_input.submit(
            fn=user_query_typing_effect,
            inputs=[query_input, qa_chain, chatbot],
            outputs=[chatbot, query_input],
            show_progress="minimal",
            api_name="tiga"
        )

    app.launch()

# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()