import streamlit as st
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from dotenv import load_dotenv
import os
import base64
import docx2txt

# Load environment variables
load_dotenv()

icons = {"assistant": "robot.png", "user": "man-kddi.png"}

# Configure LlamaIndex global settings: google/gemma-7b-it served through the Hugging Face Inference API
Settings.llm = HuggingFaceInferenceAPI(
    model_name="google/gemma-7b-it",
    tokenizer_name="google/gemma-7b-it",
    context_window=3900,
    token=os.getenv("HF_TOKEN"),
    max_new_tokens=1000,
    generate_kwargs={"temperature": 0.5},
)

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"  # English embedding model; adjust if an Italian model is available
)
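# Note: for non-English (e.g. Italian) documents, a multilingual embedding
# model such as "intfloat/multilingual-e5-small" could be swapped in here.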

# Define the directory for persistent storage and data
PERSIST_DIR = "./db"
DATA_DIR = "data"

# Ensure data directory exists
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)

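# Render an uploaded PDF inline by embedding it as a base64 data URI in an iframe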
def displayPDF(file):
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

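# Extract the raw text from a DOCX file with docx2txt and show it in a scrollable text area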
def displayDOCX(file):
    text = docx2txt.process(file)
    st.text_area("Document Content", text, height=400)

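# Read a plain-text file and show it in a scrollable text area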
def displayTXT(file):
    with open(file, "r") as f:
        text = f.read()
    st.text_area("Document Content", text, height=400)

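# Load every document in DATA_DIR, build a vector index over it, and persist the index to disk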
def data_ingestion():
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

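# Reload the persisted index from disk and answer the query with a custom QA
# prompt. Reloading on every call is simple but not the fastest option; the
# index could instead be cached (e.g. in st.session_state) after ingestion.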
def handle_query(query):
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    # ChatPromptTemplate.from_messages expects (role, content) pairs; the
    # LLM's chat formatter inserts Gemma's special turn tokens itself.
    chat_text_qa_msgs = [
        (
            "user",
            """You are a Q&A assistant named CHAT-DOC. Your main goal is to provide answers as accurately as possible, based on the instructions and context given to you. If a question does not match the provided context or is outside the scope of the document, kindly advise the user to ask questions within the context of the document.

            Context:
                {context_str}

            Question:
                {query_str}
            """,
        )
    ]
    text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
    query_engine = index.as_query_engine(text_qa_template=text_qa_template)
    answer = query_engine.query(query)
    
    if hasattr(answer, 'response'):
        return answer.response
    elif isinstance(answer, dict) and 'response' in answer:
        return answer['response']
    else:
        return "Sorry, I couldn't find an answer."

# Streamlit app initialization
st.title("Chat with Your Document 📄")
st.markdown("Chat here👇")

if 'messages' not in st.session_state:
    st.session_state.messages = [{'role': 'assistant', "content": 'Hello! Upload a PDF, DOCX, or TXT file and ask me anything about its content.'}]

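# Replay the full chat history on every Streamlit rerun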
for message in st.session_state.messages:
    with st.chat_message(message['role'], avatar=icons[message['role']]):
        st.write(message['content'])

with st.sidebar:
    st.title("Menu:")
    uploaded_file = st.file_uploader("Upload your document (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])
    if st.button("Submit & Process") and uploaded_file:
        with st.spinner("Processing..."):
            # Remove any previously uploaded file so stale documents
            # don't leak into the rebuilt index
            for old_file in os.listdir(DATA_DIR):
                os.remove(os.path.join(DATA_DIR, old_file))
            file_extension = os.path.splitext(uploaded_file.name)[1].lower()
            filepath = os.path.join(DATA_DIR, "uploaded_file" + file_extension)
            with open(filepath, "wb") as f:
                f.write(uploaded_file.getbuffer())
            
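            # Show an inline preview of the saved file in the main pane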
            if file_extension == ".pdf":
                displayPDF(filepath)
            elif file_extension == ".docx":
                displayDOCX(filepath)
            elif file_extension == ".txt":
                displayTXT(filepath)
            
            data_ingestion()  # Rebuild the vector index over the newly uploaded file
            st.success("Done")

# st.chat_input returns the prompt only on the run in which it is submitted,
# so each question is answered once instead of on every Streamlit rerun.
user_prompt = st.chat_input("Ask me anything about the content of the document")

if user_prompt and uploaded_file:
    st.session_state.messages.append({'role': 'user', "content": user_prompt})
    with st.chat_message("user", avatar=icons["user"]):
        st.write(user_prompt)

    # Trigger assistant's response retrieval and update UI
    with st.spinner("Thinking..."):
        response = handle_query(user_prompt)
        with st.chat_message("assistant", avatar=icons["assistant"]):
            st.write(response)
        st.session_state.messages.append({'role': 'assistant', "content": response})