import streamlit as st
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from dotenv import load_dotenv
import os
import base64
import docx2txt
# Load environment variables (via python-dotenv) before anything reads them;
# HF_TOKEN is fetched with os.getenv further down.
load_dotenv()
# Avatar image paths handed to st.chat_message, keyed by chat role.
icons = {"assistant": "robot.png", "user": "man-kddi.png"}
# Configure the Llama index settings for the Google/Gemma-7B-IT model and English embedding.
# Both are assigned on the shared Settings object rather than being passed to
# each index or query engine explicitly.
Settings.llm = HuggingFaceInferenceAPI(
model_name="google/gemma-7b-it",
tokenizer_name="google/gemma-7b-it",
context_window=3900,
token=os.getenv("HF_TOKEN"),
max_new_tokens=1000,
generate_kwargs={"temperature": 0.5},
)
Settings.embed_model = HuggingFaceEmbedding(
model_name="BAAI/bge-small-en-v1.5" # English embedding model; adjust if an Italian model is available
)
# Define the directory for persistent storage and data:
# PERSIST_DIR is where the built index is persisted and reloaded from,
# DATA_DIR is where uploaded documents are saved and read for ingestion.
PERSIST_DIR = "./db"
DATA_DIR = "data"
# Ensure the data and persistence directories exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)
def displayPDF(file):
    """Embed the PDF stored at path ``file`` in the page as an inline iframe.

    The file is read as bytes, base64-encoded, and placed in a ``data:`` URI
    so the browser can render it without a separate download URL.
    """
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = (
        '<iframe src="data:application/pdf;base64,'
        + encoded
        + '" width="100%" height="600" type="application/pdf"></iframe>'
    )
    # Raw HTML is required here; st.markdown escapes tags by default.
    st.markdown(iframe_html, unsafe_allow_html=True)
def displayDOCX(file):
    """Show the text extracted from the DOCX at path ``file`` in a text area."""
    st.text_area("Document Content", docx2txt.process(file), height=400)
def displayTXT(file):
    """Show the contents of the text file at path ``file`` in a text area.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, which differs between systems and can make
    a UTF-8 upload garbled or raise ``UnicodeDecodeError``. Undecodable bytes
    are replaced rather than aborting the preview.
    """
    with open(file, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()
    st.text_area("Document Content", text, height=400)
def data_ingestion():
    """Build a vector index from everything in ``DATA_DIR`` and persist it.

    Reads all documents found in ``DATA_DIR``, indexes them, and writes the
    index's storage to ``PERSIST_DIR`` so that queries can reload it later.
    Returns nothing.
    """
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    # The index creates its own storage context; the separate one the
    # original built here was never passed in or read, so it was dropped.
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
def handle_query(query):
    """Answer ``query`` using the index persisted in ``PERSIST_DIR``.

    Reloads the stored index, builds a query engine with a custom Q&A
    prompt, runs the query, and returns the answer text.

    Args:
        query: The user's question as a string.

    Returns:
        The response text, or a fixed apology string when the engine returns
        no usable text.
    """
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    qa_prompt = (
        "You are a Q&A assistant named CHAT-DOC. Your main goal is to provide "
        "answers as accurately as possible, based on the instructions and "
        "context given to you. If a question does not match the provided "
        "context or is outside the scope of the document, kindly advise the "
        "user to ask questions within the context of the document.\n"
        "Context:\n"
        "{context_str}\n"
        "Question:\n"
        "{query_str}"
    )
    # from_messages takes (role, content) pairs. The original wrote two
    # adjacent string literals inside the parentheses, which Python joins
    # into a single str (also gluing "user" onto "You are..."), so no role
    # was ever supplied.
    # NOTE(review): the hand-written <start_of_turn>/<end_of_turn> markers
    # were dropped on the assumption that the role field plus the LLM
    # integration's own message formatting replace them - confirm against
    # the deployed model.
    chat_text_qa_msgs = [("user", qa_prompt)]
    text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
    query_engine = index.as_query_engine(text_qa_template=text_qa_template)

    answer = query_engine.query(query)
    response_text = getattr(answer, "response", None)
    if response_text is None and isinstance(answer, dict):
        response_text = answer.get("response")
    # A missing or empty response falls back to the apology instead of
    # handing None / "" to the UI.
    return response_text or "Sorry, I couldn't find an answer."
# Streamlit app initialization
st.title("Chat with Your Document 📄")
st.markdown("Chat here👇")

# Seed the chat history once per session with the assistant's greeting.
if 'messages' not in st.session_state:
    st.session_state.messages = [{'role': 'assistant', "content": 'Hello! Upload a PDF, DOCX, or TXT file and ask me anything about its content.'}]

# Replay the stored conversation on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message['role'], avatar=icons[message['role']]):
        st.write(message['content'])

with st.sidebar:
    st.title("Menu:")
    uploaded_file = st.file_uploader("Upload your document (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])
    if st.button("Submit & Process") and uploaded_file:
        with st.spinner("Processing..."):
            file_extension = os.path.splitext(uploaded_file.name)[1].lower()

            # Uploads are saved as "uploaded_file<ext>", so an earlier upload
            # with a different extension would otherwise stay in DATA_DIR and
            # be indexed together with the new document.
            for existing_name in os.listdir(DATA_DIR):
                if os.path.splitext(existing_name)[0] == "uploaded_file":
                    os.remove(os.path.join(DATA_DIR, existing_name))

            filepath = os.path.join(DATA_DIR, "uploaded_file" + file_extension)
            with open(filepath, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Preview the saved document with the viewer matching its type.
            if file_extension == ".pdf":
                displayPDF(filepath)
            elif file_extension == ".docx":
                displayDOCX(filepath)
            elif file_extension == ".txt":
                displayTXT(filepath)

            data_ingestion()  # Process file every time a new file is uploaded
            # Remember that an index was built in this session so queries are
            # not run against a missing or leftover index.
            st.session_state.index_ready = True
            st.success("Done")

# NOTE(review): st.text_input keeps its value across reruns, so any rerun
# triggered by another widget re-enters the branch below with the same prompt
# and appends/queries it again - consider st.chat_input.
user_prompt = st.text_input("Ask me anything about the content of the document:")
if user_prompt and uploaded_file:
    if not st.session_state.get("index_ready"):
        # A file is selected but was never processed; querying now would hit
        # an index that does not exist or belongs to an older document.
        st.warning("Click 'Submit & Process' in the sidebar before asking questions.")
    else:
        st.session_state.messages.append({'role': 'user', "content": user_prompt})
        with st.chat_message("user", avatar=icons["user"]):
            st.write(user_prompt)
        # Trigger assistant's response retrieval and update UI
        with st.spinner("Thinking..."):
            response = handle_query(user_prompt)
        with st.chat_message("assistant", avatar=icons["assistant"]):
            st.write(response)
        st.session_state.messages.append({'role': 'assistant', "content": response})