# DocumentsChats / app.py
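# Streamlit app: upload a PDF, Word, or plain-text document, chunk and embed its
# contents, then answer user questions over the content with Groq chat completions.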
import streamlit as st
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import Chroma
import tempfile
import os
from groq import Groq
# Initialize the Groq API client. The API key is assumed to be supplied via the
# GROQ_API_KEY environment variable rather than hardcoded in the source.
client = Groq(api_key=os.environ["GROQ_API_KEY"])
def get_groq_response(prompt, model="llama3-8b-8192"):
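    """Send a single user prompt to the Groq chat completions endpoint and return the reply text."""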
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model=model,
)
return chat_completion.choices[0].message.content
def process_file(uploaded_file):
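    """Write the uploaded file to a temporary path, load it with the matching LangChain loader, and return the documents."""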
    # Save the uploaded file to a temporary file, keeping its original extension so the
    # loaders below detect the correct file type
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
# Process the file based on its type
if uploaded_file.type == "application/pdf":
pdf_loader = PyPDFLoader(temp_file_path)
documents = pdf_loader.load()
elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
word_loader = UnstructuredWordDocumentLoader(temp_file_path)
documents = word_loader.load()
elif uploaded_file.type == "text/plain":
text_loader = TextLoader(temp_file_path)
documents = text_loader.load()
    else:
        st.error("Unsupported file type.")
        os.remove(temp_file_path)  # clean up the temporary file before returning
        return None
# Clean up the temporary file
os.remove(temp_file_path)
return documents
def answer_with_retrieval(prompt, retriever):
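    """Retrieve the most relevant chunks for the prompt and prepend them as context before querying Groq."""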
context = retriever.get_relevant_documents(prompt)
context_text = " ".join([doc.page_content for doc in context])
combined_prompt = f"{context_text}\n\n{prompt}"
return get_groq_response(combined_prompt)
# Streamlit UI
st.title("Upload and Interact with File Content")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
if uploaded_file:
# Process the uploaded file
documents = process_file(uploaded_file)
if documents:
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
chunked_documents = text_splitter.split_documents(documents)
# Ensure the chunked documents list is not empty
if not chunked_documents:
st.error("No content extracted from the document.")
else:
            # Generate embeddings with the BAAI/bge-base-en-v1.5 model via the Hugging Face
            # Inference API. The token is assumed to come from the HF_TOKEN environment variable.
            HF_token = os.environ["HF_TOKEN"]
            embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_token, model_name="BAAI/bge-base-en-v1.5")
# Debug: Check the length of chunked_documents
st.write(f"Number of document chunks: {len(chunked_documents)}")
# Attempt to create vector store
try:
vectorstore = Chroma.from_documents(chunked_documents, embeddings)
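                # Use maximal marginal relevance (MMR) search and return the top 3 chunks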
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
# User query
query = st.text_input("Enter your query:")
if query:
response = answer_with_retrieval(query, retriever)
st.write("### Response")
st.write(response)
except IndexError as ie:
st.error(f"IndexError during vector store creation: {str(ie)}")
except Exception as e:
st.error(f"Error creating vector store or generating embeddings: {str(e)}")