File size: 3,646 Bytes
6a7fe6e
 
 
 
 
0935401
6a7fe6e
 
 
 
 
 
 
 
 
 
 
 
 
 
0935401
 
 
 
6a7fe6e
0935401
 
 
6a7fe6e
0935401
 
6a7fe6e
0935401
 
6a7fe6e
 
 
 
 
0935401
 
6a7fe6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5aa73e9
 
 
 
 
 
 
6a7fe6e
4ff1083
 
 
 
5aa73e9
 
 
6a7fe6e
5aa73e9
 
6a7fe6e
5aa73e9
 
 
 
4ff1083
 
5aa73e9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import streamlit as st
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import Chroma
import tempfile
import os
from groq import Groq

# Initialize the Groq API client.
# SECURITY: API keys must never be committed to source control — this key has
# been exposed and should be revoked/rotated immediately. The environment
# variable GROQ_API_KEY takes precedence; the hard-coded value remains only as
# a backward-compatible fallback until the key is rotated.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", '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'))

def get_groq_response(prompt, model="llama3-8b-8192"):
    """Send a single-turn user prompt to the Groq chat API.

    Args:
        prompt: The user message to send as a single-turn conversation.
        model: Groq model identifier; defaults to "llama3-8b-8192".

    Returns:
        The text content of the first completion choice.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def process_file(uploaded_file):
    """Load an uploaded PDF/DOCX/TXT file into LangChain documents.

    The in-memory Streamlit upload is written to a temporary file so the
    path-based LangChain loaders can read it; the temp file is always removed.

    Args:
        uploaded_file: A Streamlit UploadedFile (has `.type` and `.getvalue()`).

    Returns:
        A list of LangChain Document objects, or None for unsupported types.
    """
    # Map MIME type -> (temp-file suffix, loader class). The suffix must match
    # the actual format: the original always used ".pdf", which can mislead
    # loaders that inspect the file extension (e.g. the Word loader).
    loaders_by_type = {
        "application/pdf": (".pdf", PyPDFLoader),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (".docx", UnstructuredWordDocumentLoader),
        "text/plain": (".txt", TextLoader),
    }

    entry = loaders_by_type.get(uploaded_file.type)
    if entry is None:
        # Reject before creating a temp file (the original created and then
        # leaked one on this path).
        st.error("Unsupported file type.")
        return None

    suffix, loader_cls = entry

    # Persist the upload to disk with the correct extension.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    try:
        documents = loader_cls(temp_file_path).load()
    finally:
        # Clean up even if the loader raises (the original leaked the temp
        # file on loader errors).
        os.remove(temp_file_path)

    return documents

def answer_with_retrieval(prompt, retriever):
    """Answer a query by prepending retrieved document context to the prompt.

    Args:
        prompt: The user's question.
        retriever: A LangChain retriever used to fetch relevant chunks.

    Returns:
        The LLM response string from the Groq API.
    """
    # Fetch the relevant chunks and flatten them into one context string.
    relevant_docs = retriever.get_relevant_documents(prompt)
    context_text = " ".join(doc.page_content for doc in relevant_docs)
    # Context first, then the question, separated by a blank line.
    return get_groq_response(f"{context_text}\n\n{prompt}")

# --- Streamlit UI -----------------------------------------------------------
# Upload a document, chunk + embed it into an in-memory Chroma store, then
# answer free-form queries with retrieval-augmented prompts.
st.title("Upload and Interact with File Content")

uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Process the uploaded file into LangChain documents.
    documents = process_file(uploaded_file)

    if documents:
        # Split into small overlapping chunks so retrieval returns focused
        # passages rather than whole pages.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
        chunked_documents = text_splitter.split_documents(documents)

        # Guard: a scanned/empty document can yield no chunks at all.
        if not chunked_documents:
            st.error("No content extracted from the document.")
        else:
            # SECURITY: this Hugging Face token has been exposed in source and
            # should be revoked immediately. HF_API_TOKEN from the environment
            # takes precedence; the literal remains only as a backward-
            # compatible fallback until the token is rotated.
            HF_token = os.environ.get("HF_API_TOKEN", "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")
            embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_token, model_name="BAAI/bge-base-en-v1.5")

            # Debug aid: confirm the splitter produced chunks.
            st.write(f"Number of document chunks: {len(chunked_documents)}")

            try:
                # Build the vector store and an MMR retriever returning the
                # top 3 diverse chunks per query.
                vectorstore = Chroma.from_documents(chunked_documents, embeddings)
                retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})

                # User query
                query = st.text_input("Enter your query:")

                if query:
                    response = answer_with_retrieval(query, retriever)
                    st.write("### Response")
                    st.write(response)
            except IndexError as ie:
                # Seen when the embedding API returns an empty result (e.g.
                # bad token), which Chroma surfaces as an IndexError.
                st.error(f"IndexError during vector store creation: {str(ie)}")
            except Exception as e:
                # Top-level UI boundary: surface any other failure to the user
                # instead of crashing the app.
                st.error(f"Error creating vector store or generating embeddings: {str(e)}")