import os
import tempfile

import streamlit as st
from groq import Groq
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# SECURITY FIX: API keys were previously hard-coded in this file. Any key that
# was ever committed must be considered compromised — revoke/rotate it with the
# provider. Keys are now read from the environment (or use st.secrets).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
HF_API_KEY = os.environ.get("HF_API_KEY", "")

# Groq API client used for all chat completions.
client = Groq(api_key=GROQ_API_KEY)

# MIME type -> (temp-file suffix, LangChain loader class). Centralizes the
# dispatch and guarantees the temp file carries the *correct* extension
# (previously every upload was written with a ".pdf" suffix, which can
# confuse loaders that infer format from the extension).
_LOADERS = {
    "application/pdf": (".pdf", PyPDFLoader),
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
        ".docx",
        UnstructuredWordDocumentLoader,
    ),
    "text/plain": (".txt", TextLoader),
}


def get_groq_response(prompt, model="llama3-8b-8192"):
    """Send a single-turn user prompt to the Groq chat API.

    Args:
        prompt: The user message to send.
        model: Groq model identifier (defaults to Llama-3 8B).

    Returns:
        The assistant's reply text.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content


def process_file(uploaded_file):
    """Load an uploaded PDF/DOCX/TXT file into LangChain documents.

    Args:
        uploaded_file: A Streamlit UploadedFile from st.file_uploader.

    Returns:
        A list of LangChain Document objects, or None if the MIME type is
        unsupported (a Streamlit error is shown in that case).
    """
    entry = _LOADERS.get(uploaded_file.type)
    if entry is None:
        st.error("Unsupported file type.")
        return None
    suffix, loader_cls = entry

    # Persist the upload to disk because the loaders expect a file path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    try:
        return loader_cls(temp_file_path).load()
    finally:
        # BUG FIX: the temp file previously leaked on the unsupported-type
        # path and whenever a loader raised; always clean it up.
        os.remove(temp_file_path)


def answer_with_retrieval(prompt, retriever):
    """Answer a query using retrieved document chunks as context.

    Args:
        prompt: The user's question.
        retriever: A LangChain retriever over the uploaded document.

    Returns:
        The LLM's answer, grounded in the retrieved context.
    """
    context_docs = retriever.get_relevant_documents(prompt)
    context_text = " ".join(doc.page_content for doc in context_docs)
    # Prepend the retrieved context so the model answers from the document.
    return get_groq_response(f"{context_text}\n\n{prompt}")


# --------------------------------------------------------------------------
# Streamlit UI
# --------------------------------------------------------------------------
st.title("Upload and Interact with File Content")

uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Process the uploaded file into LangChain documents.
    documents = process_file(uploaded_file)
    if documents:
        # Split the documents into overlapping chunks sized for the
        # embedding model's context.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=256, chunk_overlap=50
        )
        chunked_documents = text_splitter.split_documents(documents)

        # Ensure the chunked documents list is not empty.
        if not chunked_documents:
            st.error("No content extracted from the document.")
        else:
            # Generate embeddings via the HuggingFace Inference API.
            embeddings = HuggingFaceInferenceAPIEmbeddings(
                api_key=HF_API_KEY, model_name="BAAI/bge-base-en-v1.5"
            )

            # Debug: check the length of chunked_documents.
            st.write(f"Number of document chunks: {len(chunked_documents)}")

            # Attempt to create vector store.
            try:
                vectorstore = Chroma.from_documents(chunked_documents, embeddings)
                retriever = vectorstore.as_retriever(
                    search_type="mmr", search_kwargs={"k": 3}
                )

                # User query
                query = st.text_input("Enter your query:")
                if query:
                    response = answer_with_retrieval(query, retriever)
                    st.write("### Response")
                    st.write(response)
            except IndexError as ie:
                st.error(f"IndexError during vector store creation: {str(ie)}")
            except Exception as e:
                st.error(
                    f"Error creating vector store or generating embeddings: {str(e)}"
                )