# NOTE(review): the lines "Spaces: Sleeping / Sleeping" are Hugging Face
# Spaces page-status residue from scraping, not part of the program.
import os
import tempfile

import streamlit as st
from groq import Groq
from langchain.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Initialize the Groq API client.
# SECURITY: the API key was previously hard-coded in this file (and therefore
# leaked with the source). Read it from the environment instead; the old key
# must be revoked.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def get_groq_response(prompt, model="llama3-8b-8192"):
    """Send a single-turn user prompt to the Groq chat API.

    Args:
        prompt: The user message to send.
        model: Groq model identifier (defaults to Llama 3 8B).

    Returns:
        The assistant's reply text from the first completion choice.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(messages=messages, model=model)
    return completion.choices[0].message.content
def process_file(uploaded_file):
    """Load an uploaded PDF/DOCX/TXT file into LangChain documents.

    The upload is spooled to a temporary file (the loaders need a real
    path), parsed with the loader matching its MIME type, and the temp
    file is always removed afterwards.

    Args:
        uploaded_file: A Streamlit ``UploadedFile`` (has ``.type`` and
            ``.getvalue()``).

    Returns:
        A list of LangChain documents, or ``None`` (with a Streamlit
        error shown) for unsupported MIME types.
    """
    # MIME type -> (temp-file suffix, loader class). The suffix must match
    # the real content type: the original code wrote ".pdf" for every
    # upload, which hands extension-sensitive loaders a mislabeled file.
    loaders = {
        "application/pdf": (".pdf", PyPDFLoader),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
            ".docx",
            UnstructuredWordDocumentLoader,
        ),
        "text/plain": (".txt", TextLoader),
    }

    entry = loaders.get(uploaded_file.type)
    if entry is None:
        # Reject before creating the temp file — the original leaked the
        # temp file on this branch by returning before os.remove().
        st.error("Unsupported file type.")
        return None
    suffix, loader_cls = entry

    # Save the uploaded file to a temporary file with the correct suffix.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    try:
        return loader_cls(temp_file_path).load()
    finally:
        # Clean up even if the loader raises.
        os.remove(temp_file_path)
def answer_with_retrieval(prompt, retriever):
    """Answer a query via retrieval-augmented generation.

    Fetches documents relevant to ``prompt`` from ``retriever``, prepends
    their concatenated text to the prompt, and asks the Groq model.

    Args:
        prompt: The user's question.
        retriever: A LangChain retriever exposing ``get_relevant_documents``.

    Returns:
        The model's answer string.
    """
    relevant_docs = retriever.get_relevant_documents(prompt)
    context_text = " ".join(doc.page_content for doc in relevant_docs)
    return get_groq_response(f"{context_text}\n\n{prompt}")
# ---------------------------------------------------------------------------
# Streamlit UI: upload a document, chunk + embed it into Chroma, then answer
# user queries with retrieval-augmented generation.
# ---------------------------------------------------------------------------
st.title("Upload and Interact with File Content")

uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
if uploaded_file:
    # Process the uploaded file into LangChain documents.
    documents = process_file(uploaded_file)
    if documents:
        # Small chunks with overlap keep retrieval granular while preserving
        # context across chunk boundaries.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
        chunked_documents = text_splitter.split_documents(documents)
        # Ensure the chunked documents list is not empty.
        if not chunked_documents:
            st.error("No content extracted from the document.")
        else:
            # SECURITY: the Hugging Face token was previously hard-coded here
            # (and therefore leaked). Read it from the environment; the old
            # token must be revoked.
            hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
            if not hf_token:
                st.error("HUGGINGFACEHUB_API_TOKEN environment variable is not set.")
                st.stop()
            embeddings = HuggingFaceInferenceAPIEmbeddings(
                api_key=hf_token, model_name="BAAI/bge-base-en-v1.5"
            )
            # Debug: check the length of chunked_documents.
            st.write(f"Number of document chunks: {len(chunked_documents)}")
            # Attempt to create the vector store and serve queries.
            try:
                vectorstore = Chroma.from_documents(chunked_documents, embeddings)
                retriever = vectorstore.as_retriever(
                    search_type="mmr", search_kwargs={"k": 3}
                )
                # User query.
                query = st.text_input("Enter your query:")
                if query:
                    response = answer_with_retrieval(query, retriever)
                    st.write("### Response")
                    st.write(response)
            except IndexError as ie:
                st.error(f"IndexError during vector store creation: {str(ie)}")
            except Exception as e:
                st.error(f"Error creating vector store or generating embeddings: {str(e)}")