# DocumentsChats / app.py
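# Streamlit app: upload a PDF, Word, or plain-text document, chunk and embed its
# contents, then answer user questions over the content with Groq chat completions.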
import streamlit as st
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import Chroma
import tempfile
import os
from groq import Groq
# Initialize the Groq API client. The API key is assumed to be supplied via the
# GROQ_API_KEY environment variable rather than hardcoded in the source.
client = Groq(api_key=os.environ["GROQ_API_KEY"])
def get_groq_response(prompt, model="llama3-8b-8192"):
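    """Send a single user prompt to the Groq chat completions endpoint and return the reply text."""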
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model=model,
)
return chat_completion.choices[0].message.content
def process_file(uploaded_file):
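    """Write the uploaded file to a temporary path, load it with the matching LangChain loader, and return the documents."""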
    # Save the uploaded file to a temporary file, keeping its original extension so the
    # loaders below detect the correct file type
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
# Process the file based on its type
if uploaded_file.type == "application/pdf":
pdf_loader = PyPDFLoader(temp_file_path)
documents = pdf_loader.load()
elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
word_loader = UnstructuredWordDocumentLoader(temp_file_path)
documents = word_loader.load()
elif uploaded_file.type == "text/plain":
text_loader = TextLoader(temp_file_path)
documents = text_loader.load()
    else:
        st.error("Unsupported file type.")
        os.remove(temp_file_path)  # clean up the temporary file before returning
        return None
# Clean up the temporary file
os.remove(temp_file_path)
return documents
def answer_with_retrieval(prompt, retriever):
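    """Retrieve the most relevant chunks for the prompt and prepend them as context before querying Groq."""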
context = retriever.get_relevant_documents(prompt)
context_text = " ".join([doc.page_content for doc in context])
combined_prompt = f"{context_text}\n\n{prompt}"
return get_groq_response(combined_prompt)
# Streamlit UI
st.title("Upload and Interact with File Content")
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
if uploaded_file:
# Process the uploaded file
documents = process_file(uploaded_file)
if documents:
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
chunked_documents = text_splitter.split_documents(documents)
# Ensure the chunked documents list is not empty
if not chunked_documents:
st.error("No content extracted from the document.")
else:
            # Generate embeddings with the BAAI/bge-base-en-v1.5 model via the Hugging Face
            # Inference API. The token is assumed to come from the HF_TOKEN environment variable.
            HF_token = os.environ["HF_TOKEN"]
            embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_token, model_name="BAAI/bge-base-en-v1.5")
# Debug: Check the length of chunked_documents
st.write(f"Number of document chunks: {len(chunked_documents)}")
# Attempt to create vector store
try:
vectorstore = Chroma.from_documents(chunked_documents, embeddings)
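                # Use maximal marginal relevance (MMR) search and return the top 3 chunks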
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
# User query
query = st.text_input("Enter your query:")
if query:
response = answer_with_retrieval(query, retriever)
st.write("### Response")
st.write(response)
except IndexError as ie:
st.error(f"IndexError during vector store creation: {str(ie)}")
except Exception as e:
st.error(f"Error creating vector store or generating embeddings: {str(e)}")