# app.py — RAG-based document Q&A (Streamlit + SentenceTransformers + FAISS + Groq)
import os
import pandas as pd
import PyPDF2
import docx
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
import streamlit as st
# --- Global setup -----------------------------------------------------------

# Groq API client. The key is read from the environment instead of being
# hard-coded: a key committed to source is leaked to anyone who can read the
# file. NOTE(review): the previously hard-coded key is compromised — revoke it.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# SentenceTransformer model used to embed both documents and queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS index for nearest-neighbour retrieval over sentence embeddings.
dimension = 384  # embedding size of 'all-MiniLM-L6-v2'
index = faiss.IndexFlatL2(dimension)

# Sentences stored in the same order as their embeddings were added to the
# index, so search hits (row ids) map back to their text.
document_texts = []
# Helper function: Extract text from different file types
def extract_text_from_file(file):
    """Extract plain text from an uploaded file.

    Supported formats: ``.pdf``, ``.csv``, ``.xlsx``/``.xls``, ``.txt``,
    ``.docx`` (matched case-insensitively).

    Args:
        file: A file-like object with a ``name`` attribute (e.g. a
            Streamlit ``UploadedFile``).

    Returns:
        The extracted text as a string, or ``None`` if the extension is
        not supported.
    """
    name = file.name.lower()  # accept .PDF, .Txt, etc.
    if name.endswith(".pdf"):
        pdf_reader = PyPDF2.PdfReader(file)
        # extract_text() may return None for image-only pages; coalesce
        # to "" so the join does not raise TypeError.
        return "".join((page.extract_text() or "") for page in pdf_reader.pages)
    if name.endswith(".csv"):
        df = pd.read_csv(file)
        return "\n".join(" ".join(map(str, row)) for row in df.values)
    if name.endswith((".xlsx", ".xls")):
        df = pd.read_excel(file)
        return "\n".join(" ".join(map(str, row)) for row in df.values)
    if name.endswith(".txt"):
        return file.read().decode("utf-8")
    if name.endswith(".docx"):
        doc = docx.Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    return None  # unsupported extension; caller shows an error
# Add document embeddings to FAISS
def add_to_index(text, index, document_texts):
    """Embed each non-blank line of *text* and add the vectors to the index.

    Args:
        text: Raw document text; split on newlines into "sentences".
        index: A FAISS index whose dimensionality matches the embedder.
        document_texts: List kept in sync with the index — the i-th entry
            is the text behind the index's i-th vector.
    """
    # Drop blank/whitespace-only lines: they carry no content but would
    # otherwise be embedded and could come back as empty retrieval hits.
    sentences = [line for line in text.split("\n") if line.strip()]
    if not sentences:
        return  # nothing to embed (e.g. an empty or whitespace-only file)
    embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
    index.add(embeddings)
    document_texts.extend(sentences)
# Perform RAG Query
def rag_query(query, index, document_texts, top_k=3):
    """Answer *query* with retrieval-augmented generation.

    Embeds the query, retrieves up to ``top_k`` nearest sentences from the
    FAISS index, and asks the Groq chat model to answer using them as
    context.

    Args:
        query: The user's question.
        index: FAISS index holding document embeddings.
        document_texts: Sentences aligned with the index's vectors.
        top_k: Maximum number of nearest sentences to retrieve.

    Returns:
        The model's answer as a string.
    """
    # Embed the query and retrieve the closest stored sentences.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads `indices` with -1 when the index holds fewer than top_k
    # vectors; filter those out so we never index document_texts with -1
    # (which would silently pull in the *last* sentence).
    retrieved_context = " ".join(
        document_texts[idx] for idx in indices[0] if idx >= 0
    )
    # Context travels inside the single user message.
    prompt = f"Context: {retrieved_context}\n\nQuestion: {query}"
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": prompt}
        ],
        model="gemma2-9b-it",
    )
    return chat_completion.choices[0].message.content
# Streamlit UI
st.title("RAG-Based Document Q&A")
st.write("Upload your documents and ask questions based on the content.")

uploaded_files = st.file_uploader(
    "Upload PDFs, CSVs, Excel, or Text files",
    type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
    accept_multiple_files=True,
)

if uploaded_files:
    for file in uploaded_files:
        with st.spinner(f"Processing {file.name}..."):
            text = extract_text_from_file(file)
            if text:
                add_to_index(text, index, document_texts)
                st.success(f"Processed {file.name}")
            else:
                st.error(f"Could not process {file.name}. Unsupported file format.")

query = st.text_input("Enter your question:")
if query:
    # Guard against querying an empty index: FAISS would return no valid
    # neighbours and the answer would have no document context.
    if index.ntotal == 0:
        st.warning("Please upload and process at least one document first.")
    else:
        with st.spinner("Generating response..."):
            response = rag_query(query, index, document_texts)
        st.write("### Answer:")
        st.write(response)