Spaces:

MohammadYaseen
/

Rag-Sample

Sleeping

App Files Files Community

Rag-Sample / app.py

MohammadYaseen

Create app.py

4184e11 verified about 1 year ago

raw

history blame contribute delete

7.54 kB

	import os
	import pandas as pd
	import PyPDF2
	import docx
	from sentence_transformers import SentenceTransformer
	import faiss
	import streamlit as st
	import time
	from groq import Groq
	import re

	# Initialize embedding model
	# Sentence-embedding model used for every chunk added to the FAISS index.
	# NOTE(review): this runs at module level; Streamlit presumably re-executes
	# the script on each interaction, which would reload the model every time --
	# confirm and consider caching.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# FAISS setup
	# Flat L2 index plus a parallel list: add_to_index() appends embeddings to
	# `index` and the matching chunk strings to `document_texts` in the same
	# order, so position i in the list corresponds to vector i in the index.
	dimension = 384 # Dimension of 'all-MiniLM-L6-v2' embeddings
	index = faiss.IndexFlatL2(dimension)
	document_texts = [] # Store text corresponding to embeddings

	# Constants for file handling
	# Per-file size threshold (checked against the upload's size in bytes) and
	# the maximum number of files accepted in one upload.
	MAX_FILE_SIZE_MB = 100 # 100 MB
	MAX_NUM_FILES = 5
	MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

	# Set up the Groq API client.
	# The key is read from the GROQ_API_KEY environment variable so that no
	# credential lives in source control. The key that used to be hard-coded
	# here has been published and must be revoked and rotated.
	api_key = os.environ.get("GROQ_API_KEY")
	if not api_key:
	    # Fail early with an actionable message instead of a failed request later.
	    raise RuntimeError(
	        "GROQ_API_KEY is not set. Define it as an environment variable "
	        "(or as a secret of the hosting platform) before starting the app."
	    )
	client = Groq(api_key=api_key)

	# Function to get human-readable file size
	def get_human_readable_size(size_in_bytes):
	    """Format a byte count as a short human-readable string.

	    Sizes below 1024 are reported verbatim as "Bytes". Larger sizes are
	    scaled by powers of 1024 and shown with two decimals as KB, MB or GB;
	    GB is the largest unit, so anything from 1024**3 upwards is in GB.

	    Args:
	        size_in_bytes: Size to format, in bytes.

	    Returns:
	        A string such as "512 Bytes", "1.50 KB" or "2.00 GB".
	    """
	    if size_in_bytes < 1024:
	        return f"{size_in_bytes} Bytes"
	    # Pick the first unit whose upper bound is above the value.
	    for exponent, unit in ((1, "KB"), (2, "MB")):
	        if size_in_bytes < 1024 ** (exponent + 1):
	            return f"{size_in_bytes / 1024 ** exponent:.2f} {unit}"
	    return f"{size_in_bytes / 1024 ** 3:.2f} GB"

	# Function to extract text from uploaded files
	def extract_text_from_file(file):
	    """Extract plain text from an uploaded file, chosen by file extension.

	    Supported extensions (matched case-insensitively): .pdf, .csv, .xlsx,
	    .xls, .txt and .docx.

	    Args:
	        file: A binary file-like object with a ``name`` attribute, such as
	            the objects iterated over from the Streamlit uploader.

	    Returns:
	        The extracted text (possibly an empty string when the document
	        contains no extractable text), or ``None`` when the extension is not
	        supported.
	    """
	    # Match on a lower-cased name so "REPORT.PDF" is handled like "report.pdf".
	    name = file.name.lower()

	    if name.endswith(".pdf"):
	        pdf_reader = PyPDF2.PdfReader(file)
	        # extract_text() can yield nothing for image-only pages; treat that as
	        # an empty page. Pages are joined with a newline so the last word of
	        # one page is not fused with the first word of the next.
	        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

	    if name.endswith((".csv", ".xlsx", ".xls")):
	        df = pd.read_csv(file) if name.endswith(".csv") else pd.read_excel(file)
	        # One line per row, cells separated by single spaces.
	        return "\n".join(" ".join(map(str, row)) for row in df.values)

	    if name.endswith(".txt"):
	        # Replace undecodable bytes instead of failing on non-UTF-8 uploads.
	        return file.read().decode("utf-8", errors="replace")

	    if name.endswith(".docx"):
	        doc = docx.Document(file)
	        return "\n".join(p.text for p in doc.paragraphs)

	    return None

	# Function to split large text into smaller chunks
	def split_text_into_chunks(text, max_chunk_size=500):
	    """Split text into chunks of whole sentences of limited length.

	    Sentences are delimited by ". ". Consecutive sentences are packed into a
	    chunk while the joined chunk, separators included, stays within
	    ``max_chunk_size`` characters. A single sentence longer than the limit is
	    kept intact as its own oversized chunk rather than being cut mid-sentence.

	    Args:
	        text: The text to split.
	        max_chunk_size: Maximum number of characters per chunk.

	    Returns:
	        A list of non-blank chunk strings; empty when ``text`` is empty or
	        contains only whitespace.
	    """
	    separator = ". "
	    chunks = []
	    current = []
	    current_size = 0

	    def flush():
	        # Never emit blank chunks: they carry no content to embed.
	        joined = separator.join(current)
	        if joined.strip():
	            chunks.append(joined)

	    for sentence in text.split(separator):
	        # Length this sentence adds to the joined chunk, including the
	        # separator that will precede it when the chunk is not empty.
	        added = len(sentence) + (len(separator) if current else 0)
	        if current and current_size + added > max_chunk_size:
	            flush()
	            current = [sentence]
	            current_size = len(sentence)
	        else:
	            current.append(sentence)
	            current_size += added

	    if current:
	        flush()
	    return chunks

	# Function to add document text to FAISS index
	def add_to_index(text, index, document_texts):
	    """Chunk a document, embed the chunks and append them to the index.

	    The chunk strings are appended to ``document_texts`` in the same order as
	    their embeddings are added to ``index``, keeping the two aligned.

	    Args:
	        text: Full text of the document to index.
	        index: Vector index exposing ``add(embeddings)``.
	        document_texts: List that receives the chunk strings; mutated in place.
	    """
	    # Blank chunks would only add meaningless vectors to the index.
	    chunks = [chunk for chunk in split_text_into_chunks(text) if chunk.strip()]
	    if not chunks:
	        # Nothing to embed; avoid handing an empty batch to the encoder/index.
	        return
	    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
	    index.add(embeddings)
	    document_texts.extend(chunks)

	# Function to generate pre-questions based on the document
	def suggest_questions(text):
	    """Return a fixed list of starter questions chosen by document length.

	    The choice depends only on the whitespace-separated word count of
	    ``text``: fewer than 200 words selects the short-document questions,
	    otherwise the long-document questions are returned.

	    Args:
	        text: The document text the questions are suggested for.

	    Returns:
	        A list of three question strings.
	    """
	    if len(text.split()) < 200:
	        return [
	            "Can you summarize the main points?",
	            "What is the main argument or conclusion?",
	            "What is the purpose of this document?",
	        ]
	    return [
	        "What are the key takeaways from this document?",
	        "Can you provide a summary of the main sections?",
	        "What are the major findings or conclusions?",
	    ]

	# Function to generate answer using Groq
	def generate_answer_with_groq(question, context, model="gemma2-9b-it"):
	    """Ask the Groq chat-completion API to answer a question from a context.

	    The context and the question are sent together as a single user message.

	    Args:
	        question: The user's question.
	        context: Text the model should base its answer on.
	        model: Identifier of the Groq-hosted model to use. Defaults to the
	            model this app has always used, so existing callers are unchanged.

	    Returns:
	        The content of the first choice in the completion response.
	    """
	    chat_completion = client.chat.completions.create(
	        messages=[{"role": "user", "content": f"Context: {context}\nQuestion: {question}"}],
	        model=model,
	    )
	    return chat_completion.choices[0].message.content

	# Function to validate user input (basic check for valid text)
	def is_valid_input(query):
	    """Heuristically check that a query looks like ordinary written text.

	    The query is stripped first; an empty result is invalid. Otherwise every
	    character must be a letter, digit or underscore (in any script, so
	    accented and non-Latin questions are accepted), whitespace, or one of a
	    set of common punctuation marks. Anything else -- for example angle
	    brackets or braces -- makes the query invalid.

	    Args:
	        query: The raw text entered by the user.

	    Returns:
	        True if the query passes the check, False otherwise.
	    """
	    query = query.strip()
	    if not query:
	        return False  # Empty input is invalid
	    # \w is Unicode-aware for str patterns, so "résumé" passes. fullmatch
	    # anchors both ends without the trailing-newline leniency of "$".
	    pattern = r"[\w\s.,!?'\"():;/%&+\-]+"
	    return re.fullmatch(pattern, query) is not None

	# Handling user feedback
	def handle_feedback(feedback):
	    """Acknowledge feedback entered by the user.

	    Writes a thank-you message to the page when ``feedback`` contains
	    something other than whitespace; otherwise does nothing.

	    Args:
	        feedback: The feedback text, possibly empty.
	    """
	    # Whitespace-only input is not feedback worth acknowledging.
	    if feedback and feedback.strip():
	        st.write("Thank you for your feedback!")

	# Streamlit UI
	# Page title plus a sidebar with usage tips. The numbers quoted in the tips
	# text (100 MB, 5 files) are written out by hand and mirror
	# MAX_FILE_SIZE_MB / MAX_NUM_FILES; keep them in sync when changing those.
	st.title("Enhanced Document Q&A with RAG")
	st.sidebar.title("Tips for Better Experience")
	st.sidebar.write("""
	1. Maximum file size: 100 MB per file.
	2. You can upload up to 5 files at a time.
	3. Larger files may take longer to process.
	4. Please break large files into smaller chunks if necessary.
	5. Use the pre-generated questions to guide your inquiry.
	""")

	# Free-text feedback box; its value is passed to handle_feedback() further down.
	feedback = st.sidebar.text_area("Provide feedback to improve your experience:")

	# File uploader
	# Multiple files are accepted; the result is length-checked and iterated
	# below. The allowed extensions match the branches of
	# extract_text_from_file().
	uploaded_files = st.file_uploader(
	"Upload documents (PDF, CSV, Excel, TXT, DOCX). Max size: 100 MB each.",
	type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
	accept_multiple_files=True,
	)

	if uploaded_files:
	    if len(uploaded_files) > MAX_NUM_FILES:
	        st.error(f"Maximum {MAX_NUM_FILES} files can be uploaded at a time.")
	    else:
	        for file in uploaded_files:
	            file_size = file.size
	            human_readable_size = get_human_readable_size(file_size)
	            st.write(f"File: {file.name} | Size: {human_readable_size}")
	            if file_size > MAX_FILE_SIZE_BYTES:
	                # Oversized files are still processed; the user is only warned.
	                st.warning(
	                    f"File '{file.name}' exceeds the {MAX_FILE_SIZE_MB} MB limit. "
	                    "We will automatically break this file into smaller chunks."
	                )
	            with st.spinner(f"Processing {file.name}..."):
	                try:
	                    text = extract_text_from_file(file)
	                except Exception as exc:
	                    # UI boundary: a corrupt or unreadable file must not take
	                    # down the whole page or block the remaining uploads.
	                    st.error(f"Could not process {file.name}: {exc}")
	                    continue
	                if text is None:
	                    st.error(f"Could not process {file.name}. Unsupported format.")
	                elif not text.strip():
	                    # Supported format but nothing to index (e.g. a scanned PDF).
	                    st.warning(f"No extractable text found in {file.name}.")
	                else:
	                    # add_to_index() does the chunking itself; pass the text
	                    # through unchanged so sentence boundaries are preserved.
	                    add_to_index(text, index, document_texts)
	                    st.success(f"Processed {file.name}")
	else:
	    st.warning("No documents uploaded yet. Please upload documents before asking questions.")

	# Acknowledge sidebar feedback, if any was entered.
	if feedback:
	    handle_feedback(feedback)

	# Input for question
	query = st.text_input("Enter your question:")

	# Answer the question from the most relevant indexed chunks.
	if query:
	    if not document_texts:
	        st.warning("Please upload and process documents before asking questions.")
	    elif not is_valid_input(query):
	        st.error("Please ask a relevant question.")
	    else:
	        response = None
	        with st.spinner("Generating response..."):
	            # Retrieval step: embed the question and fetch its nearest chunks
	            # from the FAISS index, instead of sending every indexed chunk
	            # (which ignores the index and can overflow the model's context).
	            top_k = min(5, len(document_texts))
	            query_embedding = embedding_model.encode([query], convert_to_numpy=True)
	            _, neighbor_ids = index.search(query_embedding, top_k)
	            # FAISS pads missing neighbours with -1; keep valid positions only.
	            context_chunks = [
	                document_texts[i]
	                for i in neighbor_ids[0]
	                if 0 <= i < len(document_texts)
	            ]
	            try:
	                response = generate_answer_with_groq(query, "\n\n".join(context_chunks))
	            except Exception as exc:
	                # UI boundary: report API/network failures instead of crashing.
	                st.error(f"Could not generate an answer: {exc}")
	        if response is not None:
	            st.write("### Answer:")
	            st.write(response)

	        st.write("### Suggested Questions:")
	        questions = suggest_questions(" ".join(document_texts))  # Based on full document content
	        for question in questions:
	            st.write(f"- {question}")

	# Instructions and reminders
	if not uploaded_files:
	    st.info("You haven't uploaded any documents yet. Please upload documents to start.")
	else:
	    st.info("Enter a question to ask about the uploaded documents.")