# Pdf_reader / app.py
# Author: Ahmed12322 (Hugging Face Space; commit 927fe6a, verified)
import streamlit as st
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from groq import Groq
import os
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
# --- Session-state defaults --------------------------------------------------
# Make sure the FAISS index and chunk list exist before any rerun touches them.
st.session_state.setdefault("faiss_index", None)
st.session_state.setdefault("chunks", [])
# --- Groq API key ------------------------------------------------------------
# Read the key from the environment first, then from Streamlit secrets.
# SECURITY: the previous version embedded a live API key in source as the
# fallback value; that key must be treated as compromised and revoked.
# Never commit secrets — use env vars or .streamlit/secrets.toml.
GROQ_API_KEY = os.getenv("GROQ_API_KEY") or st.secrets.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    st.error("⚠️ GROQ_API_KEY is missing! Please set it in your environment variables or secrets.toml file.")
    st.stop()
# --- Embedding model ---------------------------------------------------------
# Load the sentence-transformer used to embed both document chunks and queries.
# If the model cannot be initialised, surface the error and halt the app.
try:
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
except Exception as e:
    st.error(f"❌ Failed to load embedding model: {str(e)}")
    st.stop()
# --- Groq client -------------------------------------------------------------
# Construct the chat-completion client; halt the app if initialisation fails.
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    st.error(f"❌ Failed to initialize Groq client: {str(e)}")
    st.stop()
# Function to extract text from PDF with error handling
def extract_text_from_pdf(uploaded_file):
    """Extract all page text from an uploaded PDF.

    Args:
        uploaded_file: File-like object (Streamlit ``UploadedFile``) holding a PDF.

    Returns:
        The newline-joined text of all pages with extractable text, or ""
        when nothing could be extracted or reading failed (the error is
        shown in the Streamlit UI rather than raised).
    """
    try:
        reader = pypdf.PdfReader(uploaded_file)
        # Call extract_text() once per page — the original called it twice
        # (once in the filter, once in the value), doubling the parse work.
        extracted_text = [text for page in reader.pages if (text := page.extract_text())]
        return "\n".join(extracted_text) if extracted_text else ""
    except Exception as e:
        st.error(f"❌ Error extracting text from PDF: {str(e)}")
        return ""
# Function to create text chunks
def create_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping character chunks for embedding.

    Args:
        text: Full document text.
        chunk_size: Target chunk length in characters.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Prefer paragraph, then line, then word boundaries before
        # falling back to a hard character split.
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_text(text)
# Function to create and save FAISS index
def create_faiss_index(chunks):
    """Embed *chunks* and build an exhaustive (flat) L2 FAISS index over them.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        ``(index, chunks)`` on success, or ``(None, [])`` when embedding or
        indexing fails (the error is shown in the Streamlit UI).
    """
    try:
        vectors = embedding_model.encode(chunks, convert_to_numpy=True)
        # IndexFlatL2 performs exact nearest-neighbour search — fine for
        # the small per-document indexes this app builds.
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        return index, chunks
    except Exception as e:
        st.error(f"❌ Error creating FAISS index: {str(e)}")
        return None, []
# Function to search FAISS
def search_faiss(query, index, chunks, top_k=2):
    """Return up to *top_k* chunks most similar to *query*.

    Args:
        query: User question to embed and search with.
        index: FAISS index built over *chunks*, or None if no PDF is loaded.
        chunks: Text chunks aligned row-for-row with the index.
        top_k: Number of nearest neighbours to retrieve.

    Returns:
        A list of matching chunk strings (possibly empty).
    """
    if index is None or not chunks:
        return []
    try:
        query_embedding = embedding_model.encode([query], convert_to_numpy=True)
        distances, indices = index.search(query_embedding, top_k)
        # FAISS pads missing neighbours with -1. The original filter
        # `i < len(chunks)` let -1 through, silently returning the *last*
        # chunk via negative indexing; require a non-negative index.
        return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    except Exception as e:
        st.error(f"❌ Search error: {str(e)}")
        return []
# Function to query Groq with enhanced prompt
def query_groq(query, context=None):
    """Ask the Groq LLM *query*, optionally grounded in retrieved *context*.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer, or None.

    Returns:
        The model's answer string, or an error-message string on failure
        (callers render the return value either way).
    """
    try:
        prompt = f"""Use the following context to answer the question.
If you don't know the answer, say you don't know. Don't make up answers.
Context: {context if context else 'No specific context provided'}
Question: {query}
Answer:"""
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            # "llama-3-70b-8192" is not a valid Groq model id (the request
            # fails with a model-not-found error); the published id has no
            # dash after "llama" — verify against Groq's current model list.
            model="llama3-70b-8192",
            temperature=0.3,  # low temperature: favour grounded answers
            max_tokens=1024
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error querying Groq: {str(e)}"
# Streamlit UI
# --- Page chrome & sidebar settings ------------------------------------------
st.set_page_config(page_title="RAG Chatbot", page_icon="πŸ€–", layout="wide")
st.title("πŸ“„ RAG-Based Chatbot with FAISS & Groq")

# Retrieval/chunking knobs, adjustable at runtime from the sidebar.
with st.sidebar:
    st.header("Settings")
    top_k = st.slider("Number of chunks to retrieve", 1, 5, 2)
    chunk_size = st.slider("Chunk size (characters)", 200, 1000, 500)
    chunk_overlap = st.slider("Chunk overlap (characters)", 0, 200, 100)
# Upload PDF
# --- PDF upload & indexing ---------------------------------------------------
uploaded_file = st.file_uploader("πŸ“€ Upload a PDF file", type="pdf")
if uploaded_file:
    with st.spinner("πŸ”„ Processing PDF..."):
        text = extract_text_from_pdf(uploaded_file)
        if not text.strip():
            st.error("❌ No text found in the uploaded PDF.")
        else:
            chunks = create_chunks(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            index, chunks = create_faiss_index(chunks)
            # Persist across Streamlit reruns so chat queries can search
            # the indexed document.
            st.session_state["faiss_index"] = index
            st.session_state["chunks"] = chunks
            st.success(f"βœ… PDF processed successfully! Created {len(chunks)} chunks.")
# Chat interface
# --- Chat interface ----------------------------------------------------------
st.session_state.setdefault("messages", [])

# Replay the conversation so far (Streamlit reruns the script per interaction).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Handle a new question: record it, retrieve context, ask the LLM, record reply.
prompt = st.chat_input("πŸ’¬ Ask me something about the document:")
if prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.spinner("πŸ”Ž Retrieving response..."):
        retrieved_text = search_faiss(
            prompt,
            st.session_state["faiss_index"],
            st.session_state["chunks"],
            top_k=top_k,
        )
        context = "\n".join(retrieved_text) if retrieved_text else "No relevant context found."
        response = query_groq(prompt, context)

    st.session_state.messages.append({"role": "assistant", "content": response})
    with st.chat_message("assistant"):
        st.markdown(response)