# Page header captured alongside this file (not part of the program):
#   Spaces: Ahmed12322 / Pdf_reader
#   Status: Sleeping
#   File size: 5,776 Bytes
#   927fe6a

import streamlit as st
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from groq import Groq
import os
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Seed the per-session slots for the vector index and its source chunks so
# that later code can read both keys without checking for their existence.
for _state_key, _initial_value in (("faiss_index", None), ("chunks", [])):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Resolve the Groq API key from the environment first, then from Streamlit
# secrets. A real key must never be embedded in source: any key that was
# previously committed here should be treated as compromised and rotated.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    try:
        GROQ_API_KEY = st.secrets.get("GROQ_API_KEY")
    except Exception:
        # Accessing st.secrets raises when no secrets file is configured; the
        # exact exception type differs between Streamlit versions, so treat
        # any failure here as "no key available".
        GROQ_API_KEY = None

# Without a hard-coded fallback this guard is now reachable and meaningful.
if not GROQ_API_KEY:
    st.error("⚠️ GROQ_API_KEY is missing! Please set it in your environment variables or secrets.toml file.")
    st.stop()

# Load the embedding model once per server process. Streamlit re-executes this
# script on every user interaction, so an uncached load would re-instantiate
# the model on each rerun.
@st.cache_resource(show_spinner=False)  # no spinner: nothing may render before st.set_page_config
def _load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Return a SentenceTransformer for ``model_name`` (cached across reruns)."""
    return SentenceTransformer(model_name)


try:
    embedding_model = _load_embedding_model()
except Exception as e:
    st.error(f"❌ Failed to load embedding model: {str(e)}")
    st.stop()

# Build the Groq API client; if construction fails, surface the reason in the
# UI and halt this script run.
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as exc:
    st.error(f"❌ Failed to initialize Groq client: {exc}")
    st.stop()

# Function to extract text from PDF with error handling
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        uploaded_file: A path or binary file-like object accepted by
            ``pypdf.PdfReader``.

    Returns:
        The concatenated page text, skipping pages that yield no text.
        Returns ``""`` when nothing could be extracted or when reading the
        PDF fails (the failure is reported through ``st.error``).
    """
    try:
        reader = pypdf.PdfReader(uploaded_file)
        # Extract each page exactly once; the text extraction is the costly
        # step, so it must not be repeated just to test for emptiness.
        page_texts = (page.extract_text() for page in reader.pages)
        # join() over zero items already yields "", so no special case needed.
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        st.error(f"❌ Error extracting text from PDF: {str(e)}")
        return ""

# Function to create text chunks
def create_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into chunks using RecursiveCharacterTextSplitter.

    Args:
        text: The full document text.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text``.
    """
    # Separators handed to the splitter: paragraph break, line break, space,
    # and finally the empty string.
    split_points = ["\n\n", "\n", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        separators=split_points,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_text(text)
    return pieces

# Function to build an in-memory FAISS index from text chunks
def create_faiss_index(chunks):
    """Embed ``chunks`` and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: The text chunks to embed with the module-level
            ``embedding_model``.

    Returns:
        ``(index, chunks)`` on success. On any failure the error is shown via
        ``st.error`` and ``(None, [])`` is returned instead.
    """
    try:
        vectors = embedding_model.encode(chunks, convert_to_numpy=True)
        # The index width is taken from the second axis of the embeddings.
        vector_index = faiss.IndexFlatL2(vectors.shape[1])
        vector_index.add(vectors)
    except Exception as exc:
        st.error(f"❌ Error creating FAISS index: {exc}")
        return None, []
    return vector_index, chunks

# Function to search FAISS
def search_faiss(query, index, chunks, top_k=2):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: The user's question.
        index: A FAISS index built over the embeddings of ``chunks``, or
            ``None`` when no document has been indexed yet.
        chunks: The text chunks, positionally aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order FAISS returned them. Returns an
        empty list when there is no index, no chunks, ``top_k`` is below 1, or
        the search fails (the failure is reported through ``st.error``).
    """
    if index is None or not chunks:
        return []

    # Never request more neighbours than there are chunks, and treat a
    # non-positive request as "nothing to retrieve".
    k = min(top_k, len(chunks))
    if k < 1:
        return []

    try:
        query_embedding = embedding_model.encode([query], convert_to_numpy=True)
        _distances, indices = index.search(query_embedding, k)
        # FAISS fills unavailable result slots with -1. A bare upper-bound
        # check would let -1 through and silently return chunks[-1], so both
        # bounds are checked.
        return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    except Exception as e:
        st.error(f"❌ Search error: {str(e)}")
        return []

# Function to query Groq with enhanced prompt
def query_groq(query, context=None, model="llama-3.3-70b-versatile"):
    """Ask the Groq chat-completions API to answer ``query`` from ``context``.

    Args:
        query: The user's question.
        context: Retrieved text to ground the answer; when falsy, a
            placeholder line is put in the prompt instead.
        model: Groq model identifier to use. The previous hard-coded value,
            "llama-3-70b-8192", does not appear to be a valid Groq model id, so
            requests would be rejected. NOTE(review): confirm the default
            against Groq's current model list before deploying.

    Returns:
        The model's reply text, or a string starting with
        ``"Error querying Groq: "`` if anything goes wrong. This function
        does not raise; callers display the returned string either way.
    """
    try:
        prompt = f"""Use the following context to answer the question. 
        If you don't know the answer, say you don't know. Don't make up answers.
        
        Context: {context if context else 'No specific context provided'}
        
        Question: {query}
        
        Answer:"""

        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.3,
            max_tokens=1024
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error querying Groq: {str(e)}"

# Page-level configuration and the main heading
st.set_page_config(page_title="RAG Chatbot", page_icon="🤖", layout="wide")
st.title("📄 RAG-Based Chatbot with FAISS & Groq")

# Sidebar controls for retrieval depth and chunking
st.sidebar.header("Settings")
top_k = st.sidebar.slider(
    "Number of chunks to retrieve", min_value=1, max_value=5, value=2
)
chunk_size = st.sidebar.slider(
    "Chunk size (characters)", min_value=200, max_value=1000, value=500
)
chunk_overlap = st.sidebar.slider(
    "Chunk overlap (characters)", min_value=0, max_value=200, value=100
)

# PDF upload widget; the value is None until the user provides a file
uploaded_file = st.file_uploader("📤 Upload a PDF file", type="pdf")

# Turn the uploaded PDF into a searchable index (once per document/settings).
if uploaded_file:
    # Streamlit reruns this script on every interaction, including each chat
    # message. Key the processed result on the file and the chunking settings
    # so the PDF is only re-extracted and re-embedded when one of them changes.
    # (Name + size is a cheap identity; a different file with identical name
    # and size would not be detected.)
    document_key = (uploaded_file.name, uploaded_file.size, chunk_size, chunk_overlap)

    if st.session_state.get("indexed_document_key") != document_key:
        # Clear the previous document first so that a failed run cannot leave
        # stale chunks answering questions about the new file.
        st.session_state["faiss_index"] = None
        st.session_state["chunks"] = []
        st.session_state["indexed_document_key"] = None

        with st.spinner("🔄 Processing PDF..."):
            text = extract_text_from_pdf(uploaded_file)
            if text.strip():
                chunks = create_chunks(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

                # Create FAISS index
                index, chunks = create_faiss_index(chunks)

                # Only record success when an index was actually built;
                # create_faiss_index reports its own error and returns None.
                if index is not None:
                    st.session_state["faiss_index"] = index
                    st.session_state["chunks"] = chunks
                    st.session_state["indexed_document_key"] = document_key
            else:
                st.error("❌ No text found in the uploaded PDF.")

    if st.session_state.get("indexed_document_key") == document_key:
        st.success(f"✅ PDF processed successfully! Created {len(st.session_state['chunks'])} chunks.")

# Conversation transcript: create it on the first run of a session
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Replay every stored turn so the history survives script reruns
for entry in st.session_state["messages"]:
    st.chat_message(entry["role"]).markdown(entry["content"])

# Handle a newly submitted question: record it, retrieve context, ask the
# model, then record and render the reply.
prompt = st.chat_input("💬 Ask me something about the document:")
if prompt:
    user_turn = {"role": "user", "content": prompt}
    st.session_state.messages.append(user_turn)
    st.chat_message("user").markdown(prompt)

    with st.spinner("🔎 Retrieving response..."):
        retrieved_text = search_faiss(
            prompt,
            st.session_state["faiss_index"],
            st.session_state["chunks"],
            top_k=top_k,
        )
        if retrieved_text:
            context = "\n".join(retrieved_text)
        else:
            context = "No relevant context found."

        response = query_groq(prompt, context)

        assistant_turn = {"role": "assistant", "content": response}
        st.session_state.messages.append(assistant_turn)
        st.chat_message("assistant").markdown(response)