Spaces:

shamilcoded
/

RagBaseApp

Sleeping

File size: 6,875 Bytes

1f8c160
 
69d986f
1f8c160
 
 
e6aac09
1f8c160
c04de34
7a29a17
b1f88af
 
 
 
 
 
 
 
 
3e42beb
a23ed3c
3e42beb
a23ed3c
3e42beb
 
a23ed3c
e6aac09
83a3779
aa45565
c04de34
a23ed3c
2ea0aa5
 
 
 
 
 
 
 
 
 
 
 
a23ed3c
c04de34
 
 
 
a23ed3c
c04de34
 
 
6b5eb7c
 
c04de34
83a3779
 
 
 
 
b1f88af
 
 
 
 
a23ed3c
458a679
a23ed3c
1f8c160
a23ed3c
1f8c160
a23ed3c
 
1f8c160
458a679
1f8c160
 
458a679
8d812ae
a23ed3c
1f8c160
458a679
1f8c160
 
 
458a679
a23ed3c
458a679
 
1f8c160
a23ed3c
c04de34
 
 
 
 
1f8c160
a23ed3c
1f8c160
 
3e42beb
 
 
a23ed3c
69d986f
458a679
69d986f
 
 
 
 
1f8c160
a23ed3c
1f8c160
458a679
1f8c160
 
 
 
 
a23ed3c
458a679
 
a23ed3c
3e42beb
69d986f
458a679
a23ed3c
69d986f
a23ed3c
 
69d986f
1f8c160
a23ed3c
1f8c160
a23ed3c
 
1f8c160
458a679
a23ed3c
c04de34
 
 
 
 
1f8c160
a23ed3c
67c0653
1f8c160
8193c02
 
 
 
a23ed3c
6b5eb7c
6420211
1f8c160
a23ed3c
6d92baa
 
 
a23ed3c
6d92baa
a23ed3c
 
6d92baa
 
a23ed3c
6d92baa
 
 
a23ed3c
2ea0aa5
6d92baa
2ea0aa5
 
 
a23ed3c
2ea0aa5
 
a23ed3c
6420211
2ea0aa5
a23ed3c
6420211
e6aac09
a23ed3c
 
 
 
 
e6aac09
a23ed3c
7f3b0ab
6420211
a23ed3c
6420211
 
aa45565
a23ed3c
6b5eb7c

import os
import streamlit as st
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from groq import Groq

# Inject a global style sheet into the page: a full-window background image
# plus colour overrides for headings, body text, buttons, the query input and
# the fixed footer bar.
# The CSS lives inside an f-string, so literal CSS braces are doubled ({{ }})
# and only {background_image_url} is interpolated.
background_image_url = "https://cdn.pixabay.com/photo/2016/06/02/02/33/triangles-1430105_1280.png"
st.markdown(
    f"""
    <style>
    .stApp {{
        background-image: url("{background_image_url}");
        background-size: cover;
        background-position: center center;
        background-repeat: no-repeat;
    }}
    
    /* Ensure title is black */
    h1 {{
        color: black !important;  /* Force title color to black */
    }}
    
    /* Set footer text color to white */
    h2, h3, h4, h5, h6, p {{
        color: white;  /* Set all text color to white */
    }}
    
    /* Set footer styling */
    .footer {{
        position: fixed;
        bottom: 0;
        left: 0;
        right: 0;
        background-color: rgba(0, 0, 0, 0.6);
        color: white;
        text-align: center;
        padding: 10px 0;
        font-size: 14px;
    }}
    
    /* Set processing button color to green */
    .stButton button {{
        background-color: green;
        color: white;
    }}
    /* Set query input block background color to white */
    .stTextInput input {{
        background-color: white;
        color: black;
        border-radius: 5px;
        padding: 10px;
    }}
    
    /* Set all output text (retrieved chunks and responses) to white */
    .stMarkdown, .stTextInput, .stText, .stCode, .stJson, .stFileUploader, .stError, .stSuccess {{
        color: white !important;
    }}
    </style>
    """, 
    # The payload is a raw <style> element, so HTML rendering must be enabled.
    unsafe_allow_html=True
)

# Groq API key, read from the Space secret named HUGGINGFACE_KEY. The
# conventional GROQ_API_KEY variable is accepted as a fallback so a deployment
# that only sets that one keeps working.
HUGGINGFACE_KEY = os.getenv("HUGGINGFACE_KEY") or os.getenv("GROQ_API_KEY")

if not HUGGINGFACE_KEY:
    st.error("Groq API key not found. Please set it in Hugging Face Secrets.")
    # Without a key the client below cannot be constructed; halt this run so
    # the user sees the message above instead of a traceback.
    st.stop()

# Initialize Groq client with the API key
groq_client = Groq(api_key=HUGGINGFACE_KEY)


@st.cache_resource
def _load_embedder():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Streamlit re-executes this script on every interaction; caching avoids
# reloading the model each time.
embedder = _load_embedder()

# Define file path and vector store folder
file_path = "The Rise of Agentic AI.pdf"  # File directly in the root directory of the app
VECTORSTORE_FOLDER = "vectorstore"  # Folder where the FAISS index will be stored

# Ensure the vector store folder exists (no-op when it is already there)
os.makedirs(VECTORSTORE_FOLDER, exist_ok=True)

# Path of the persisted FAISS index file
vectorstore_path = os.path.join(VECTORSTORE_FOLDER, "index.faiss")

# Load the persisted FAISS index if there is one; otherwise (or if reading it
# fails) start from an empty flat L2 index sized for the embedder's vectors.
index = None
if os.path.exists(vectorstore_path):
    try:
        index = faiss.read_index(vectorstore_path)
    except Exception as e:
        st.error(f"Error reading the FAISS index: {e}")
if index is None:
    index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())

# Text chunks of the processed document, position-aligned with the index rows
chunks = []

# Function to load text from PDF
def load_pdf_text(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string is returned when no page contains extractable text.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may give None or "" for pages without a text layer
        # (e.g. scanned images); normalise to "" so those pages are skipped
        # instead of raising TypeError on concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is emitted that is entirely contained in its predecessor.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *text* is empty).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    pieces = []
    for start in range(0, text_length, step):
        pieces.append(text[start:start + chunk_size])
        # This chunk already covers the tail of the text; anything further
        # would only duplicate characters that are already included.
        if start + chunk_size >= text_length:
            break
    return pieces

# Process the document and update vector store
def process_and_store_document(file_path):
    """Rebuild the vector store from the PDF at *file_path*.

    Extracts the document text, splits it into chunks, embeds every chunk,
    replaces the contents of the module-level FAISS ``index`` with the new
    vectors and writes the index to ``vectorstore_path``. On success the
    module-level ``chunks`` list is replaced so that position ``i`` in
    ``chunks`` corresponds to vector id ``i`` in the index.

    Progress and failures are reported through Streamlit messages; nothing
    is returned.

    Args:
        file_path: Path of the PDF document to process.
    """
    global chunks  # Make chunks global to access in the query part

    st.info("Processing PDF document...")

    # Extract text from the PDF file
    try:
        text = load_pdf_text(file_path)
    except Exception as e:
        st.error(f"Error reading the PDF document: {e}")
        return

    # Chunk the text into smaller pieces
    new_chunks = chunk_text(text)
    if not new_chunks:
        # Nothing to embed; adding an empty batch to the index would fail.
        st.error("No text could be extracted from the PDF document.")
        return

    # Generate embeddings for each chunk (FAISS expects float32 rows)
    embeddings = embedder.encode(new_chunks, show_progress_bar=True)
    vectors = np.asarray(embeddings, dtype=np.float32)

    # Start from an empty index: the index may already hold vectors from a
    # previous run or from disk, and appending again would duplicate them
    # and break the id -> chunk position mapping.
    index.reset()
    index.add(vectors)

    # Publish the chunks only once the index actually matches them
    chunks = new_chunks

    # Save the updated FAISS index
    try:
        faiss.write_index(index, vectorstore_path)
        st.success("Document processed and vector store updated!")
    except Exception as e:
        st.error(f"Error saving the FAISS index: {e}")

# User interface for Streamlit
st.title("The Rise of Agentic AI RAG Application")

# Button to trigger document processing
if st.button("Process PDF"):
    process_and_store_document(file_path)

# Streamlit re-runs this whole script on every widget interaction, which
# resets the module-level `chunks` list. Keep the processed chunks in the
# per-session state so a query entered after processing can still use them.
if chunks:
    st.session_state["chunks"] = chunks
elif "chunks" in st.session_state:
    chunks = st.session_state["chunks"]

# Query input for the user
user_query = st.text_input("Enter your query:", key="query_input")

if user_query:
    if not chunks:
        st.error("Please process the document first by clicking 'Process PDF'.")
    else:
        # Generate embedding for the user query (FAISS expects float32 rows)
        query_embedding = np.asarray(embedder.encode([user_query]), dtype=np.float32)

        # Never ask for more neighbours than there are chunks
        top_k = min(5, len(chunks))
        _distances, indices = index.search(query_embedding, k=top_k)

        # FAISS pads missing neighbours with -1; drop those and any id that
        # has no matching chunk instead of discarding the whole result.
        valid_indices = [int(idx) for idx in indices[0] if 0 <= idx < len(chunks)]

        if not valid_indices:
            st.error("No relevant results found in the index.")
        else:
            # Retrieve the most relevant chunks based on the valid indices
            retrieved_chunks = [chunks[idx] for idx in valid_indices]

            # Keep the context and the question clearly separated so the
            # query is not glued onto the end of the last chunk.
            context = "\n\n".join(retrieved_chunks)
            combined_input = (
                "Use the following context to answer the question.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {user_query}"
            )

            # Generate a response with Groq
            try:
                chat_completion = groq_client.chat.completions.create(
                    messages=[{
                        "role": "user",
                        "content": combined_input,
                    }],
                    model="llama3-8b-8192",  # Specify the model you want to use
                )

                # Display only the generated response
                st.subheader("Generated Response")
                st.write(chat_completion.choices[0].message.content)
            except Exception as e:
                st.error(f"Error generating response: {e}")

# Footer
st.markdown("<div class='footer'>Created by Shamil Shahbaz</div>", unsafe_allow_html=True)