"""Google Drive RAG Application."""

import os
from io import BytesIO
from urllib.parse import urlparse, parse_qs

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer

# Initialize the embedding model shared by indexing and querying
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the Groq client.
# The API key is read from the GROQ_API_KEY environment variable instead of
# being hard-coded: a secret committed to source control is readable by anyone
# with access to the repository. The key that was previously embedded here
# must be treated as leaked and revoked.
# NOTE(review): when the variable is unset API_KEY is None; confirm the Groq
# client rejects that with a clear error at startup.
API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=API_KEY)

# Predefined Google Drive share links, built from the file IDs and the
# common "view" URL template they all share.
_DRIVE_VIEW_URL = "https://drive.google.com/file/d/{file_id}/view?usp=sharing"
STORED_LINKS = [
    _DRIVE_VIEW_URL.format(file_id=drive_file_id)
    for drive_file_id in (
        "1zHtEpoEZv_3BhEDhQKkf1D1vya2jzyAd",
        "1xnRgDFGGV723Bgddf8KE9quwzpllgxyD",
    )
]

def extract_drive_file_id(url):
    """Return the file ID embedded in a Google Drive URL, or ``None``.

    Two URL shapes are recognised:

    * ``https://drive.google.com/uc?id=<id>`` (or ``/open?id=<id>``): the ID
      is taken from the ``id`` query parameter.
    * ``https://drive.google.com/file/d/<id>/view``: the ID is the path
      segment that follows ``d``.

    Args:
        url: The URL string to inspect.

    Returns:
        The file ID as a string, or ``None`` when the host is not
        ``drive.google.com`` (or a subdomain of it) or no ID can be found.
    """
    parsed_url = urlparse(url)

    # Compare the parsed hostname rather than searching the netloc for a
    # substring, so look-alike hosts such as "drive.google.com.example.org"
    # are rejected.
    host = parsed_url.hostname or ''
    if host != 'drive.google.com' and not host.endswith('.drive.google.com'):
        return None

    query_id = parse_qs(parsed_url.query).get('id', [None])[0]
    if query_id:
        return query_id

    # Locate the segment after "d" instead of assuming a fixed position, so
    # short or unrelated paths yield None rather than raising IndexError.
    segments = parsed_url.path.split('/')
    if 'd' in segments:
        position = segments.index('d') + 1
        if position < len(segments) and segments[position]:
            return segments[position]
    return None

def download_pdf_from_drive(file_id, timeout=30):
    """Download a Google Drive file and return its bytes as an in-memory stream.

    Args:
        file_id: The Drive file ID to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the app
            on a stalled connection.

    Returns:
        A ``BytesIO`` holding the full response body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    # Passing the query via ``params`` lets requests URL-encode the file ID.
    # NOTE(review): Drive may answer with an HTML page instead of the file
    # for large or non-public files; that case is not detected here.
    response = requests.get(
        "https://drive.google.com/uc",
        params={"id": file_id, "export": "download"},
        timeout=timeout,
    )
    response.raise_for_status()
    return BytesIO(response.content)

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and join it with spaces.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The page texts joined by single spaces. Pages for which extraction
        yields nothing (a falsy result) are skipped.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page exactly once; the results are filtered afterwards
        # instead of calling extract_text() a second time per page.
        page_texts = (page.extract_text() for page in pdf.pages)
        return ' '.join(page_text for page_text in page_texts if page_text)

def create_embeddings(text, chunk_size=500):
    """Split *text* into fixed-size chunks, embed them and index them in FAISS.

    Args:
        text: The full document text to index.
        chunk_size: Number of characters per chunk. Defaults to 500.

    Returns:
        A ``(chunks, embeddings, index)`` tuple: the list of text chunks, the
        array of their embeddings, and a ``faiss.IndexFlatL2`` containing
        those embeddings in the same order as ``chunks``.

    Raises:
        ValueError: If *text* is empty or *chunk_size* is not positive. With
            no chunks there is no embedding dimension to build the index from.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        raise ValueError("cannot create embeddings from empty text")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    # FAISS works on float32 vectors; coerce explicitly so the index does not
    # depend on the encoder's output dtype.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index

def get_relevant_chunk(question, embeddings, index, chunks):
    """Return the chunk whose embedding is the nearest neighbour of *question*.

    Args:
        question: The user's question text.
        embeddings: Unused here; kept so existing callers keep working.
        index: The FAISS index to search.
        chunks: The text chunks, in the order they were added to *index*.

    Returns:
        The single chunk at the position reported as the top hit by the index.
    """
    query_vector = np.array(embed_model.encode([question])).astype(np.float32)
    # Ask for exactly one neighbour; the distances are not needed.
    _distances, neighbour_ids = index.search(query_vector, 1)
    best_position = neighbour_ids[0][0]
    return chunks[best_position]

def get_answer_from_groq(question, context, model="llama3-8b-8192"):
    """Ask a Groq chat model to answer *question* using *context*.

    Args:
        question: The user's question.
        context: The retrieved text the answer should be based on.
        model: The Groq model ID to query. Exposed as a parameter so the
            model can be switched without editing this function.
            NOTE(review): providers retire model IDs over time; confirm the
            default is still served.

    Returns:
        The text content of the first completion choice.
    """
    prompt = f"Answer the following question based on the context:\nContext: {context}\nQuestion: {question}"
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content

# Streamlit app
def main():
    """Render the page: ingest the stored PDFs, index them and answer questions.

    Each link in ``STORED_LINKS`` is downloaded and its text extracted; a
    failure on one link is reported in the UI and does not stop the others.
    The combined text is embedded into a FAISS index, and when the user enters
    a question the closest chunk is sent to the Groq model as context.
    """
    st.set_page_config(page_title="Google Drive RAG App", page_icon="📄", layout="centered")
    st.markdown("<h1 style='text-align: center;'>Google Drive RAG Application</h1>", unsafe_allow_html=True)

    st.write("Processing predefined document links from Google Drive to generate embeddings stored in a FAISS index.")

    # Process predefined links
    document_texts = []
    failed_links = 0
    for link in STORED_LINKS:
        # Broad catch on purpose: this is the UI boundary, and one bad
        # document should be reported without aborting the rest.
        try:
            file_id = extract_drive_file_id(link)
            if not file_id:
                st.warning(f"⚠️ Invalid link: {link}")
                failed_links += 1
                continue
            st.write(f"📥 Processing document: {link}")
            pdf_file = download_pdf_from_drive(file_id)
            document_texts.append(extract_text_from_pdf(pdf_file))
        except Exception as e:
            st.error(f"❌ Failed to process link: {link}. Error: {e}")
            failed_links += 1

    # Join with a space so the last word of one document does not fuse with
    # the first word of the next.
    all_text = ' '.join(text for text in document_texts if text)

    if not all_text:
        st.warning("⚠️ No text could be extracted from the configured documents.")
        return

    # Only claim full success when every link was actually processed.
    if failed_links:
        processed = len(STORED_LINKS) - failed_links
        st.warning(f"⚠️ Processed {processed} of {len(STORED_LINKS)} documents; see the messages above.")
    else:
        st.success("✅ All documents processed successfully!")

    # Create embeddings
    # NOTE(review): this whole function runs on every script execution;
    # consider caching the downloaded text and the index if that is too slow.
    st.write("🔄 Creating embeddings...")
    chunks, embeddings, index = create_embeddings(all_text)
    st.success("✅ Embeddings created and stored in FAISS index!")

    # Question section
    question = st.text_input("Ask a question based on the uploaded documents:")
    if question:
        relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
        st.write("🔄 Retrieving the answer...")
        answer = get_answer_from_groq(question, relevant_chunk)
        st.subheader("Answer:")
        st.write(answer)


if __name__ == "__main__":
    main()