File size: 3,321 Bytes
15da411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from groq import Groq
import requests
from io import BytesIO

# API key for the Groq service, read from the environment so the secret is
# never committed to source control. (The previously hardcoded key was exposed
# in this file and should be revoked/rotated.)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Function to download PDF from a URL
def download_pdf_from_url(url, timeout=30):
    """Download a PDF and return it as an in-memory file object.

    Args:
        url: HTTP(S) URL of the PDF to fetch.
        timeout: Seconds to wait for the server; without a timeout a hung
            server would block the Streamlit app indefinitely.

    Returns:
        BytesIO containing the response body, or None on any request failure
        (the error is shown to the user via st.error).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return BytesIO(response.content)
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to download PDF: {e}")
        return None

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_file: A file-like object (or path) readable by PdfReader.

    Returns:
        All page text joined into one string. Pages with no extractable text
        (e.g. scanned images) are treated as empty — extract_text() can
        return None, which would crash string concatenation otherwise.
    """
    reader = PdfReader(pdf_file)
    # join is O(n) overall, unlike repeated += which is quadratic.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to split text into chunks
def create_chunks(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of strings; an empty/whitespace-only input yields [].
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

# Cache of loaded SentenceTransformer models keyed by model name, so the
# expensive model load from disk happens once per process instead of on every
# call (the original reloaded the model for each batch AND each user query).
_MODEL_CACHE = {}

# Function to create embeddings
def create_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode a list of text chunks into dense vectors.

    Args:
        chunks: List of strings to embed.
        model_name: SentenceTransformer model identifier.

    Returns:
        numpy array of shape (len(chunks), embedding_dim).
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return model.encode(chunks)

# Function to store embeddings in FAISS
def store_embeddings_in_faiss(embeddings):
    """Build a flat (exact) L2 FAISS index over *embeddings*.

    Args:
        embeddings: Array-like of shape (n, dim).

    Returns:
        A populated faiss.IndexFlatL2.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    return index

# Function to query FAISS index
def query_faiss(index, query_embedding, k=5):
    """Return the indices of the *k* nearest stored vectors.

    Distances from index.search are intentionally discarded; callers only
    need the row ids to look up the corresponding text chunks.
    """
    _, neighbor_ids = index.search(query_embedding, k)
    return neighbor_ids

# Function to interact with Groq API
def send_query_to_groq(query):
    """Send *query* as a single user message to the Groq chat API.

    Returns the assistant's reply text from the first completion choice.
    """
    groq_client = Groq(api_key=GROQ_API_KEY)
    completion = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": query}],
    )
    return completion.choices[0].message.content

# Preload and process PDF links
def preload_pdfs(pdf_links):
    """Download each linked PDF, extract its text, and return all text chunks.

    Links that fail to download are skipped (download_pdf_from_url already
    reported the error to the user).
    """
    st.write("Downloading and processing PDFs...")
    collected = []
    for link in pdf_links:
        document = download_pdf_from_url(link)
        if document is None:
            continue
        collected.extend(create_chunks(extract_text_from_pdf(document)))
    return collected

# Streamlit UI
def main():
    """Streamlit entry point: preload PDFs, build the index, answer questions."""
    st.title("RAG-based Application")

    # Predefined PDF links
    pdf_links = [
        "https://drive.google.com/uc?id=1hF6exN7tYScy-mxQAP5X9R_200X-ukMB",  # Add your links here
        # Add more links as needed
    ]

    # Preload PDFs and create embeddings
    chunks = preload_pdfs(pdf_links)
    if not chunks:
        # No text extracted: embedding an empty list would crash downstream,
        # so stop with a clear message instead.
        st.error("No text could be extracted from the configured PDFs.")
        return
    embeddings = create_embeddings(chunks)
    index = store_embeddings_in_faiss(embeddings)

    st.success("All PDFs processed successfully! You can now ask questions.")

    # Input for user query
    query = st.text_input("Ask your question:")
    if query:
        st.write("Fetching relevant chunks...")
        query_embedding = create_embeddings([query])
        # Clamp k: asking FAISS for more neighbors than stored vectors yields
        # -1 indices, which would silently index chunks[-1].
        relevant_indices = query_faiss(index, query_embedding, k=min(5, len(chunks)))
        relevant_texts = [chunks[i] for i in relevant_indices[0]]
        context = " ".join(relevant_texts)

        st.write("Sending query to Groq API...")
        # BUG FIX: the original sent only the retrieved context to the LLM and
        # dropped the user's question entirely. Combine both so the model
        # actually answers the question grounded in the retrieved text.
        prompt = (
            "Answer the question using the context below.\n\n"
            f"Context:\n{context}\n\nQuestion: {query}"
        )
        response = send_query_to_groq(prompt)
        st.write("Response:", response)

# Run the app when the script is executed directly (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()