"""Streamlit RAG demo: answer questions about a fixed set of PDFs.

Pipeline: download PDFs -> extract text -> split into word chunks -> embed
with SentenceTransformers -> index in FAISS -> retrieve the nearest chunks for
a user question -> send context + question to the Groq chat API.
"""

import os
from io import BytesIO

import faiss
import numpy as np
import requests
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# The key is read from the environment instead of being committed to source
# control. Any key that was previously hardcoded here should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

DEFAULT_GROQ_MODEL = "llama3-8b-8192"
DOWNLOAD_TIMEOUT_SECONDS = 30


def download_pdf_from_url(url):
    """Download *url* and return its body as an in-memory binary stream.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` if the request
        failed (the failure is reported in the UI via ``st.error``).
    """
    try:
        # A timeout keeps a stalled server from hanging the app indefinitely.
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to download PDF: {e}")
        return None
    return BytesIO(response.content)


def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, one page per line group.

    Args:
        pdf_file: A path or binary file-like object accepted by ``PdfReader``.

    Returns:
        The page texts joined with newlines. A page for which
        ``extract_text()`` yields nothing contributes an empty string.
    """
    reader = PdfReader(pdf_file)
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next; `or ""` guards against a None page result.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def create_chunks(text, chunk_size=500):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Args:
        text: Source text; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).

    Returns:
        A list of chunk strings with words re-joined by single spaces. Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


@st.cache_resource(show_spinner=False)
def _load_embedding_model(model_name):
    """Load a SentenceTransformer once per model name and reuse it."""
    return SentenceTransformer(model_name)


def create_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode *chunks* into embedding vectors.

    Args:
        chunks: List of strings to embed.
        model_name: SentenceTransformers model identifier.

    Returns:
        Whatever ``model.encode(chunks)`` returns for the chosen model.
    """
    # The model is cached so it is not reloaded for every single query.
    model = _load_embedding_model(model_name)
    return model.encode(chunks)


def store_embeddings_in_faiss(embeddings):
    """Build a flat L2 FAISS index over *embeddings*.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing all vectors, in input order.

    Raises:
        ValueError: If *embeddings* is not a non-empty 2-D array.
    """
    # The vectors are handed to FAISS as a contiguous float32 matrix.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


def query_faiss(index, query_embedding, k=5):
    """Return the indices of the *k* nearest stored vectors for each query.

    Args:
        index: A FAISS index as built by ``store_embeddings_in_faiss``.
        query_embedding: One query vector or a 2-D batch of query vectors.
        k: Number of neighbours to request per query.

    Returns:
        The index array from ``index.search``, one row per query. FAISS pads
        a row with ``-1`` when fewer than *k* vectors are stored, so callers
        must skip negative entries.
    """
    queries = np.atleast_2d(np.asarray(query_embedding, dtype=np.float32))
    _distances, indices = index.search(queries, k)
    return indices


def send_query_to_groq(query, model=DEFAULT_GROQ_MODEL):
    """Send *query* as a single user message to Groq and return the reply.

    Args:
        query: Full prompt text to send.
        model: Groq chat model identifier.

    Returns:
        The text content of the first completion choice.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    if not GROQ_API_KEY:
        raise RuntimeError(
            "GROQ_API_KEY environment variable is not set; "
            "export it before starting the app."
        )
    client = Groq(api_key=GROQ_API_KEY)
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": query}],
        model=model,
    )
    return response.choices[0].message.content


def preload_pdfs(pdf_links):
    """Download every PDF in *pdf_links* and return all of their text chunks.

    Links that fail to download are skipped (``download_pdf_from_url`` reports
    the error in the UI).

    Args:
        pdf_links: Iterable of PDF URLs.

    Returns:
        A flat list of chunk strings from all successfully downloaded PDFs.
    """
    st.write("Downloading and processing PDFs...")
    all_chunks = []
    for url in pdf_links:
        pdf_file = download_pdf_from_url(url)
        if pdf_file:
            text = extract_text_from_pdf(pdf_file)
            all_chunks.extend(create_chunks(text))
    return all_chunks


@st.cache_resource(show_spinner=False)
def _build_knowledge_base(pdf_links):
    """Download, chunk, embed and index the PDFs once per set of links.

    Streamlit reruns the script on every interaction; caching keeps each
    question from re-downloading and re-embedding all documents. Raising on
    an empty result (rather than returning it) keeps a transient download
    failure from being cached.

    Args:
        pdf_links: Tuple of PDF URLs (a tuple so it can serve as a cache key).

    Returns:
        ``(chunks, index)`` where ``index`` is the FAISS index over ``chunks``.

    Raises:
        ValueError: If no text could be extracted from any PDF.
    """
    chunks = preload_pdfs(pdf_links)
    if not chunks:
        raise ValueError("No text could be extracted from the configured PDFs.")
    index = store_embeddings_in_faiss(create_embeddings(chunks))
    return chunks, index


def _build_prompt(context, question):
    """Combine retrieved context and the user's question into one prompt."""
    return (
        "Answer the question using the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}"
    )


def main():
    """Render the Streamlit UI and answer questions against the PDFs."""
    st.title("RAG-based Application")

    # Predefined PDF links; add more entries as needed.
    pdf_links = [
        "https://drive.google.com/uc?id=1hF6exN7tYScy-mxQAP5X9R_200X-ukMB",
    ]

    try:
        chunks, index = _build_knowledge_base(tuple(pdf_links))
    except ValueError as e:
        st.error(str(e))
        return
    st.success("All PDFs processed successfully! You can now ask questions.")

    query = st.text_input("Ask your question:")
    if not query:
        return

    st.write("Fetching relevant chunks...")
    query_embedding = create_embeddings([query])
    relevant_indices = query_faiss(index, query_embedding)
    # Skip FAISS's -1 padding; chunks[-1] would silently return the last chunk.
    relevant_texts = [chunks[i] for i in relevant_indices[0] if i >= 0]
    context = " ".join(relevant_texts)

    st.write("Sending query to Groq API...")
    try:
        # The question itself must be part of the prompt, not just the context.
        response = send_query_to_groq(_build_prompt(context, query))
    except RuntimeError as e:
        st.error(str(e))
        return
    st.write("Response:", response)


if __name__ == "__main__":
    main()