import os

import requests
import streamlit as st
from PyPDF2 import PdfReader
from langchain_community.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq

# Hardcoded Google Drive link to the source PDF document.
GOOGLE_DRIVE_LINK = "https://drive.google.com/file/d/1wv5gbGP0SA15BzoNUxprXhYx0jHhPgHl/view?usp=sharing"


def download_pdf():
    """Download the PDF behind GOOGLE_DRIVE_LINK to ./document.pdf.

    Returns:
        str: The local path of the downloaded file ("document.pdf").

    Raises:
        requests.exceptions.RequestException: On network failure or a
            non-2xx HTTP response.
    """
    # Extract the file id from the share URL and build a direct-download URL.
    file_id = GOOGLE_DRIVE_LINK.split("/d/")[1].split("/view")[0]
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # NOTE(review): very large Drive files return a virus-scan confirmation
    # page instead of the file itself — confirm this file stays small enough.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of silently saving an HTML error page as a "PDF".
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    return "document.pdf"


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: Path (or file-like object) of the PDF to read.

    Returns:
        str: All extracted page text joined together. Pages with no
        extractable text (e.g. scanned images) contribute nothing.
    """
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages; guard it
        # so concatenation never raises TypeError.
        text += page.extract_text() or ""
    return text


def create_vector_db(text):
    """Split *text* into chunks and index them in a FAISS vector store.

    Args:
        text: The full document text to index.

    Returns:
        FAISS: A vector store of 500-character chunks (50-char overlap)
        embedded with the all-MiniLM-L6-v2 sentence-transformer model.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(text)

    # Use Hugging Face embeddings (local sentence-transformer, no API key).
    model_name = "all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    vector_db = FAISS.from_texts(chunks, embeddings)
    return vector_db


def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Ask the Groq chat-completions API *query* grounded in *context*.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer in.
        model: Groq model identifier (default "llama-3.3-70b-versatile").

    Returns:
        str: The model's answer, or a human-readable error string —
        network and HTTP errors are reported, not raised, so the
        Streamlit UI can display them directly.
    """
    # Read the key from the environment; never hardcode secrets in source.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        # Matches the function's error-string convention below.
        return "Error: GROQ_API_KEY environment variable is not set."

    # Groq's OpenAI-compatible chat-completions endpoint.
    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"}
        ],
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses

        result = response.json()
        # Defensive extraction: fall back to a placeholder rather than
        # raising KeyError/IndexError on an unexpected response shape.
        return result.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except requests.exceptions.RequestException as e:
        # Surface the error to the UI instead of crashing the app.
        return f"Error: {e}"


# ---------------------------------------------------------------------------
# Streamlit App
# ---------------------------------------------------------------------------
st.title("PDF Book Query and Response")

# Persistent state so the vector database survives Streamlit reruns.
if "vector_db" not in st.session_state:
    st.session_state.vector_db = None

# Process the hardcoded PDF link.
if st.button("Process PDF"):
    st.info("Downloading and processing the PDF...")
    pdf_file = download_pdf()
    pdf_text = extract_text_from_pdf(pdf_file)
    st.success("PDF processed successfully!")

    # Create FAISS vector database.
    st.info("Creating vector database...")
    st.session_state.vector_db = create_vector_db(pdf_text)
    st.success("Vector database created!")

# Query the document once the index exists.
if st.session_state.vector_db:
    user_query = st.text_input("Ask a question about the document:")
    if st.button("Submit Query"):
        with st.spinner("Processing your query..."):
            # Retrieve the 3 most similar chunks as grounding context.
            similar_docs = st.session_state.vector_db.similarity_search(user_query, k=3)
            context = " ".join([doc.page_content for doc in similar_docs])

            # Send query with context to Groq API.
            response = query_groq_api(user_query, context)
            st.write("**Answer:**", response)