"""Streamlit demo: OCR a PDF, chunk the text, index it in FAISS, query Groq.

Pipeline: upload PDF -> OCR each page (pytesseract) -> word-based chunking ->
store placeholder embeddings in a FAISS L2 index (GPU when available) ->
free-form Q&A against the Groq chat API.

NOTE(review): the FAISS index is never queried when answering — the Groq call
sends only the raw user question, so this is not yet true retrieval-augmented
generation. Confirm whether chunk retrieval should feed the prompt.
"""

import os

import faiss
import numpy as np
import pytesseract
import requests  # imported in the original; kept even though currently unused
import streamlit as st
from groq import Groq
from pdf2image import convert_from_path

# Groq client — requires GROQ_API_KEY in the environment.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at *pdf_path* and return the concatenated text.

    Renders pages to images via pdf2image, then runs Tesseract on each page.
    """
    images = convert_from_path(pdf_path)
    # str.join instead of repeated "+=": avoids quadratic concatenation
    # on long documents while producing identical output.
    return "".join(pytesseract.image_to_string(page) for page in images)


def create_chunks(text, chunk_size=200):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of strings; the final chunk may be shorter. Empty input
    yields an empty list.
    """
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def is_gpu_available():
    """Return True when the installed FAISS build reports at least one GPU."""
    # faiss-cpu builds lack the GPU symbols entirely, so probe with hasattr
    # before calling get_num_gpus().
    return hasattr(faiss, "get_num_gpus") and faiss.get_num_gpus() > 0


def store_chunks_in_faiss(chunks):
    """Store one embedding per chunk in a FAISS L2 index and return the index.

    NOTE: embeddings are random placeholders for demonstration only —
    similarity search over this index is meaningless until a real embedding
    model replaces np.random.rand below.
    """
    vector_dim = 768  # assumed embedding dimensionality (e.g. BERT-base)
    index = faiss.IndexFlatL2(vector_dim)
    # Bug fix: the original called faiss.StandardGpuResources() unconditionally,
    # which raises AttributeError on CPU-only FAISS builds and crashes the app.
    # Only move the index to GPU 0 when a GPU is actually available.
    if is_gpu_available():
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
    index.add(embeddings)
    return index


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("PDF Content Chunking and Retrieval with FAISS-GPU")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file:
    st.write("Processing the uploaded file...")
    # Persist the upload to disk because pdf2image works on file paths.
    with open("uploaded_file.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())

    extracted_text = extract_text_from_pdf("uploaded_file.pdf")
    st.text_area("Extracted Text", extracted_text, height=200)

    st.write("Creating chunks...")
    chunks = create_chunks(extracted_text)
    st.write(f"Total chunks created: {len(chunks)}")

    st.write("Storing chunks in FAISS...")
    index = store_chunks_in_faiss(chunks)
    if is_gpu_available():
        st.success("FAISS is using GPU resources!")
    else:
        st.warning("FAISS is running on CPU.")
    st.write("Chunks successfully stored in the FAISS index!")

# Q&A against the Groq chat API (currently independent of the FAISS index).
user_input = st.text_input("Ask a question about the content:")
if user_input:
    st.write("Sending query to Groq API...")
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": user_input}],
        model="llama-3.3-70b-versatile",
    )
    st.text_area(
        "Groq API Response",
        response.choices[0].message.content,
        height=100,
    )