# Source: Hugging Face Space by khababakhtar — "Update app.py" (commit 0b21087, verified)
import os
import numpy as np
import faiss
import pytesseract
from pdf2image import convert_from_path
import requests
import streamlit as st
from groq import Groq
# Set up Groq client.
# NOTE(review): if GROQ_API_KEY is unset, os.environ.get returns None and the
# client is built with api_key=None — the failure then surfaces only at call
# time, not here. Consider failing fast; confirm deployment always sets it.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at *pdf_path* and return the combined text.

    Pages are rasterized with pdf2image, then each page image is run through
    Tesseract; the per-page strings are concatenated in page order.
    """
    pages = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(page) for page in pages]
    return "".join(page_texts)
def create_chunks(text, chunk_size=200):
    """Split *text* into whitespace-delimited chunks of at most *chunk_size* words.

    Words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed. An empty/whitespace-only input yields [].
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
# Function to store chunks in FAISS (GPU when available, CPU otherwise)
def store_chunks_in_faiss(chunks, vector_dim=768):
    """Build a FAISS L2 index with one embedding per chunk.

    Args:
        chunks: list of text chunks; only len(chunks) matters here because
            the embeddings are random placeholders (see NOTE below).
        vector_dim: dimensionality of the embedding vectors (default 768,
            matching the original hard-coded value).

    Returns:
        A populated faiss index — GPU-backed when a GPU build and device
        are present, otherwise a plain CPU IndexFlatL2.
    """
    index = faiss.IndexFlatL2(vector_dim)
    # Bug fix: the original unconditionally called StandardGpuResources(),
    # which raises AttributeError on CPU-only faiss builds. Only migrate the
    # index to GPU 0 when the build actually reports a GPU.
    if faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    # NOTE(review): placeholder random embeddings — similarity search over
    # this index is meaningless until real embeddings (e.g. from a sentence
    # encoder producing vector_dim-dimensional vectors) replace them.
    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
    index.add(embeddings)
    return index
# Check if FAISS is using GPU
def is_gpu_available():
    """Return True when the FAISS build reports at least one GPU device."""
    gpu_count = faiss.get_num_gpus()
    return gpu_count > 0
# Streamlit app interface
st.title("PDF Content Chunking and Retrieval with FAISS-GPU")

# PDF upload widget; returns None until the user provides a file.
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file:
    st.write("Processing the uploaded file...")
    # Persist the upload to disk: pdf2image's convert_from_path needs a
    # filesystem path, not an in-memory buffer.
    # NOTE(review): fixed filename means concurrent sessions overwrite each
    # other's upload — confirm single-user deployment or use a temp file.
    with open("uploaded_file.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Extract text via OCR and show it for inspection.
    extracted_text = extract_text_from_pdf("uploaded_file.pdf")
    st.text_area("Extracted Text", extracted_text, height=200)
    # Chunk text into fixed-size word groups.
    st.write("Creating chunks...")
    chunks = create_chunks(extracted_text)
    st.write(f"Total chunks created: {len(chunks)}")
    # Store chunks in FAISS (embeddings are placeholders — see
    # store_chunks_in_faiss).
    st.write("Storing chunks in FAISS...")
    index = store_chunks_in_faiss(chunks)
    if is_gpu_available():
        st.success("FAISS is using GPU resources!")
    else:
        st.warning("FAISS is running on CPU.")
    st.write("Chunks successfully stored in the FAISS index!")
# Interaction with Groq
# NOTE(review): the FAISS index built above is never queried here — the user's
# question is sent straight to the LLM with no retrieved context, so this is
# not actual RAG yet. Confirm whether retrieval was intended to be wired in.
user_input = st.text_input("Ask a question about the content:")
if user_input:
    st.write("Sending query to Groq API...")
    # Single-turn chat completion; no conversation history is kept.
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": user_input}],
        model="llama-3.3-70b-versatile"
    )
    st.text_area("Groq API Response", response.choices[0].message.content, height=100)