# pdf_query_app / app.py
# Author: NaseemTahir — "Update app.py" (commit 60ff4c1, verified)
import os
from io import BytesIO
import streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import tempfile
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Initialize Groq API client.
# SECURITY: the API key must come from the environment, never be hard-coded —
# the previous literal key was committed to the repo and must be rotated.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Helper Functions
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in the uploaded PDF.

    Pages with no extractable text (``extract_text()`` returning ``None``)
    are skipped rather than contributing a literal ``None``.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(t for t in page_texts if t)
def create_chunks(text, chunk_size=500):
    """Split *text* into consecutive fixed-size pieces.

    The final chunk may be shorter than *chunk_size*; an empty string
    yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def create_embeddings(chunks):
    """Embed the text chunks and return a FAISS L2 index over them.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text chunks provided for embedding.")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    # atleast_2d guards the single-chunk case, where encode() may yield a 1-D vector.
    vectors = np.atleast_2d(encoder.encode(chunks)).astype("float32")  # FAISS expects float32
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
def interact_with_model(query, faiss_index, chunks):
    """Answer *query* using the chunks most similar to it in *faiss_index*.

    Embeds the query, retrieves up to 3 nearest chunks as context, and asks
    the Groq chat model to answer with that context prepended.

    Returns:
        str: the model's reply text.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode([query])
    # Search FAISS index for the 3 nearest chunks.
    distances, indices = faiss_index.search(query_embedding.astype("float32"), k=3)
    # BUG FIX: FAISS pads with -1 when the index holds fewer than k vectors;
    # the old check `i < len(chunks)` let -1 through, so chunks[-1] silently
    # duplicated the last chunk. Require 0 <= i as well.
    docs = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(docs)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\n{query}"},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages, model="llama-3.3-70b-versatile"
    )
    return chat_completion.choices[0].message.content
# Streamlit Frontend
def main():
    """Streamlit front end: upload a PDF, index it, and answer questions."""
    st.title("PDF Query App")

    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # Guard clauses: bail out early on unusable PDFs.
    text = extract_text_from_pdf(uploaded_file)
    if not text.strip():
        st.error("PDF contains no extractable text. Upload a valid PDF.")
        return

    chunks = create_chunks(text)
    if not chunks:
        st.error("No text chunks created. Check PDF content.")
        return

    try:
        faiss_index = create_embeddings(chunks)
    except Exception as e:
        st.error(f"Error creating embeddings: {str(e)}")
        return

    query = st.text_input("Ask a question about the PDF:")
    if query:
        st.write(interact_with_model(query, faiss_index, chunks))


if __name__ == "__main__":
    main()