# NOTE: scrape artifacts (Hugging Face Spaces page header, "Build error"
# status, file size, commit hashes, and the line-number gutter) removed
# from this extracted file view.
import os
from io import BytesIO
import streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import tempfile
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Initialize Groq API client.
# SECURITY: an API key was previously hard-coded on this line and committed
# to the repository — that key is compromised and must be revoked. Read the
# key from the environment instead of embedding it in source.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
# Helper Functions
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page in an uploaded PDF."""
    reader = PdfReader(pdf_file)
    # extract_text() can return None for image-only pages; drop those so we
    # never concatenate None into the result.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(content for content in page_texts if content)
def create_chunks(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final chunk may be shorter than *chunk_size*; an empty string
    yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def create_embeddings(chunks):
    """Embed text chunks and load them into an in-memory FAISS L2 index.

    Raises ValueError when *chunks* is empty; returns the populated index.
    """
    if not chunks:
        raise ValueError("No text chunks provided for embedding.")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = np.atleast_2d(encoder.encode(chunks))  # single chunk -> (1, dim)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype("float32"))  # FAISS only accepts float32
    return index
def interact_with_model(query, faiss_index, chunks, top_k=3):
    """Answer *query* via retrieval-augmented generation.

    Embeds the query, retrieves the *top_k* nearest chunks from
    *faiss_index*, and sends them as context to the Groq chat model.

    Args:
        query: The user's question.
        faiss_index: FAISS index built over the embeddings of *chunks*.
        chunks: The text chunks the index was built from, in index order.
        top_k: Number of nearest chunks to retrieve (default 3, matching
            the previous hard-coded behavior).

    Returns:
        The model's answer as a string.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode([query])
    # FAISS requires float32 query vectors.
    distances, indices = faiss_index.search(query_embedding.astype("float32"), k=top_k)
    # FAISS pads with out-of-range ids when fewer than top_k vectors exist,
    # so guard the lookup against invalid indices.
    docs = [chunks[i] for i in indices[0] if i < len(chunks)]
    context = " ".join(docs)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\n{query}"},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages, model="llama-3.3-70b-versatile"
    )
    return chat_completion.choices[0].message.content
# Streamlit Frontend
def main():
    """Streamlit app: upload a PDF, then ask questions answered via RAG."""
    st.title("PDF Query App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is None:
        return
    # Streamlit reruns this entire script on every widget interaction, so
    # without caching the PDF would be re-parsed and re-embedded on every
    # query. Cache the chunks and index in session_state, keyed per file.
    cache_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("pdf_cache_key") != cache_key:
        text = extract_text_from_pdf(uploaded_file)
        if not text.strip():
            st.error("PDF contains no extractable text. Upload a valid PDF.")
            return
        chunks = create_chunks(text)
        if not chunks:
            st.error("No text chunks created. Check PDF content.")
            return
        try:
            faiss_index = create_embeddings(chunks)
        except Exception as e:
            st.error(f"Error creating embeddings: {str(e)}")
            return
        st.session_state["pdf_cache_key"] = cache_key
        st.session_state["pdf_chunks"] = chunks
        st.session_state["pdf_index"] = faiss_index
    chunks = st.session_state["pdf_chunks"]
    faiss_index = st.session_state["pdf_index"]
    query = st.text_input("Ask a question about the PDF:")
    if query:
        response = interact_with_model(query, faiss_index, chunks)
        st.write(response)


if __name__ == "__main__":
    main()