Spaces: the hosting platform (likely Hugging Face Spaces) reported "Build error" twice for this app.
| import os | |
| from io import BytesIO | |
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import FAISS | |
| from groq import Groq | |
| import tempfile | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import numpy as np | |
# Initialize Groq API client.
# SECURITY FIX: the API key was previously hard-coded in source (and therefore
# leaked via version control / the public Space). Read it from the environment
# instead; the Groq SDK also falls back to the GROQ_API_KEY env var on its own.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
| # Helper Functions | |
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    Pages for which PyPDF2 yields no text (e.g. image-only/scanned pages,
    where extract_text() returns None) contribute nothing to the result.
    """
    pages = PdfReader(pdf_file).pages
    # `or ""` maps a None extraction result to the empty string.
    return "".join(page.extract_text() or "" for page in pages)
def create_chunks(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter than *chunk_size*; an empty string yields
    an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def create_embeddings(chunks):
    """Embed *chunks* with SentenceTransformers and return a FAISS L2 index.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text chunks provided for embedding.")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    # atleast_2d promotes a single-chunk 1-D embedding to shape (1, dim),
    # matching the expand_dims handling of the original implementation.
    vectors = np.atleast_2d(encoder.encode(chunks))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype("float32"))  # FAISS only accepts float32
    return index
def interact_with_model(query, faiss_index, chunks):
    """Answer *query* using context retrieved from *faiss_index*.

    The query is embedded with the same SentenceTransformer model used for
    indexing, the 3 nearest chunks are retrieved, and the combined context
    plus question is sent to the Groq chat model.

    Returns the model's reply as a string.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode([query])
    # Search the FAISS index for the 3 nearest chunk embeddings.
    distances, indices = faiss_index.search(query_embedding.astype("float32"), k=3)
    # BUG FIX: when the index holds fewer than k vectors, FAISS pads the
    # result with id -1. The old guard `i < len(chunks)` let -1 through,
    # and chunks[-1] silently injected the *last* chunk as bogus context.
    # Requiring 0 <= i drops the padding entries.
    docs = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(docs)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\n{query}"},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages, model="llama-3.3-70b-versatile"
    )
    return chat_completion.choices[0].message.content
| # Streamlit Frontend | |
def main():
    """Streamlit entry point: upload a PDF, index it, and answer questions.

    Streamlit re-executes this whole script on every widget interaction, so
    the expensive embedding/indexing step is cached in st.session_state and
    rebuilt only when a different file is uploaded — not on every keystroke
    in the query box (the original rebuilt the FAISS index on every rerun).
    """
    st.title("PDF Query App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is None:
        return
    text = extract_text_from_pdf(uploaded_file)
    if not text.strip():
        st.error("PDF contains no extractable text. Upload a valid PDF.")
        return
    chunks = create_chunks(text)
    if not chunks:
        st.error("No text chunks created. Check PDF content.")
        return
    # Rebuild the index only when the uploaded file changes.
    cache_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("pdf_cache_key") != cache_key:
        try:
            st.session_state["faiss_index"] = create_embeddings(chunks)
            st.session_state["pdf_cache_key"] = cache_key
        except Exception as e:
            st.error(f"Error creating embeddings: {str(e)}")
            return
    faiss_index = st.session_state["faiss_index"]
    query = st.text_input("Ask a question about the PDF:")
    if query:
        response = interact_with_model(query, faiss_index, chunks)
        st.write(response)
# Run the app when executed as a script (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()