Spaces: Runtime error

import streamlit as st
import os
from groq import Groq
import fitz  # PyMuPDF for PDF parsing
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer  # Hugging Face transformer

# Initialize the Hugging Face model and Groq API client
model = SentenceTransformer('all-MiniLM-L6-v2')  # Model for generating embeddings
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # Read the key from the environment instead of hardcoding it in the source
client = Groq(api_key=GROQ_API_KEY)

# Function to extract text from a PDF
def extract_text_from_pdf(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")  # Open from the uploaded byte stream and specify the file type
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Function to generate embeddings using the Hugging Face model (for text retrieval)
def generate_huggingface_embeddings(text):
    embeddings = model.encode(text)  # Using the SentenceTransformer model
    return embeddings
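
# Note: model.encode also accepts a list of strings and returns a 2-D array of
# shape (n_texts, 384) for all-MiniLM-L6-v2, so the per-chunk loop further down
# could be done in a single batched call such as model.encode(document_chunks).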

# Function to get relevant chunks from the document using FAISS similarity search
def get_relevant_chunks(query, top_k=5):
    query_embedding = generate_huggingface_embeddings(query)  # Get query embedding
    query_embedding = np.array(query_embedding).reshape(1, -1)  # Reshape to (1, dim) for FAISS
    # Perform similarity search in FAISS
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [document_chunks[i] for i in indices[0]]
    return relevant_chunks
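
# Note: `index` and `document_chunks` are module-level names created further down,
# once a PDF has been uploaded. Streamlit re-runs the whole script on every
# interaction, so both already exist by the time generate_answer calls this function.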

# Function to generate an answer based on retrieved context and Groq's model
def generate_answer(query):
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks
    # Generate the response with Groq's chat model, passing both the retrieved context and the question
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"Answer the question using this context:\n{context}\n\nQuestion: {query}"}],
        model="llama3-8b-8192",  # Adjust with the appropriate Groq model
        stream=False
    )
    return chat_completion.choices[0].message.content

# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")

# Upload PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract the text content from the uploaded PDF
    document_text = extract_text_from_pdf(pdf_file)

    # Split the document into chunks (adjust chunk size as needed)
    chunk_size = 1000  # Size of each chunk of text for embedding
    document_chunks = [document_text[i:i+chunk_size] for i in range(0, len(document_text), chunk_size)]

    # Generate embeddings for each chunk and store them
    embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

    # Convert embeddings to a numpy array for FAISS
    embeddings_array = np.array(embeddings)

    # Initialize FAISS index
    index = faiss.IndexFlatL2(embeddings_array.shape[1])  # L2 distance metric

    # Add embeddings to the FAISS index
    index.add(embeddings_array)
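    # Note: faiss.IndexFlatL2 expects float32 input; SentenceTransformer.encode
    # already returns float32 vectors (384-dimensional for all-MiniLM-L6-v2),
    # so embeddings_array can be added to the index as-is.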

    # Query input from user
    query = st.text_input("Ask a question about the document:")
    if query:
        # Generate the answer based on the query
        answer = generate_answer(query)
        st.write("Answer:", answer)
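
For the script above to run in a Space, the packages it imports have to be installed and the Groq key has to be available at runtime. A minimal requirements.txt covering these imports might look like the following; the PyPI names are assumptions for the usual distributions, with faiss-cpu being the CPU-only FAISS build:

    streamlit
    groq
    PyMuPDF
    numpy
    faiss-cpu
    sentence-transformers

Since GROQ_API_KEY is read via os.environ.get, it should be set as a secret in the Space settings rather than committed in app.py.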