Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| import faiss | |
| import numpy as np | |
| import fitz # PyMuPDF for PDF text extraction | |
| from sentence_transformers import SentenceTransformer | |
| from groq import Groq | |
| # Set up API key for Groq LLM | |
| GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "") | |
| if not GROQ_API_KEY: | |
| st.error("π¨ Groq API Key is missing! Set `GROQ_API_KEY` in the environment.") | |
| st.stop() | |
| # Initialize Groq Client | |
| client = Groq(api_key=GROQ_API_KEY) | |
| # Load Embedding Model | |
| embed_model = SentenceTransformer("all-MiniLM-L6-v2") | |
| # Initialize FAISS Index | |
| embedding_size = 384 # Dimension of embeddings from MiniLM | |
| index = faiss.IndexFlatL2(embedding_size) | |
| documents = [] # To store text chunks | |
| # Function to extract text from PDF | |
| def extract_text_from_pdf(pdf_file): | |
| try: | |
| doc = fitz.open(stream=pdf_file.read(), filetype="pdf") | |
| text = "\n".join([page.get_text("text") for page in doc]) | |
| return text | |
| except Exception as e: | |
| st.error(f"β Error extracting text: {e}") | |
| return "" | |
| # Function to split text into chunks | |
| def chunk_text(text, chunk_size=512): | |
| words = text.split() | |
| return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)] | |
| # Function to store document embeddings in FAISS | |
| def store_embeddings(chunks): | |
| global documents, index | |
| embeddings = embed_model.encode(chunks) | |
| index.add(np.array(embeddings).astype("float32")) | |
| documents.extend(chunks) | |
| # Function to retrieve relevant chunks from FAISS | |
| def retrieve_relevant_chunks(query, top_k=3): | |
| if index.ntotal == 0: | |
| return [] | |
| query_embedding = embed_model.encode([query]).astype("float32") | |
| distances, indices = index.search(query_embedding, top_k) | |
| return [documents[i] for i in indices[0] if i < len(documents)] | |
| # Function to query Groq API with retrieved context | |
| def ask_groq(question, context): | |
| prompt = f"Context: {context}\nQuestion: {question}\nAnswer:" | |
| try: | |
| response = client.chat.completions.create( | |
| messages=[{"role": "user", "content": prompt}], | |
| model="llama-3.3-70b-versatile", | |
| ) | |
| return response.choices[0].message.content | |
| except Exception as e: | |
| return f"β Error generating response: {e}" | |
| # Streamlit UI | |
| st.set_page_config(page_title="RAG Q&A with Groq", page_icon="π", layout="wide") | |
| st.title("π RAG-based Q&A with Open Source LLM & FAISS") | |
| st.write("Upload a **PDF document**, then ask questions based on its content!") | |
| uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"]) | |
| if uploaded_file: | |
| with st.spinner("Extracting text from PDF..."): | |
| pdf_text = extract_text_from_pdf(uploaded_file) | |
| if pdf_text: | |
| with st.spinner("Chunking and embedding document..."): | |
| chunks = chunk_text(pdf_text) | |
| store_embeddings(chunks) | |
| st.success("β Document processed! You can now ask questions.") | |
| question = st.text_input("Ask a question from the document:", "") | |
| if st.button("Get Answer"): | |
| if question: | |
| if index.ntotal == 0: | |
| st.warning("β οΈ No document uploaded! Please upload a PDF first.") | |
| else: | |
| with st.spinner("Retrieving relevant context..."): | |
| context = " ".join(retrieve_relevant_chunks(question)) | |
| with st.spinner("Generating answer using Groq LLM..."): | |
| answer = ask_groq(question, context) | |
| st.success("Answer:") | |
| st.write(answer) | |
| else: | |
| st.warning("β οΈ Please enter a question!") | |