# RAG PDF Q&A App (Groq + FAISS) — Hugging Face Space.
import os
import tempfile

import faiss
import numpy as np
import streamlit as st
from groq import Groq
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

# Read the Groq API key from the environment.
# BUG FIX: the original passed the bare name GROQ_API_KEY (an undefined
# variable -> NameError at import time); the environment variable name
# must be a string literal.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Sentence-embedding model shared by document chunks and user queries.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Groq chat-completions client used by query_groq().
client = Groq(api_key=GROQ_API_KEY)
| # Functions | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, newline-joined.

    *pdf_file* may be a path or a binary file-like object — anything
    ``PdfReader`` accepts.

    BUG FIX: ``page.extract_text()`` can return ``None`` (e.g. for
    image-only pages), which would crash ``"\\n".join``; such pages now
    contribute an empty string instead.
    """
    reader = PdfReader(pdf_file)
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into whitespace-tokenized chunks of up to *chunk_size*
    words, with consecutive chunks sharing *overlap* words.

    Returns ``[]`` for empty or whitespace-only text.

    Raises:
        ValueError: if ``overlap >= chunk_size``.  (The original let the
        range step reach zero — an opaque ``range()`` error — or go
        negative, which silently returned no chunks at all.)
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def embed_chunks(chunks):
    """Encode each text chunk into a dense vector with the module-level
    sentence-transformer model; returns whatever ``encode`` produces
    (one row per chunk)."""
    vectors = embed_model.encode(chunks)
    return vectors
def create_faiss_index(embeddings):
    """Build a flat (exact, L2-distance) FAISS index over *embeddings*.

    *embeddings* is a 2-D array; its second dimension fixes the index
    dimensionality.  Returns the populated index.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
def query_groq(query, context):
    """Ask the Groq chat model *query*, grounded on the retrieved
    *context* text, and return the assistant's reply string."""
    system_msg = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions based on the given context.",
    }
    user_msg = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {query}",
    }
    completion = client.chat.completions.create(
        messages=[system_msg, user_msg],
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
# ---- Streamlit UI (script re-runs top-to-bottom on every interaction) ----
st.title("RAG PDF Q&A App (Groq + FAISS)")

uploaded_file = st.file_uploader("Upload your PDF", type="pdf")
query = st.text_input("Ask a question about the PDF")

if uploaded_file:
    # BUG FIX: the original wrote the upload to a NamedTemporaryFile and
    # re-read it *by name inside the `with` block*, before buffered writes
    # were flushed — risking a truncated/empty read — and the delete=False
    # file was never removed.  PdfReader accepts a file-like object, so we
    # parse the Streamlit upload directly with no temp file at all.
    text = extract_text_from_pdf(uploaded_file)
    chunks = chunk_text(text)
    embeddings = embed_chunks(chunks)
    index = create_faiss_index(np.array(embeddings))

    if query:
        query_embedding = embed_model.encode([query])
        # BUG FIX: with fewer than 3 chunks, k=3 makes FAISS pad the
        # result with -1, and chunks[-1] would silently repeat the last
        # chunk — clamp k to the number of chunks instead.
        k = min(3, len(chunks))
        distances, indices = index.search(np.array(query_embedding), k=k)
        context = "\n\n".join(chunks[i] for i in indices[0])
        response = query_groq(query, context)
        st.subheader("Answer:")
        st.write(response)