"""Streamlit RAG app.

Upload a PDF/DOCX/TXT document, split it into chunks, embed the chunks with
open-source sentence-transformers, index them in FAISS, and answer user
questions with a Groq-hosted LLM grounded in the retrieved context.
"""

import os

import docx
import streamlit as st
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader

# ===================== Groq API Key =====================
# Read the key from the environment; the placeholder fallback keeps the
# script importable but API calls will fail until a real key is supplied.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_key_here")
client = Groq(api_key=GROQ_API_KEY)


# ===================== Helper Functions =====================
def read_pdf(file) -> str:
    """Extract and concatenate the text of every page of a PDF file-like object.

    Bug fix: PyPDF2's ``extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); coerce to "" so concatenation
    does not raise ``TypeError``.
    """
    reader = PdfReader(file)
    return "".join((page.extract_text() or "") for page in reader.pages)


def read_docx(file) -> str:
    """Return the text of a DOCX file-like object, one paragraph per line."""
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)


@st.cache_resource(show_spinner=False)
def _load_embeddings() -> HuggingFaceEmbeddings:
    """Load the sentence-transformers embedding model once per process.

    Streamlit reruns the whole script on every widget interaction; caching
    avoids re-initializing (and potentially re-downloading) the model each time.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


# ===================== Streamlit UI =====================
st.set_page_config(page_title="📄 RAG App with Groq", layout="wide")
st.title("📄 RAG App with Groq (Open-Source Embeddings)")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"]
)

if uploaded_file:
    # Extract text based on the browser-reported MIME type.
    if uploaded_file.type == "application/pdf":
        raw_text = read_pdf(uploaded_file)
    elif uploaded_file.type == (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ):
        raw_text = read_docx(uploaded_file)
    else:
        # TXT fallback; assumes UTF-8 encoding — TODO confirm for other inputs.
        raw_text = uploaded_file.read().decode("utf-8")

    # Split text into overlapping chunks sized for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(raw_text)

    st.success(f"Document loaded and split into {len(chunks)} chunks.")

    # ===================== Open-Source Embeddings & FAISS =====================
    st.info("Embedding chunks for retrieval using open-source embeddings...")
    hf_embeddings = _load_embeddings()
    faiss_index = FAISS.from_texts(chunks, hf_embeddings)

    # ===================== Query Section =====================
    query = st.text_input("Ask something about the document:")
    if query:
        # Retrieve the 3 most similar chunks and stitch them into a context.
        docs = faiss_index.similarity_search(query, k=3)
        context = "\n".join(doc.page_content for doc in docs)

        # Groq LLM for answer generation, grounded in the retrieved context.
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": (
                        "Answer the following question using the context below:"
                        f"\nContext:\n{context}\n\nQuestion:\n{query}"
                    ),
                },
            ],
        )
        answer = response.choices[0].message.content
        st.markdown(f"**Answer:** {answer}")