import os
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
import docx
import streamlit as st
# ===================== Groq API Key =====================
# Read the key from the environment; "your_key_here" is only a placeholder
# default for local experimentation — it will be rejected by the real API.
# NOTE(review): never replace the placeholder with a real key in source.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_key_here")
client = Groq(api_key=GROQ_API_KEY)
# ===================== Helper Functions =====================
def read_pdf(file):
    """Extract and concatenate the text of every page of a PDF.

    Parameters:
        file: a binary file-like object (e.g. a Streamlit UploadedFile).

    Returns:
        The concatenated page text; pages with no extractable text layer
        (e.g. scanned images) contribute an empty string.
    """
    pdf = PdfReader(file)
    # PdfReader.extract_text() may return None for image-only pages;
    # `or ""` prevents the TypeError the original `text += None` would raise.
    return "".join(page.extract_text() or "" for page in pdf.pages)
def read_docx(file):
    """Return the full text of a .docx file, one paragraph per line."""
    document = docx.Document(file)
    paragraphs = [paragraph.text for paragraph in document.paragraphs]
    # The original appended "\n" after every paragraph, including the last,
    # so a trailing newline is preserved; an empty document yields "".
    return "\n".join(paragraphs) + "\n" if paragraphs else ""
# ===================== Streamlit UI =====================
st.set_page_config(page_title="📄 RAG App with Groq", layout="wide")
st.title("📄 RAG App with Groq (Open-Source Embeddings)")

uploaded_file = st.file_uploader("Upload a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Extract text based on the MIME type reported by the uploader.
    if uploaded_file.type == "application/pdf":
        raw_text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        raw_text = read_docx(uploaded_file)
    else:
        # Plain-text fallback; replace undecodable bytes instead of crashing
        # on files that are not strictly UTF-8.
        raw_text = uploaded_file.read().decode("utf-8", errors="replace")

    # Split text into overlapping chunks so retrieval can return focused passages.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(raw_text)
    st.success(f"Document loaded and split into {len(chunks)} chunks.")

    # ===================== Open-Source Embeddings & FAISS =====================
    # Streamlit reruns this entire script on every interaction — including each
    # query submission — so without caching the embedding model is reloaded and
    # the whole document re-embedded for every question. Cache the FAISS index
    # in session_state, keyed by the uploaded file, and rebuild only when a
    # different file is uploaded.
    doc_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("doc_key") != doc_key:
        st.info("Embedding chunks for retrieval using open-source embeddings...")
        hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        st.session_state["faiss_index"] = FAISS.from_texts(chunks, hf_embeddings)
        st.session_state["doc_key"] = doc_key
    faiss_index = st.session_state["faiss_index"]

    # ===================== Query Section =====================
    query = st.text_input("Ask something about the document:")
    if query:
        # Retrieve the 3 most similar chunks as grounding context.
        docs = faiss_index.similarity_search(query, k=3)
        context = "\n".join(doc.page_content for doc in docs)

        # Groq LLM generates the answer from the retrieved context.
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Answer the following question using the context below:\nContext:\n{context}\n\nQuestion:\n{query}"},
            ],
        )
        answer = response.choices[0].message.content
        st.markdown(f"**Answer:** {answer}")
|