# rag/app.py — Streamlit RAG demo: Groq LLM + open-source sentence-transformer embeddings.
import os
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
import docx
import streamlit as st
# ===================== Groq API Key =====================
# Read the API key from the environment. "your_key_here" is only a
# placeholder fallback and will cause authentication failures at request
# time if GROQ_API_KEY is unset.
# NOTE(review): avoid shipping placeholder secrets — TODO confirm the
# deployment environment always provides GROQ_API_KEY.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_key_here")
# Module-level Groq client, reused by the query handler below.
client = Groq(api_key=GROQ_API_KEY)
# ===================== Helper Functions =====================
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: A binary file-like object (e.g. a Streamlit ``UploadedFile``)
            readable by PyPDF2's ``PdfReader``.

    Returns:
        str: Text of all pages concatenated in order. Pages with no
        extractable text layer (e.g. scanned images) contribute nothing.
    """
    reader = PdfReader(file)
    # extract_text() may return None for pages without a text layer; the
    # previous `text += page.extract_text()` raised TypeError on such pages.
    # str.join also avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
def read_docx(file):
    """Return the full text of a .docx document, one paragraph per line."""
    document = docx.Document(file)
    # Emit each paragraph followed by a newline — identical output to the
    # original accumulate-and-append loop, including the trailing newline.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
# ===================== Streamlit UI =====================
# Top-level Streamlit script: upload a document, chunk + embed it into a
# FAISS index, then answer questions using retrieved chunks as context.
st.set_page_config(page_title="📄 RAG App with Groq", layout="wide")
st.title("📄 RAG App with Groq (Open-Source Embeddings)")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"]
)

if uploaded_file:
    # Extract raw text according to the browser-reported MIME type.
    if uploaded_file.type == "application/pdf":
        raw_text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        raw_text = read_docx(uploaded_file)
    else:
        # Fallback branch handles the TXT case (decoded as UTF-8).
        raw_text = uploaded_file.read().decode("utf-8")

    # Split text into overlapping chunks so each fits the embedding model
    # and retrieval stays fine-grained.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(raw_text)
    if not chunks:
        # Guard: FAISS.from_texts fails on an empty list (e.g. a scanned
        # PDF with no text layer yields no chunks).
        st.warning("No text could be extracted from the document.")
        st.stop()
    st.success(f"Document loaded and split into {len(chunks)} chunks.")

    # ===================== Open-Source Embeddings & FAISS =====================
    st.info("Embedding chunks for retrieval using open-source embeddings...")
    # NOTE(review): this re-embeds the whole document on every Streamlit
    # rerun (every widget interaction) — consider caching the index with
    # st.cache_resource keyed on the uploaded file.
    hf_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    faiss_index = FAISS.from_texts(chunks, hf_embeddings)

    # ===================== Query Section =====================
    query = st.text_input("Ask something about the document:")
    if query:
        # Retrieve the 3 chunks most similar to the query as grounding context.
        docs = faiss_index.similarity_search(query, k=3)
        context = "\n".join(doc.page_content for doc in docs)

        # Groq LLM for answer generation, grounded on the retrieved context.
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"Answer the following question using the context below:\nContext:\n{context}\n\nQuestion:\n{query}",
                },
            ],
        )
        answer = response.choices[0].message.content
        st.markdown(f"**Answer:** {answer}")