File size: 2,748 Bytes
883d858
 
 
 
48dcc48
883d858
 
 
 
48dcc48
883d858
 
 
48dcc48
883d858
 
 
 
 
 
 
 
 
 
 
 
 
 
48dcc48
 
 
 
883d858
 
 
48dcc48
883d858
 
 
 
 
 
 
48dcc48
 
 
883d858
 
48dcc48
 
 
 
883d858
48dcc48
883d858
48dcc48
883d858
 
 
 
48dcc48
883d858
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
import docx
import streamlit as st

# ===================== Groq API Key =====================
# Read the API key from the environment. The literal fallback
# "your_key_here" is a placeholder and will fail authentication —
# NOTE(review): consider failing fast with a clear error when the
# env var is unset, instead of sending a dummy key to the API.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_key_here")
# Module-level client reused by the query section below.
client = Groq(api_key=GROQ_API_KEY)

# ===================== Helper Functions =====================
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: A file-like object (or path) accepted by ``PyPDF2.PdfReader``,
            e.g. a Streamlit ``UploadedFile``.

    Returns:
        str: Text of all pages concatenated in page order. Pages with no
        extractable text layer (e.g. scanned images) contribute nothing.
    """
    pdf = PdfReader(file)
    text = ""
    for page in pdf.pages:
        # extract_text() may return None for image-only pages; without the
        # `or ""` guard, `text += None` raises TypeError and the whole
        # upload fails on the first scanned page.
        text += page.extract_text() or ""
    return text

def read_docx(file):
    """Return the full text of a DOCX document, one paragraph per line.

    Args:
        file: A file-like object (or path) accepted by ``docx.Document``.

    Returns:
        str: Every paragraph's text followed by a newline, in document order.
    """
    document = docx.Document(file)
    # Equivalent to appending each paragraph's text plus "\n" in a loop,
    # but built in a single pass with join.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

# ===================== Streamlit UI =====================
# Top-level script: Streamlit re-executes everything below on each
# interaction (upload, text input), so all steps run per rerun.
st.set_page_config(page_title="📄 RAG App with Groq", layout="wide")
st.title("📄 RAG App with Groq (Open-Source Embeddings)")

# Accepts a single document; `type` restricts the picker to these extensions.
uploaded_file = st.file_uploader("Upload a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Extract text — dispatch on the browser-reported MIME type.
    if uploaded_file.type == "application/pdf":
        raw_text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        raw_text = read_docx(uploaded_file)
    else:
        # Fallback branch handles .txt uploads; assumes UTF-8 encoding —
        # NOTE(review): a non-UTF-8 file will raise UnicodeDecodeError here.
        raw_text = uploaded_file.read().decode("utf-8")

    # Split text into overlapping chunks (500 chars, 50-char overlap)
    # sized for the embedding model below.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_text(raw_text)
    st.success(f"Document loaded and split into {len(chunks)} chunks.")

    # ===================== Open-Source Embeddings & FAISS =====================
    # NOTE(review): embedding + index build runs on every Streamlit rerun
    # (including each query); consider st.cache_resource/session_state
    # to avoid re-embedding an unchanged document.
    st.info("Embedding chunks for retrieval using open-source embeddings...")
    hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    faiss_index = FAISS.from_texts(chunks, hf_embeddings)

    # ===================== Query Section =====================
    query = st.text_input("Ask something about the document:")

    if query:
        # Retrieve the 3 most similar chunks and concatenate them as context.
        docs = faiss_index.similarity_search(query, k=3)
        context = "\n".join([doc.page_content for doc in docs])

        # Groq LLM for answer generation — context and question are stuffed
        # into a single user message.
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Answer the following question using the context below:\nContext:\n{context}\n\nQuestion:\n{query}"}
            ]
        )
        # First (and only) completion choice holds the answer text.
        answer = response.choices[0].message.content
        st.markdown(f"**Answer:** {answer}")