File size: 4,648 Bytes
a0fff8d
 
8f12436
 
 
 
 
 
a1271a1
8f12436
fff1c67
1f03038
54a868c
 
 
8f12436
 
 
 
 
a0fff8d
8f12436
a0fff8d
 
 
8f12436
a0fff8d
8f12436
a1271a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0fff8d
 
 
fff1c67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1271a1
fff1c67
 
a0fff8d
 
fff1c67
a0fff8d
fff1c67
 
 
a0fff8d
 
 
 
 
 
fff1c67
 
a0fff8d
 
fff1c67
 
a1271a1
fff1c67
 
a1271a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fff1c67
 
 
 
a0fff8d
fff1c67
a0fff8d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import numpy  # Ensure NumPy is loaded first to avoid FAISS issues
import faiss  # Load FAISS after NumPy
import os
import streamlit as st
import pandas as pd
import pdfplumber
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np

# API key for Groq — read from the environment instead of hard-coding it.
# SECURITY: the original source embedded a live API key ("gsk_..."); any key
# that was committed here must be considered leaked and should be rotated.
API_KEY = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=API_KEY)

# Initialize the sentence-embedding model (weights are downloaded on first use).
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the space-joined text of every page in *pdf_file*.

    pdf_file: a path or file-like object accepted by ``pdfplumber.open``.

    pdfplumber's ``page.extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); the original code passed those
    straight into ``str.join`` and crashed with TypeError.  Such pages are
    skipped here.
    """
    with pdfplumber.open(pdf_file) as pdf:
        return ' '.join(
            page_text
            for page in pdf.pages
            if (page_text := page.extract_text())
        )

# Function to create embeddings and store them in FAISS
def create_embeddings(text, chunk_size=500):
    """Split *text* into fixed-size chunks, embed them, and index in FAISS.

    text: the document text to index.
    chunk_size: characters per chunk (default 500, matching the original
        hard-coded value — kept as a defaulted parameter for reuse).

    Returns (chunks, embeddings, index):
        chunks     — list[str] of character slices of *text*;
        embeddings — float32 ndarray of shape (n_chunks, dim);
        index      — faiss.IndexFlatL2 built over those embeddings.

    NOTE(review): an empty *text* yields zero chunks and an empty encode()
    call — behavior then depends on the encoder; callers should pass
    non-empty text.
    """
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # FAISS requires contiguous float32 input; the encoder's output dtype is
    # not guaranteed, so cast explicitly (the query side already does this).
    embeddings = np.ascontiguousarray(
        np.asarray(embed_model.encode(chunks), dtype=np.float32)
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index

# Function to find the most relevant chunk for the user's question
def get_relevant_chunk(question, embeddings, index, chunks):
    """Embed *question* and return the single nearest chunk from *chunks*.

    embeddings is accepted for interface compatibility but the lookup uses
    only *index* (the FAISS structure) and *chunks* (the source texts).
    """
    query_vec = np.array(embed_model.encode([question])).astype(np.float32)
    # Top-1 nearest-neighbour search; distances are unused.
    _distances, neighbour_ids = index.search(query_vec, 1)
    best_id = neighbour_ids[0][0]
    return chunks[best_id]

# Function to get the model's response from Groq API
def get_answer_from_groq(question, context):
    """Ask the Groq chat model to answer *question* grounded in *context*.

    Sends a single-turn user message and returns the assistant's text reply.
    """
    prompt = (
        "Answer the following question based on the context:\n"
        f"Context: {context}\n"
        f"Question: {question}"
    )
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content

# Streamlit app
def main():
    """Streamlit entry point: upload a document, index it, answer questions.

    Flow: configure the page -> accept a PDF/CSV/XLSX upload -> extract its
    text -> chunk + embed into FAISS -> answer free-text questions via Groq
    using the single most relevant chunk as context.
    """
    st.set_page_config(
        page_title="RAG Based Application",
        page_icon="📄",  # was mojibake "πŸ“„" (UTF-8 emoji mis-decoded as cp1252)
        layout="centered",
    )

    # Custom CSS for styling
    st.markdown(
        """
        <style>
        body {
            background-color: #f4f7f9;
        }
        .main-header {
            font-size: 2.5rem;
            color: #1d3557;
            text-align: center;
            margin-bottom: 1rem;
        }
        .upload-box {
            border: 2px dashed #457b9d;
            border-radius: 10px;
            padding: 1rem;
            text-align: center;
            background-color: #f1faee;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # App title and description
    st.markdown('<div class="main-header">RAG Based Application</div>', unsafe_allow_html=True)
    st.write("Upload your document (PDF, CSV, or Excel) to process and generate embeddings stored in a FAISS index.")

    # File upload section
    uploaded_file = st.file_uploader("Drag and drop or browse files", type=["pdf", "csv", "xlsx"])

    if uploaded_file:
        # Identify file type (browser-reported MIME type)
        file_type = uploaded_file.type
        st.markdown('<div class="upload-box">File Uploaded Successfully!</div>', unsafe_allow_html=True)

        # Extract content
        if file_type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif file_type in ["text/csv", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
            df = pd.read_csv(uploaded_file) if file_type == "text/csv" else pd.read_excel(uploaded_file)
            text = df.to_string()
        else:
            # Guard: browsers can report unexpected MIME types; without this
            # branch `text` would be unbound below and raise NameError.
            st.error(f"Unsupported file type: {file_type}")
            return

        # Display content
        st.subheader("Document Content:")
        st.text_area("Extracted Text", text, height=300)

        # Create embeddings
        st.write("🔄 Creating embeddings... This may take a moment.")
        chunks, embeddings, index = create_embeddings(text)
        st.success("✅ Embeddings created and stored in FAISS index!")

        # Question Section
        question = st.text_input("Ask a question based on the uploaded document:")

        if question:
            # Retrieve the most relevant chunk for the question
            relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)

            # Get the model's answer based on the relevant chunk
            st.write("🔄 Retrieving the answer...")
            answer = get_answer_from_groq(question, relevant_chunk)

            # Display the answer
            st.subheader("Answer:")
            st.write(answer)

        # Summary Section
        st.subheader("Process Summary:")
        st.write("- Uploaded file type:", file_type)
        # Report the actual chunk count; the old `len(text) // 500 + 1` was
        # off by one whenever len(text) was an exact multiple of 500.
        st.write("- Number of chunks processed:", len(chunks))

# Run the app only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()