import numpy  # Ensure NumPy is loaded first to avoid FAISS issues
import faiss # Load FAISS after NumPy
import os
import streamlit as st
import pandas as pd
import pdfplumber
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np
# Groq API key (read from the environment rather than hardcoded in source)
API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=API_KEY)
# Initialize the embedding model
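# 'all-MiniLM-L6-v2' produces 384-dimensional sentence embeddings; the index
# dimension below is read from embeddings.shape[1] rather than hardcoded.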
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() returns None for pages with no text layer, so fall back to ''
        return ' '.join(page.extract_text() or '' for page in pdf.pages)
# Function to create embeddings and store them in FAISS
def create_embeddings(text):
    # Split the text into fixed 500-character chunks
    chunks = [text[i:i+500] for i in range(0, len(text), 500)]
    embeddings = embed_model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype=np.float32))  # FAISS expects float32
    return chunks, embeddings, index
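# A minimal sketch of a cosine-similarity variant, in case ranking by angle is
# preferred over raw L2 distance. The function name is illustrative and it is
# not called anywhere in this app.
def create_embeddings_cosine(text):
    chunks = [text[i:i+500] for i in range(0, len(text), 500)]
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    faiss.normalize_L2(embeddings)  # unit-normalize in place
    index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine on unit vectors
    index.add(embeddings)
    return chunks, embeddings, index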
# Function to find the most relevant chunk for the user's question
def get_relevant_chunk(question, embeddings, index, chunks):
    question_embedding = embed_model.encode([question])
    D, I = index.search(np.array(question_embedding).astype(np.float32), 1)  # Retrieve top 1 chunk
    relevant_chunk = chunks[I[0][0]]  # The chunk corresponding to the closest embedding
    return relevant_chunk
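# A minimal top-k sketch, since a single 500-character chunk may not contain the
# whole answer. The name get_relevant_chunks and the default k=3 are illustrative
# choices, not part of the original app.
def get_relevant_chunks(question, index, chunks, k=3):
    question_embedding = embed_model.encode([question])
    D, I = index.search(np.array(question_embedding).astype(np.float32), k)
    # Join the k nearest chunks into one context string
    return "\n\n".join(chunks[i] for i in I[0])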
# Function to get the model's response from Groq API
def get_answer_from_groq(question, context):
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": f"Answer the following question based on the context:\nContext: {context}\nQuestion: {question}"}
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
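# A sketch of the same call with a system message and temperature pinned to 0,
# assuming the OpenAI-style parameters the Groq SDK exposes; illustrative only,
# not wired into the UI below.
def get_answer_from_groq_grounded(question, context):
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "Answer strictly from the provided context."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {question}"},
        ],
        model="llama3-8b-8192",
        temperature=0,  # reduce sampling variance for extractive QA
    )
    return chat_completion.choices[0].message.content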
# Streamlit app
def main():
    st.set_page_config(
        page_title="RAG Based Application",
        page_icon="📄",
        layout="centered",
    )
    # Custom CSS for styling
    st.markdown(
        """
        <style>
        body {
            background-color: #f4f7f9;
        }
        .main-header {
            font-size: 2.5rem;
            color: #1d3557;
            text-align: center;
            margin-bottom: 1rem;
        }
        .upload-box {
            border: 2px dashed #457b9d;
            border-radius: 10px;
            padding: 1rem;
            text-align: center;
            background-color: #f1faee;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
    # App title and description
    st.markdown('<div class="main-header">RAG Based Application</div>', unsafe_allow_html=True)
    st.write("Upload your document (PDF, CSV, or Excel) to process and generate embeddings stored in a FAISS index.")
    # File upload section
    uploaded_file = st.file_uploader("Drag and drop or browse files", type=["pdf", "csv", "xlsx"])
    if uploaded_file:
        # Identify file type
        file_type = uploaded_file.type
        st.markdown('<div class="upload-box">File Uploaded Successfully!</div>', unsafe_allow_html=True)
        # Extract content
        if file_type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif file_type in ["text/csv", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
            df = pd.read_csv(uploaded_file) if file_type == "text/csv" else pd.read_excel(uploaded_file)
            text = df.to_string()
        else:
            # Guard so `text` is always defined before it is used below
            st.error(f"Unsupported file type: {file_type}")
            st.stop()
        # Display content
        st.subheader("Document Content:")
        st.text_area("Extracted Text", text, height=300)
        # Create embeddings
        st.write("⏳ Creating embeddings... This may take a moment.")
        chunks, embeddings, index = create_embeddings(text)
        st.success("✅ Embeddings created and stored in FAISS index!")
        # Question Section
        question = st.text_input("Ask a question based on the uploaded document:")
        if question:
            # Retrieve the most relevant chunk for the question
            relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
            # Get the model's answer based on the relevant chunk
            st.write("🔍 Retrieving the answer...")
            answer = get_answer_from_groq(question, relevant_chunk)
            # Display the answer
            st.subheader("Answer:")
            st.write(answer)
        # Summary Section
        st.subheader("Process Summary:")
        st.write("- Uploaded file type:", file_type)
        st.write("- Number of chunks processed:", len(chunks))
# Run the app
if __name__ == "__main__":
    main()
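# To launch locally (the filename app.py is illustrative): set GROQ_API_KEY in
# the environment, then run:
#   streamlit run app.py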