# engrrifatullah's picture
# Update app.py
# 1ff4cc0 verified
import os
from io import BytesIO
from urllib.parse import urlparse, parse_qs

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer
# Initialize the embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Groq API
# SECURITY: the key is read from the GROQ_API_KEY environment variable instead
# of being committed to source control. Any key that was previously hard-coded
# in this file must be treated as leaked and rotated.
API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=API_KEY)

# Predefined Google Drive links
STORED_LINKS = [
    "https://drive.google.com/file/d/1zHtEpoEZv_3BhEDhQKkf1D1vya2jzyAd/view?usp=sharing",
    "https://drive.google.com/file/d/1xnRgDFGGV723Bgddf8KE9quwzpllgxyD/view?usp=sharing",
]
# Helper function to extract file ID from Google Drive URL
def extract_drive_file_id(url):
    """Return the Google Drive file ID embedded in *url*, or ``None``.

    Recognised link shapes, checked in this order:

    1. An ``id`` query parameter (``.../uc?id=<ID>``).
    2. A ``/d/<ID>`` path pair (``.../file/d/<ID>/view``), wherever it sits
       in the path, so account-prefixed links such as ``/u/0/file/d/<ID>``
       resolve to the ID as well.
    3. The third path segment, kept as a positional fallback so links that
       previously resolved keep resolving.

    Args:
        url: The link to inspect.

    Returns:
        The file ID string, or ``None`` when *url* is not a non-empty string,
        does not point at ``drive.google.com``, or carries no ID.
    """
    if not isinstance(url, str) or not url:
        return None

    parsed_url = urlparse(url)
    # Compare the actual host rather than searching for a substring, so hosts
    # like "drive.google.com.example.org" are not accepted.
    host = (parsed_url.hostname or "").lower()
    if host != "drive.google.com" and not host.endswith(".drive.google.com"):
        return None

    query_id = parse_qs(parsed_url.query).get("id", [None])[0]
    if query_id:
        return query_id

    segments = parsed_url.path.split("/")
    if "d" in segments:
        # "/file/d/<ID>/view": the ID is the segment right after "d".
        after_d = segments.index("d") + 1
        if after_d < len(segments) and segments[after_d]:
            return segments[after_d]
        return None

    # Positional fallback; guarded so short paths yield None, not IndexError.
    if len(segments) > 3 and segments[3]:
        return segments[3]
    return None
# Helper function to download PDF from Google Drive
def download_pdf_from_drive(file_id, timeout=30):
    """Download a Google Drive file and return its bytes as a ``BytesIO``.

    Args:
        file_id: Google Drive file ID.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        An in-memory ``BytesIO`` positioned at the start of the PDF data.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response body does not look like a PDF.
    """
    response = requests.get(
        "https://drive.google.com/uc",
        # Passing params lets requests URL-encode the ID.
        params={"id": file_id, "export": "download"},
        timeout=timeout,
    )
    response.raise_for_status()

    content = response.content
    # A 200 response is not necessarily the file itself (e.g. an HTML page
    # returned in its place). Fail here with a clear message instead of
    # letting the PDF parser fail later with an obscure one.
    if b"%PDF" not in content[:1024]:
        raise ValueError(
            f"Google Drive did not return a PDF for file id {file_id!r}; "
            "check that the file is a PDF and is shared publicly."
        )
    return BytesIO(content)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, joined by single spaces.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``pdfplumber.open``.

    Returns:
        The concatenated page texts; pages for which ``extract_text()``
        returns nothing are skipped. Empty string if no page yields text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page exactly once; the generator is consumed by join()
        # while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return ' '.join(text for text in page_texts if text)
# Function to create embeddings and store them in FAISS
def create_embeddings(text, chunk_size=500):
    """Split *text* into fixed-size character chunks and index their embeddings.

    Args:
        text: The full document text to index. Must be non-empty.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter). Defaults to 500.

    Returns:
        A ``(chunks, embeddings, index)`` tuple: the list of chunk strings,
        their float32 embedding matrix (one row per chunk), and a
        ``faiss.IndexFlatL2`` holding those rows in the same order.

    Raises:
        ValueError: If *text* is empty or *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        # With no chunks there is no embedding dimension to size the index.
        raise ValueError("text must be non-empty to build an embedding index")

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # FAISS works on float32 matrices; make the dtype explicit.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index
# Function to find the most relevant chunk for the user's question
def get_relevant_chunk(question, embeddings, index, chunks):
    """Return the chunk whose embedding is nearest to *question*.

    Args:
        question: The user's question.
        embeddings: Unused; kept so existing callers keep working.
        index: FAISS index whose row order matches *chunks*.
        chunks: The chunk strings that were indexed.

    Returns:
        The single best-matching chunk string.

    Raises:
        IndexError: If the index returns no usable neighbour (for example,
            it is empty) or a position outside *chunks*.
    """
    question_embedding = embed_model.encode([question])
    query = np.asarray(question_embedding, dtype=np.float32)
    _, neighbours = index.search(query, 1)  # Retrieve top 1 chunk
    best = int(neighbours[0][0])
    # FAISS uses -1 as the label when it has no neighbour to report. Indexing
    # with it would silently return the last chunk, so reject it explicitly.
    if best < 0 or best >= len(chunks):
        raise IndexError("no relevant chunk found for the question")
    return chunks[best]
# Function to get the model's response from Groq API
def get_answer_from_groq(question, context, model="llama3-8b-8192"):
    """Ask the Groq chat-completions API to answer *question* from *context*.

    Args:
        question: The user's question.
        context: Text the model is told to base its answer on.
        model: Groq model ID to query. Defaults to the ID this app has
            always used.
            NOTE(review): hosted model IDs can be retired by the provider;
            confirm the default is still available.

    Returns:
        The content of the first choice in the completion response.
    """
    prompt = (
        f"Answer the following question based on the context:\n"
        f"Context: {context}\nQuestion: {question}"
    )
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content
# Streamlit app
def _gather_document_text(links):
    """Download each link, extract its text, and report progress in the UI.

    Returns:
        A ``(text, failures)`` tuple: the extracted texts joined by single
        spaces, and the number of links that yielded no text.
    """
    texts = []
    failures = 0
    for link in links:
        try:
            file_id = extract_drive_file_id(link)
            if not file_id:
                st.warning(f"⚠️ Invalid link: {link}")
                failures += 1
                continue
            st.write(f"📥 Processing document: {link}")
            pdf_file = download_pdf_from_drive(file_id)
            text = extract_text_from_pdf(pdf_file)
        except Exception as e:
            # UI boundary: report the failure and carry on with other links.
            st.error(f"❌ Failed to process link: {link}. Error: {e}")
            failures += 1
            continue
        if text:
            texts.append(text)
        else:
            st.warning(f"⚠️ No extractable text found in: {link}")
            failures += 1
    # Join with a separator so the last word of one document does not fuse
    # with the first word of the next.
    return " ".join(texts), failures


def main():
    """Render the Streamlit page: index the stored PDFs and answer questions.

    NOTE(review): Streamlit re-runs this function on every interaction, so
    the documents are downloaded and embedded again for each question.
    Consider caching the index if that becomes a problem.
    """
    st.set_page_config(page_title="Google Drive RAG App", page_icon="📄", layout="centered")
    st.markdown("<h1 style='text-align: center;'>Google Drive RAG Application</h1>", unsafe_allow_html=True)
    st.write("Processing predefined document links from Google Drive to generate embeddings stored in a FAISS index.")

    # Process predefined links
    all_text, failures = _gather_document_text(STORED_LINKS)
    if not all_text:
        st.error("❌ No text could be extracted from the configured documents.")
        return
    if failures:
        # Only claim full success when every link actually produced text.
        st.warning(f"⚠️ {failures} of {len(STORED_LINKS)} documents could not be processed.")
    else:
        st.success("✅ All documents processed successfully!")

    # Create embeddings
    st.write("🔄 Creating embeddings...")
    chunks, embeddings, index = create_embeddings(all_text)
    st.success("✅ Embeddings created and stored in FAISS index!")

    # Question section
    question = st.text_input("Ask a question based on the uploaded documents:")
    if not question or not question.strip():
        return
    try:
        relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
        st.write("🔄 Retrieving the answer...")
        answer = get_answer_from_groq(question, relevant_chunk)
    except Exception as e:
        # Show API or retrieval failures in the page, not as a traceback.
        st.error(f"❌ Failed to retrieve an answer. Error: {e}")
        return
    st.subheader("Answer:")
    st.write(answer)


if __name__ == "__main__":
    main()