RizwanSajad commited on
Commit
67b6dfd
·
verified ·
1 Parent(s): 73b110f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -45
app.py CHANGED
@@ -3,82 +3,95 @@ import streamlit as st
3
  import numpy as np
4
  import faiss
5
  from groq import Groq
6
- from pydrive.auth import GoogleAuth
7
- from pydrive.drive import GoogleDrive
8
  from sentence_transformers import SentenceTransformer
 
9
 
10
  # Constants
11
  DRIVE_FILE_LINK = "https://drive.google.com/file/d/1kYGomSibXW-wCFptEMcWP12jOz1390OK/view?usp=drive_link"
12
  GROQ_MODEL = "llama-3.3-70b-versatile"
13
 
14
- # Authentication and setup for Google Drive
15
- @st.cache_resource
16
- def load_drive_content(file_link):
 
 
 
17
  gauth = GoogleAuth()
18
  gauth.LocalWebserverAuth()
19
  drive = GoogleDrive(gauth)
20
- file_id = file_link.split('/d/')[1].split('/view')[0]
21
- downloaded_file = drive.CreateFile({'id': file_id})
 
22
  downloaded_file.GetContentFile("document.pdf")
23
  return "document.pdf"
24
 
25
- # Chunking and embedding creation
26
- @st.cache_resource
27
- def prepare_embeddings(document_path):
28
- from PyPDF2 import PdfReader
29
-
30
- reader = PdfReader(document_path)
31
- text = ""
32
- for page in reader.pages:
33
- text += page.extract_text()
34
 
35
- # Create chunks of 500 characters with a sliding window of 200
36
- chunk_size = 500
37
- chunk_overlap = 200
38
- chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]
39
-
40
- # Embedding model
41
  embedder = SentenceTransformer("all-MiniLM-L6-v2")
42
- embeddings = embedder.encode(chunks, convert_to_tensor=True).detach().numpy()
 
 
 
 
 
 
43
 
44
- # Store in FAISS
45
  vector_dim = embeddings.shape[1]
46
  index = faiss.IndexFlatL2(vector_dim)
47
  index.add(embeddings)
48
- return chunks, index
49
 
50
- # Groq setup
51
- @st.cache_resource
52
- def groq_client():
53
- return Groq(api_key=os.environ.get("GROQ_API_KEY"))
54
 
55
- # Retrieve and query vector DB
56
  def query_vector_db(query, chunks, index, embedder):
57
  query_embedding = embedder.encode([query], convert_to_tensor=True).detach().numpy()
58
- D, I = index.search(query_embedding, k=1) # Find top result
59
- if I[0][0] != -1: # Valid match
60
  return chunks[I[0][0]]
61
  return "No relevant content found."
62
 
63
- # Streamlit application
64
  def main():
65
  st.title("RAG-based Application with Groq")
66
 
67
- # Load document and prepare FAISS
68
- st.info("Loading document and preparing FAISS...")
69
- document_path = load_drive_content(DRIVE_FILE_LINK)
70
- chunks, index = prepare_embeddings(document_path)
71
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
72
- client = groq_client()
73
-
74
- # Interface
 
 
 
 
 
 
 
 
 
 
 
75
  user_input = st.text_input("Enter your query:")
76
  if user_input:
77
- context = query_vector_db(user_input, chunks, index, embedder)
78
- st.write("**Relevant Context:**", context)
 
 
 
 
79
 
80
- # Query Groq model
81
- with st.spinner("Querying Groq model..."):
 
82
  chat_completion = client.chat.completions.create(
83
  messages=[
84
  {"role": "user", "content": f"Based on this context: {context}, {user_input}"}
 
3
  import numpy as np
4
  import faiss
5
  from groq import Groq
 
 
6
  from sentence_transformers import SentenceTransformer
7
+ from PyPDF2 import PdfReader
8
 
9
  # Constants
10
  DRIVE_FILE_LINK = "https://drive.google.com/file/d/1kYGomSibXW-wCFptEMcWP12jOz1390OK/view?usp=drive_link"
11
  GROQ_MODEL = "llama-3.3-70b-versatile"
12
 
13
# Download the document
def download_document(file_link):
    """Download the PDF behind a Google Drive share link to ``document.pdf``.

    Args:
        file_link: A Drive share URL of the form
            ``https://drive.google.com/file/d/<id>/view...``.

    Returns:
        str: The local path of the downloaded file (``"document.pdf"``).

    Raises:
        ValueError: If *file_link* is not a recognizable Drive share URL.
    """
    # pydrive is imported lazily so the rest of the app loads without it.
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive

    st.info("Authenticating with Google Drive...")
    gauth = GoogleAuth()
    # NOTE(review): LocalWebserverAuth opens a browser on the machine running
    # Streamlit — this only works for local runs; confirm for deployments.
    gauth.LocalWebserverAuth()
    drive = GoogleDrive(gauth)

    # Parse the file id defensively: the original chained split() calls raise
    # an opaque IndexError when the link does not match the expected shape.
    try:
        file_id = file_link.split("/d/")[1].split("/view")[0]
    except IndexError:
        raise ValueError(f"Unrecognized Google Drive link: {file_link!r}")

    downloaded_file = drive.CreateFile({"id": file_id})
    downloaded_file.GetContentFile("document.pdf")
    return "document.pdf"
27
 
28
# Chunk the text
def chunk_text(text, chunk_size=500, chunk_overlap=200):
    """Split *text* into overlapping fixed-size character chunks.

    Consecutive chunks share ``chunk_overlap`` characters so sentences cut at
    a boundary still appear whole in at least one chunk.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        list[str]: The chunks, in document order (empty list for empty text).

    Raises:
        ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``
            (otherwise the window would never advance).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
 
 
 
 
 
32
 
33
# Create embeddings and store in FAISS
def create_vector_database(chunks):
    """Embed *chunks* and index them in an in-memory FAISS L2 index.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        faiss.IndexFlatL2: An exact L2 nearest-neighbor index over the chunk
        embeddings; row i of the index corresponds to ``chunks[i]``.

    Raises:
        ValueError: If *chunks* is empty (nothing to index).
    """
    if not chunks:
        raise ValueError("chunks must be non-empty")

    st.info("Creating embeddings...")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode in batches of 100 to bound peak memory. convert_to_numpy avoids
    # a device-bound tensor: .numpy() on a CUDA tensor raises, whereas this
    # works on CPU and GPU alike.
    batches = []
    for start in range(0, len(chunks), 100):
        batch = chunks[start:start + 100]
        batches.append(embedder.encode(batch, convert_to_numpy=True))
    embeddings = np.vstack(batches)

    st.info("Initializing FAISS vector database...")
    # FAISS requires float32, C-contiguous input.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
 
 
 
51
 
52
# Query the vector database
def query_vector_db(query, chunks, index, embedder):
    """Return the chunk nearest to *query*, or a fallback message.

    Args:
        query: The user's free-text question.
        chunks: The chunk list the index was built from (row i -> chunks[i]).
        index: A FAISS index supporting ``search``.
        embedder: A SentenceTransformer-like object supporting ``encode``.

    Returns:
        str: The best-matching chunk, or ``"No relevant content found."`` when
        the index yields no valid neighbor.
    """
    # convert_to_numpy keeps the embedding on CPU regardless of model device;
    # the original .detach().numpy() fails for CUDA-resident tensors.
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    # FAISS requires float32, C-contiguous input.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    _, neighbor_ids = index.search(query_embedding, k=1)  # top-1 match
    best = neighbor_ids[0][0]
    if best != -1:  # FAISS reports -1 when no neighbor exists
        return chunks[best]
    return "No relevant content found."
59
 
60
+ # Main Streamlit App
61
  def main():
62
  st.title("RAG-based Application with Groq")
63
 
64
+ # Step 1: Load Document
65
+ if st.button("Download and Load Document"):
66
+ document_path = download_document(DRIVE_FILE_LINK)
67
+ reader = PdfReader(document_path)
68
+ text = "".join([page.extract_text() for page in reader.pages])
69
+ chunks = chunk_text(text)
70
+ st.success("Document loaded and chunked!")
71
+ st.session_state["chunks"] = chunks
72
+
73
+ # Step 2: Create Vector Database
74
+ if st.button("Create Vector Database"):
75
+ if "chunks" not in st.session_state:
76
+ st.error("Please load the document first!")
77
+ else:
78
+ index = create_vector_database(st.session_state["chunks"])
79
+ st.session_state["index"] = index
80
+ st.success("Vector database created successfully!")
81
+
82
+ # Step 3: Query
83
  user_input = st.text_input("Enter your query:")
84
  if user_input:
85
+ if "index" not in st.session_state or "chunks" not in st.session_state:
86
+ st.error("Please load the document and create the vector database first!")
87
+ else:
88
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
89
+ context = query_vector_db(user_input, st.session_state["chunks"], st.session_state["index"], embedder)
90
+ st.write("**Relevant Context:**", context)
91
 
92
+ # Query Groq model
93
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
94
+ st.info("Querying Groq model...")
95
  chat_completion = client.chat.completions.create(
96
  messages=[
97
  {"role": "user", "content": f"Based on this context: {context}, {user_input}"}