RizwanSajad committed on
Commit
73b110f
·
verified ·
1 Parent(s): 3166986

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -75
app.py CHANGED
@@ -1,94 +1,91 @@
1
  import os
2
- import json
3
- import faiss
4
- import numpy as np
5
- import PyPDF2
6
- import requests
7
  import streamlit as st
 
 
8
  from groq import Groq
 
 
 
9
 
10
  # Constants
11
- PDF_URL = "https://drive.google.com/uc?export=download&id=1YWX-RYxgtcKO1QETnz1N3rboZUhRZwcH"
12
- VECTOR_DIM = 768
13
- CHUNK_SIZE = 512
14
 
15
- # Function to download and extract text from the PDF
16
- def extract_text_from_pdf(url):
17
- response = requests.get(url)
18
- with open("document.pdf", "wb") as f:
19
- f.write(response.content)
 
 
 
 
 
20
 
21
- with open("document.pdf", "rb") as f:
22
- reader = PyPDF2.PdfReader(f)
23
- text = "\n".join(page.extract_text() for page in reader.pages)
24
- return text
 
 
 
 
 
25
 
26
- # Function to split text into chunks
27
- def create_chunks(text, chunk_size):
28
- words = text.split()
29
- chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
30
- return chunks
 
 
 
31
 
32
- # Function to create FAISS vector store
33
- def create_faiss_index(chunks, vector_dim):
34
- # Check if GPU is available and use it
35
- if faiss.get_num_gpus() > 0:
36
- st.write("Using GPU for FAISS indexing.")
37
- resource = faiss.StandardGpuResources() # Initialize GPU resources
38
- index_flat = faiss.IndexFlatL2(vector_dim)
39
- index = faiss.index_cpu_to_gpu(resource, 0, index_flat)
40
- else:
41
- st.write("Using CPU for FAISS indexing.")
42
- index = faiss.IndexFlatL2(vector_dim)
43
-
44
- embeddings = np.random.rand(len(chunks), vector_dim).astype('float32') # Replace with real embeddings
45
  index.add(embeddings)
46
- return index, embeddings
47
 
48
- # Initialize Groq API client
49
- def get_groq_client():
50
- return os.environ.get("GROQ_API_KEY")
 
51
 
52
- # Query Groq model
53
- def query_model(client, question):
54
- chat_completion = client.chat.completions.create(
55
- messages=[{"role": "user", "content": question}],
56
- model="llama-3.3-70b-versatile",
57
- )
58
- return chat_completion.choices[0].message.content
59
 
60
- # Streamlit app
61
  def main():
62
- st.title("RAG-Based Application")
63
-
64
- # Step 1: Extract text from the document
65
- st.header("Step 1: Extract Text")
66
- if st.button("Extract Text from PDF"):
67
- text = extract_text_from_pdf(PDF_URL)
68
- st.session_state["text"] = text
69
- st.success("Text extracted successfully!")
70
-
71
- # Step 2: Chunk the text
72
- st.header("Step 2: Create Chunks")
73
- if "text" in st.session_state and st.button("Create Chunks"):
74
- chunks = create_chunks(st.session_state["text"], CHUNK_SIZE)
75
- st.session_state["chunks"] = chunks
76
- st.success(f"Created {len(chunks)} chunks.")
77
 
78
- # Step 3: Create FAISS index
79
- st.header("Step 3: Create Vector Database")
80
- if "chunks" in st.session_state and st.button("Create Vector Database"):
81
- index, embeddings = create_faiss_index(st.session_state["chunks"], VECTOR_DIM)
82
- st.session_state["index"] = index
83
- st.success("FAISS vector database created.")
 
 
 
 
 
 
84
 
85
- # Step 4: Ask a question
86
- st.header("Step 4: Query the Model")
87
- question = st.text_input("Ask a question about the document:")
88
- if question and "index" in st.session_state:
89
- client = get_groq_client()
90
- answer = query_model(client, question)
91
- st.write("Answer:", answer)
 
 
92
 
93
  if __name__ == "__main__":
94
  main()
 
1
  import os
 
 
 
 
 
2
  import streamlit as st
3
+ import numpy as np
4
+ import faiss
5
  from groq import Groq
6
+ from pydrive.auth import GoogleAuth
7
+ from pydrive.drive import GoogleDrive
8
+ from sentence_transformers import SentenceTransformer
9
 
10
  # Constants
11
+ DRIVE_FILE_LINK = "https://drive.google.com/file/d/1kYGomSibXW-wCFptEMcWP12jOz1390OK/view?usp=drive_link"
12
+ GROQ_MODEL = "llama-3.3-70b-versatile"
 
13
 
14
# Authentication and setup for Google Drive
@st.cache_resource
def load_drive_content(file_link):
    """Download the PDF behind a Google Drive share link to ./document.pdf.

    Runs PyDrive's local-webserver OAuth flow, extracts the file id from the
    share link, and fetches the file contents. Wrapped in st.cache_resource so
    the auth flow and download happen once per Streamlit session.

    Args:
        file_link: A Drive URL containing either ``/d/<id>`` or ``?id=<id>``.

    Returns:
        The local path ("document.pdf") of the downloaded file.

    Raises:
        ValueError: If no file id can be extracted from *file_link*.
    """
    import re

    # Accept both ".../file/d/<id>/view" and ".../uc?id=<id>" link styles;
    # the previous split('/d/') parsing raised IndexError on the second form.
    match = re.search(r"(?:/d/|[?&]id=)([A-Za-z0-9_-]+)", file_link)
    if match is None:
        raise ValueError(f"Could not extract a Drive file id from: {file_link}")
    file_id = match.group(1)

    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()  # NOTE(review): opens a browser; requires a desktop session
    drive = GoogleDrive(gauth)
    downloaded_file = drive.CreateFile({'id': file_id})
    downloaded_file.GetContentFile("document.pdf")
    return "document.pdf"
24
 
25
# Chunking and embedding creation
@st.cache_resource
def prepare_embeddings(document_path):
    """Read a PDF, chunk its text, embed the chunks, and build a FAISS index.

    Cached by Streamlit so the (slow) extraction/embedding work runs once.

    Args:
        document_path: Path to a local PDF file.

    Returns:
        A ``(chunks, index)`` tuple: the list of text chunks and a
        ``faiss.IndexFlatL2`` over their embeddings, in the same order.

    Raises:
        ValueError: If the PDF yields no extractable text.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(document_path)
    # extract_text() can return None for image-only pages; coalesce to ""
    # instead of crashing with TypeError on string concatenation.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if not text.strip():
        raise ValueError(f"No extractable text found in {document_path}")

    # Create chunks of 500 characters with a sliding window of 200
    chunk_size = 500
    chunk_overlap = 200
    step = chunk_size - chunk_overlap
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)]

    # Embedding model; convert_to_numpy avoids the .detach().numpy() round
    # trip, which fails outright on CUDA tensors.
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = np.asarray(
        embedder.encode(chunks, convert_to_numpy=True), dtype="float32"
    )

    # Store in FAISS (IndexFlatL2 expects contiguous float32 rows)
    vector_dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(vector_dim)
    index.add(embeddings)
    return chunks, index
49
 
50
# Groq setup
@st.cache_resource
def groq_client():
    """Build (and cache for the session) a Groq API client.

    The API key is read from the GROQ_API_KEY environment variable; a missing
    variable yields ``api_key=None`` and the client errors on first use.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=api_key)
54
 
55
# Retrieve and query vector DB
def query_vector_db(query, chunks, index, embedder):
    """Return the chunk most similar to *query* from the FAISS index.

    Args:
        query: The user's question as a string.
        chunks: Text chunks in the same order they were added to *index*.
        index: A FAISS index supporting ``search(vectors, k)``.
        embedder: Model with an ``encode`` method (sentence-transformers API).

    Returns:
        The best-matching chunk, or "No relevant content found." when the
        index reports no valid neighbor.
    """
    # convert_to_numpy replaces .detach().numpy(), which fails on CUDA
    # tensors; FAISS requires float32 input.
    query_embedding = np.asarray(
        embedder.encode([query], convert_to_numpy=True), dtype="float32"
    )
    distances, indices = index.search(query_embedding, k=1)  # top-1 neighbor
    best = int(indices[0][0])
    # FAISS returns -1 for missing neighbors; also bounds-check defensively.
    if 0 <= best < len(chunks):
        return chunks[best]
    return "No relevant content found."
62
 
63
@st.cache_resource
def _load_embedder():
    """Load the query-embedding model once per session.

    main() re-executes on every Streamlit interaction; without caching, the
    SentenceTransformer model was re-instantiated on each rerun.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


# Streamlit application
def main():
    """Streamlit entry point: load the document, build the index, answer queries."""
    st.title("RAG-based Application with Groq")

    # Load document and prepare FAISS (both helpers are cache_resource-backed,
    # so the download/embedding work runs only once per session)
    st.info("Loading document and preparing FAISS...")
    document_path = load_drive_content(DRIVE_FILE_LINK)
    chunks, index = prepare_embeddings(document_path)
    embedder = _load_embedder()
    client = groq_client()

    # Interface
    user_input = st.text_input("Enter your query:")
    if user_input:
        context = query_vector_db(user_input, chunks, index, embedder)
        st.write("**Relevant Context:**", context)

        # Query Groq model
        with st.spinner("Querying Groq model..."):
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "user", "content": f"Based on this context: {context}, {user_input}"}
                ],
                model=GROQ_MODEL,
            )
            st.write("**Groq Model Response:**", chat_completion.choices[0].message.content)


if __name__ == "__main__":
    main()