Spaces:

khababakhtar
/

Load-Balancing-App

Sleeping

App Files Files Community

khababakhtar committed on Jan 1, 2025

Commit

0b21087

verified ·

1 Parent(s): 6c61daf

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -59

app.py CHANGED Viewed

@@ -1,78 +1,84 @@
 import os
-import re
-import tempfile
-import pytesseract
-from pdf2image import convert_from_path
 import numpy as np
 import faiss
-from groq import Groq
 import requests
 import streamlit as st
-# Initialize Groq client
-groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-# Function to download and process Google Drive PDF
-def extract_text_from_pdf(download_url):
-    response = requests.get(download_url)
-    if response.status_code == 200:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
-            temp_pdf.write(response.content)
-            temp_pdf.close()
-            images = convert_from_path(temp_pdf.name)
-            text = ""
-            for image in images:
-                text += pytesseract.image_to_string(image)
-            return text
-    else:
-        raise ValueError("Failed to download the PDF from the provided link.")
-# Preprocess text into chunks
-def preprocess_text(text, chunk_size=512):
-    text = re.sub(r"\s+", " ", text)
     words = text.split()
-    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
     return chunks
-# Store chunks in FAISS
 def store_chunks_in_faiss(chunks):
-    vector_dim = 768  # Assume embeddings are 768-dimensional
     index = faiss.IndexFlatL2(vector_dim)
-    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")  # Dummy embeddings
     index.add(embeddings)
     return index
-# Query Groq API
-def query_groq_model(prompt):
-    chat_completion = groq_client.chat.completions.create(
-        messages=[{"role": "user", "content": prompt}],
-        model="llama-3.3-70b-versatile",
-    )
-    return chat_completion.choices[0].message.content
-# Streamlit frontend
-st.title("RAG-Based Application")
-drive_url = st.text_input("Enter Google Drive File URL:")
-query = st.text_input("Enter your query:")
-if st.button("Process"):
-    if drive_url and query:
-        try:
-            # Extract file ID from Google Drive URL
-            file_id = drive_url.split("/d/")[1].split("/")[0]
-            download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
-            with st.spinner("Processing document..."):
-                document_text = extract_text_from_pdf(download_url)
-                chunks = preprocess_text(document_text)
-                index = store_chunks_in_faiss(chunks)
-                st.success("Document processed and stored in vector database.")
-            with st.spinner("Querying model..."):
-                response = query_groq_model(query)
-                st.write("Model Response:")
-                st.write(response)
-        except Exception as e:
-            st.error(f"An error occurred: {e}")
     else:
-        st.error("Please provide both Google Drive File URL and query.")

 import os
 import numpy as np
 import faiss
+import pytesseract
+from pdf2image import convert_from_path
 import requests
 import streamlit as st
+from groq import Groq
+# Set up Groq client
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+# Function to extract text from PDF
+def extract_text_from_pdf(pdf_path):
+    images = convert_from_path(pdf_path)
+    text = ""
+    for page in images:
+        text += pytesseract.image_to_string(page)
+    return text
+# Function to chunk the text
+def create_chunks(text, chunk_size=200):
     words = text.split()
+    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
     return chunks
+# Function to store chunks in FAISS (GPU enabled)
 def store_chunks_in_faiss(chunks):
+    vector_dim = 768  # Assuming embeddings are 768-dimensional
     index = faiss.IndexFlatL2(vector_dim)
+    # Move index to GPU 0 (no availability check here; needs a GPU-enabled FAISS build)
+    res = faiss.StandardGpuResources()
+    index = faiss.index_cpu_to_gpu(res, 0, index)
+    # Generate dummy embeddings for demonstration
+    embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
     index.add(embeddings)
     return index
+# Check whether FAISS reports at least one visible GPU
+def is_gpu_available():
+    return faiss.get_num_gpus() > 0
+# Streamlit app interface
+st.title("PDF Content Chunking and Retrieval with FAISS-GPU")
+# PDF upload
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+if uploaded_file:
+    st.write("Processing the uploaded file...")
+    with open("uploaded_file.pdf", "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    # Extract text
+    extracted_text = extract_text_from_pdf("uploaded_file.pdf")
+    st.text_area("Extracted Text", extracted_text, height=200)
+    # Chunk text
+    st.write("Creating chunks...")
+    chunks = create_chunks(extracted_text)
+    st.write(f"Total chunks created: {len(chunks)}")
+    # Store chunks in FAISS
+    st.write("Storing chunks in FAISS...")
+    index = store_chunks_in_faiss(chunks)
+    if is_gpu_available():
+        st.success("FAISS is using GPU resources!")
     else:
+        st.warning("FAISS is running on CPU.")
+    st.write("Chunks successfully stored in the FAISS index!")
+# Interaction with Groq
+user_input = st.text_input("Ask a question about the content:")
+if user_input:
+    st.write("Sending query to Groq API...")
+    response = client.chat.completions.create(
+        messages=[{"role": "user", "content": user_input}],
+        model="llama-3.3-70b-versatile"
+    )
+    st.text_area("Groq API Response", response.choices[0].message.content, height=100)