Spaces:

TANVEERMAKHDOOM
/

Demo-Rag-based-app-doc

Sleeping

App Files Files Community

TANVEERMAKHDOOM committed on May 7, 2025

Commit

4bf11bb

verified ·

1 Parent(s): fd06bec

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -39

app.py CHANGED Viewed

@@ -1,23 +1,19 @@
 import os
 import requests
-from groq import Groq
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from PyPDF2 import PdfReader
-import streamlit as st
 from tempfile import NamedTemporaryFile
-# Set Groq API key (use Secrets in Hugging Face)
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-# Check for API key
-if not GROQ_API_KEY:
-    st.error("Please set the GROQ_API_KEY in the Hugging Face Space secrets.")
-    st.stop()
-client = Groq(api_key=GROQ_API_KEY)
 def extract_drive_file_id(url):
     if "drive.google.com" in url:
         parts = url.split("/file/d/")
@@ -25,25 +21,20 @@ def extract_drive_file_id(url):
             return parts[1].split("/")[0]
     return None
-def get_direct_download_link(view_url):
-    file_id = extract_drive_file_id(view_url)
-    if file_id:
-        return f"https://drive.google.com/uc?export=download&id={file_id}"
-    return None
 def download_pdf_from_url(url):
-    direct_url = get_direct_download_link(url)
-    if not direct_url:
         return None
-    response = requests.get(direct_url, allow_redirects=True)
-    if response.status_code == 200:
-        temp_file = NamedTemporaryFile(delete=False, suffix=".pdf")
-        temp_file.write(response.content)
-        temp_file.close()
-        return temp_file.name
-    else:
         return None
 def extract_text_from_pdf(pdf_file_path):
     pdf_reader = PdfReader(pdf_file_path)
     text = ""
@@ -53,12 +44,14 @@ def extract_text_from_pdf(pdf_file_path):
             text += page_text
     return text
 def chunk_text(text, chunk_size=500, chunk_overlap=50):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
     )
     return text_splitter.split_text(text)
 def create_embeddings_and_store(chunks, vector_db=None):
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     if vector_db is None:
@@ -67,6 +60,7 @@ def create_embeddings_and_store(chunks, vector_db=None):
         vector_db.add_texts(chunks)
     return vector_db
 def query_vector_db(query, vector_db):
     docs = vector_db.similarity_search(query, k=3)
     context = "\n".join([doc.page_content for doc in docs])
@@ -79,10 +73,10 @@ def query_vector_db(query, vector_db):
     )
     return chat_completion.choices[0].message.content
-# --- Streamlit UI ---
-st.set_page_config(page_title="RAG on Google Drive PDFs")
-st.title("📄 RAG-Based QA on Auto-Fetched Google Drive PDFs")
 doc_links = [
     "https://drive.google.com/file/d/0B9Ivs2CdbN04bmJhZGl3Z0VhUHc/view?usp=sharing&resourcekey=0-VGasMdtr3imjqp-Go6TrhA",
     "https://drive.google.com/file/d/0B9Ivs2CdbN04V3VhNUFrVk40M2M/view?usp=sharing&resourcekey=0-VIv15q5jcFFA6t6F45g13Q",
@@ -90,21 +84,26 @@ doc_links = [
 vector_db = None
 for idx, link in enumerate(doc_links):
-    st.write(f"🔄 Processing PDF from Link {idx + 1}...")
     pdf_path = download_pdf_from_url(link)
     if pdf_path:
-        text = extract_text_from_pdf(pdf_path)
-        chunks = chunk_text(text)
-        vector_db = create_embeddings_and_store(chunks, vector_db=vector_db)
-        st.success(f"✅ Document {idx + 1} processed.")
     else:
-        st.error(f"❌ Could not fetch document {idx + 1}.")
-user_query = st.text_input("🔍 Ask a question about the documents:")
 if user_query and vector_db:
     response = query_vector_db(user_query, vector_db)
     st.subheader("💬 Answer:")
     st.write(response)
 elif user_query:
-    st.warning("⚠️ No documents available to query yet.")

 import os
+import gdown
+import streamlit as st
 import requests
 from PyPDF2 import PdfReader
 from tempfile import NamedTemporaryFile
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from groq import Groq
+# Initialize Groq client
+client = Groq(api_key=os.environ['GROQ_API_KEY'])
+# Function to extract file ID from Google Drive URL
 def extract_drive_file_id(url):
     if "drive.google.com" in url:
         parts = url.split("/file/d/")
             return parts[1].split("/")[0]
     return None
+# Download and save PDF from Google Drive using gdown
 def download_pdf_from_url(url):
+    file_id = extract_drive_file_id(url)
+    if not file_id:
         return None
+    output_path = f"/tmp/{file_id}.pdf"
+    try:
+        gdown.download(id=file_id, output=output_path, quiet=False)
+        return output_path
+    except Exception as e:
+        print(f"Download failed: {e}")
         return None
+# Extract text from PDF
 def extract_text_from_pdf(pdf_file_path):
     pdf_reader = PdfReader(pdf_file_path)
     text = ""
             text += page_text
     return text
+# Split text into chunks
 def chunk_text(text, chunk_size=500, chunk_overlap=50):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
     )
     return text_splitter.split_text(text)
+# Create and update FAISS vector DB
 def create_embeddings_and_store(chunks, vector_db=None):
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     if vector_db is None:
         vector_db.add_texts(chunks)
     return vector_db
+# Query the database and get response from Groq LLM
 def query_vector_db(query, vector_db):
     docs = vector_db.similarity_search(query, k=3)
     context = "\n".join([doc.page_content for doc in docs])
     )
     return chat_completion.choices[0].message.content
+# Streamlit UI
+st.title("📄 RAG QA on Google Drive PDFs (Auto-Fetch)")
+# Public Google Drive PDF links
 doc_links = [
     "https://drive.google.com/file/d/0B9Ivs2CdbN04bmJhZGl3Z0VhUHc/view?usp=sharing&resourcekey=0-VGasMdtr3imjqp-Go6TrhA",
     "https://drive.google.com/file/d/0B9Ivs2CdbN04V3VhNUFrVk40M2M/view?usp=sharing&resourcekey=0-VIv15q5jcFFA6t6F45g13Q",
 vector_db = None
+# Auto-fetch and process each PDF
 for idx, link in enumerate(doc_links):
+    st.write(f"📥 Fetching and processing PDF {idx + 1}...")
     pdf_path = download_pdf_from_url(link)
     if pdf_path:
+        try:
+            text = extract_text_from_pdf(pdf_path)
+            chunks = chunk_text(text)
+            vector_db = create_embeddings_and_store(chunks, vector_db=vector_db)
+            st.success(f"✅ Successfully processed document {idx + 1}")
+        except Exception as e:
+            st.error(f"❌ Error processing document {idx + 1}: {e}")
     else:
+        st.error(f"❌ Failed to download document {idx + 1}")
+# User input for query
+user_query = st.text_input("🔍 Enter your query:")
 if user_query and vector_db:
     response = query_vector_db(user_query, vector_db)
     st.subheader("💬 Answer:")
     st.write(response)
 elif user_query:
+    st.warning("⚠️ No documents available to query.")