TANVEERMAKHDOOM committed on
Commit
4bf11bb
·
verified ·
1 Parent(s): fd06bec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -39
app.py CHANGED
@@ -1,23 +1,19 @@
1
  import os
 
 
2
  import requests
3
- from groq import Groq
4
- from langchain_community.embeddings import HuggingFaceEmbeddings
5
- from langchain_community.vectorstores import FAISS
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from PyPDF2 import PdfReader
8
- import streamlit as st
9
  from tempfile import NamedTemporaryFile
10
 
11
- # Set Groq API key (use Secrets in Hugging Face)
12
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
13
-
14
- # Check for API key
15
- if not GROQ_API_KEY:
16
- st.error("Please set the GROQ_API_KEY in the Hugging Face Space secrets.")
17
- st.stop()
18
 
19
- client = Groq(api_key=GROQ_API_KEY)
 
20
 
 
21
  def extract_drive_file_id(url):
22
  if "drive.google.com" in url:
23
  parts = url.split("/file/d/")
@@ -25,25 +21,20 @@ def extract_drive_file_id(url):
25
  return parts[1].split("/")[0]
26
  return None
27
 
28
- def get_direct_download_link(view_url):
29
- file_id = extract_drive_file_id(view_url)
30
- if file_id:
31
- return f"https://drive.google.com/uc?export=download&id={file_id}"
32
- return None
33
-
34
  def download_pdf_from_url(url):
35
- direct_url = get_direct_download_link(url)
36
- if not direct_url:
37
  return None
38
- response = requests.get(direct_url, allow_redirects=True)
39
- if response.status_code == 200:
40
- temp_file = NamedTemporaryFile(delete=False, suffix=".pdf")
41
- temp_file.write(response.content)
42
- temp_file.close()
43
- return temp_file.name
44
- else:
45
  return None
46
 
 
47
  def extract_text_from_pdf(pdf_file_path):
48
  pdf_reader = PdfReader(pdf_file_path)
49
  text = ""
@@ -53,12 +44,14 @@ def extract_text_from_pdf(pdf_file_path):
53
  text += page_text
54
  return text
55
 
 
56
  def chunk_text(text, chunk_size=500, chunk_overlap=50):
57
  text_splitter = RecursiveCharacterTextSplitter(
58
  chunk_size=chunk_size, chunk_overlap=chunk_overlap
59
  )
60
  return text_splitter.split_text(text)
61
 
 
62
  def create_embeddings_and_store(chunks, vector_db=None):
63
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
64
  if vector_db is None:
@@ -67,6 +60,7 @@ def create_embeddings_and_store(chunks, vector_db=None):
67
  vector_db.add_texts(chunks)
68
  return vector_db
69
 
 
70
  def query_vector_db(query, vector_db):
71
  docs = vector_db.similarity_search(query, k=3)
72
  context = "\n".join([doc.page_content for doc in docs])
@@ -79,10 +73,10 @@ def query_vector_db(query, vector_db):
79
  )
80
  return chat_completion.choices[0].message.content
81
 
82
- # --- Streamlit UI ---
83
- st.set_page_config(page_title="RAG on Google Drive PDFs")
84
- st.title("📄 RAG-Based QA on Auto-Fetched Google Drive PDFs")
85
 
 
86
  doc_links = [
87
  "https://drive.google.com/file/d/0B9Ivs2CdbN04bmJhZGl3Z0VhUHc/view?usp=sharing&resourcekey=0-VGasMdtr3imjqp-Go6TrhA",
88
  "https://drive.google.com/file/d/0B9Ivs2CdbN04V3VhNUFrVk40M2M/view?usp=sharing&resourcekey=0-VIv15q5jcFFA6t6F45g13Q",
@@ -90,21 +84,26 @@ doc_links = [
90
 
91
  vector_db = None
92
 
 
93
  for idx, link in enumerate(doc_links):
94
- st.write(f"🔄 Processing PDF from Link {idx + 1}...")
95
  pdf_path = download_pdf_from_url(link)
96
  if pdf_path:
97
- text = extract_text_from_pdf(pdf_path)
98
- chunks = chunk_text(text)
99
- vector_db = create_embeddings_and_store(chunks, vector_db=vector_db)
100
- st.success(f"✅ Document {idx + 1} processed.")
 
 
 
101
  else:
102
- st.error(f"❌ Could not fetch document {idx + 1}.")
103
 
104
- user_query = st.text_input("🔍 Ask a question about the documents:")
 
105
  if user_query and vector_db:
106
  response = query_vector_db(user_query, vector_db)
107
  st.subheader("💬 Answer:")
108
  st.write(response)
109
  elif user_query:
110
- st.warning("⚠️ No documents available to query yet.")
 
1
  import os
2
+ import gdown
3
+ import streamlit as st
4
  import requests
 
 
 
 
5
  from PyPDF2 import PdfReader
 
6
  from tempfile import NamedTemporaryFile
7
 
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain_community.vectorstores import FAISS
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from groq import Groq
 
 
 
12
 
# Initialize the Groq client.
# The key is read with os.getenv so that a missing secret produces a readable
# message in the app instead of an unhandled KeyError while the script loads.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Please set the GROQ_API_KEY in the Hugging Face Space secrets.")
    # Halt this script run here; nothing below can work without a client.
    st.stop()

client = Groq(api_key=GROQ_API_KEY)
15
 
16
+ # Function to extract file ID from Google Drive URL
17
  def extract_drive_file_id(url):
18
  if "drive.google.com" in url:
19
  parts = url.split("/file/d/")
 
21
  return parts[1].split("/")[0]
22
  return None
23
 
24
+ # Download and save PDF from Google Drive using gdown
 
 
 
 
 
def download_pdf_from_url(url):
    """Download the Google Drive PDF referenced by *url* and return its path.

    Args:
        url: A Google Drive share link. The file ID is taken from it by
            ``extract_drive_file_id``.

    Returns:
        The local path ``/tmp/<file_id>.pdf`` when the download produced a
        non-empty file, otherwise ``None`` (no file ID could be extracted,
        the download raised, or nothing was saved).
    """
    file_id = extract_drive_file_id(url)
    if not file_id:
        return None

    output_path = f"/tmp/{file_id}.pdf"
    try:
        downloaded = gdown.download(id=file_id, output=output_path, quiet=False)
    except Exception as e:
        # Best-effort boundary: one failed document must not stop the app,
        # so the error is reported and the caller sees None.
        print(f"Download failed: {e}")
        return None

    # NOTE(review): gdown is expected to return the output path on success and
    # may return None instead of raising on some failures - confirm against
    # the installed gdown version. Checking the file itself also avoids
    # handing back a stale or empty file left at the same path.
    if (
        not downloaded
        or not os.path.isfile(output_path)
        or os.path.getsize(output_path) == 0
    ):
        print(f"Download failed: no file was saved for {file_id}")
        return None

    return output_path
36
 
37
+ # Extract text from PDF
38
  def extract_text_from_pdf(pdf_file_path):
39
  pdf_reader = PdfReader(pdf_file_path)
40
  text = ""
 
44
  text += page_text
45
  return text
46
 
47
+ # Split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into pieces with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_text`` call on *text*.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
53
 
54
+ # Create and update FAISS vector DB
55
  def create_embeddings_and_store(chunks, vector_db=None):
56
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
57
  if vector_db is None:
 
60
  vector_db.add_texts(chunks)
61
  return vector_db
62
 
63
+ # Query the database and get response from Groq LLM
64
  def query_vector_db(query, vector_db):
65
  docs = vector_db.similarity_search(query, k=3)
66
  context = "\n".join([doc.page_content for doc in docs])
 
73
  )
74
  return chat_completion.choices[0].message.content
75
 
76
+ # Streamlit UI
77
+ st.title("📄 RAG QA on Google Drive PDFs (Auto-Fetch)")
 
78
 
79
+ # Public Google Drive PDF links
80
  doc_links = [
81
  "https://drive.google.com/file/d/0B9Ivs2CdbN04bmJhZGl3Z0VhUHc/view?usp=sharing&resourcekey=0-VGasMdtr3imjqp-Go6TrhA",
82
  "https://drive.google.com/file/d/0B9Ivs2CdbN04V3VhNUFrVk40M2M/view?usp=sharing&resourcekey=0-VIv15q5jcFFA6t6F45g13Q",
 
84
 
85
  vector_db = None
86
 
87
+ # Auto-fetch and process each PDF
# Fetch every linked PDF in turn and fold its chunks into the shared vector DB.
for idx, link in enumerate(doc_links):
    st.write(f"📥 Fetching and processing PDF {idx + 1}...")
    pdf_path = download_pdf_from_url(link)

    # Guard clause: without a downloaded file there is nothing to index.
    if not pdf_path:
        st.error(f"❌ Failed to download document {idx + 1}")
        continue

    try:
        text = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text)
        # Pass the DB built so far so later documents are added to it.
        vector_db = create_embeddings_and_store(chunks, vector_db=vector_db)
        st.success(f"✅ Successfully processed document {idx + 1}")
    except Exception as e:
        # A failure in one document is reported and the loop moves on.
        st.error(f"❌ Error processing document {idx + 1}: {e}")
101
 
102
+ # User input for query
# Read the question and answer it only once at least one document is indexed.
user_query = st.text_input("🔍 Enter your query:")
if user_query:
    if vector_db:
        response = query_vector_db(user_query, vector_db)
        st.subheader("💬 Answer:")
        st.write(response)
    else:
        # A question was typed but no vector DB was built.
        st.warning("⚠️ No documents available to query.")