TANVEERMAKHDOOM committed on
Commit
f9a9ebf
·
verified ·
1 Parent(s): 96abe9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -59
app.py CHANGED
@@ -10,87 +10,74 @@ from groq import Groq
10
  # Initialize Groq client
11
  client = Groq(api_key=os.environ['GROQ_API_KEY'])
12
 
13
# Download and save a PDF from a Google Drive link using gdown
# (fuzzy=True lets gdown resolve share-style URLs, not just direct IDs).
def download_pdf_from_url(url, idx):
    """Fetch *url* to /tmp/doc_{idx}.pdf; return the path, or None on failure."""
    target = f"/tmp/doc_{idx}.pdf"
    try:
        gdown.download(url=url, output=target, quiet=False, fuzzy=True)
    except Exception as exc:
        print(f"Download failed: {exc}")
        return None
    return target
22
 
23
# Extract text from PDF
def extract_text_from_pdf(pdf_file_path):
    """Concatenate the extractable text of every page in the PDF."""
    pages = PdfReader(pdf_file_path).pages
    # extract_text() may return None/empty for image-only pages; skip those.
    return "".join(page.extract_text() or "" for page in pages)
32
 
33
# Split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks sized for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
39
 
40
# Create a FAISS vector store, or extend an existing one.
def create_embeddings_and_store(chunks, vector_db=None):
    """Embed *chunks*; build a new FAISS index or append to *vector_db*."""
    # NOTE: the embedder is constructed on every call, even when an existing
    # store is extended (matching the store's original embedding function).
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    if vector_db is not None:
        vector_db.add_texts(chunks)
        return vector_db
    return FAISS.from_texts(chunks, embedding=embedder)
48
 
49
# Retrieve context from the vector DB and answer via the Groq LLM.
def query_vector_db(query, vector_db):
    """Answer *query* using the top-3 most similar chunks as LLM context."""
    hits = vector_db.similarity_search(query, k=3)
    context = "\n".join(hit.page_content for hit in hits)
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
        model="llama3-8b-8192",
    )
    return completion.choices[0].message.content
61
 
62
# --- Streamlit UI ---
st.title("📄 RAG QA on Google Drive PDFs (Auto-Fetch)")

# Public Google Drive PDF links
doc_links = [
    "https://drive.google.com/file/d/0B9Ivs2CdbN04bmJhZGl3Z0VhUHc/view?usp=sharing&resourcekey=0-VGasMdtr3imjqp-Go6TrhA",
    "https://drive.google.com/file/d/0B9Ivs2CdbN04V3VhNUFrVk40M2M/view?usp=sharing&resourcekey=0-VIv15q5jcFFA6t6F45g13Q",
]

vector_db = None

# Auto-fetch each PDF and fold it into the shared vector store.
for idx, link in enumerate(doc_links):
    st.write(f"📥 Fetching and processing PDF {idx + 1}...")
    pdf_path = download_pdf_from_url(link, idx)
    if not pdf_path:
        st.error(f"❌ Failed to download document {idx + 1}")
        continue
    try:
        chunks = chunk_text(extract_text_from_pdf(pdf_path))
        vector_db = create_embeddings_and_store(chunks, vector_db=vector_db)
        st.success(f"✅ Successfully processed document {idx + 1}")
    except Exception as e:
        st.error(f"❌ Error processing document {idx + 1}: {e}")

# Answer the user's question once at least one document is indexed.
user_query = st.text_input("🔍 Enter your query:")
if user_query and vector_db:
    response = query_vector_db(user_query, vector_db)
    st.subheader("💬 Answer:")
    st.write(response)
elif user_query:
    st.warning("⚠️ No documents available to query.")
 
 
 
96
 
 
 
 
 
 
 
 
 
10
  # Initialize Groq client
11
  client = Groq(api_key=os.environ['GROQ_API_KEY'])
12
 
13
# Download and save PDF using gdown
def download_pdf(url):
    """Download *url* (a Google Drive share link) to a temp path.

    Returns the local file path on success, or None on failure.
    gdown can fail either by raising OR by returning None without raising
    (e.g. permission/quota errors), so both cases must be handled —
    otherwise this would hand back a path to a file that was never written.
    """
    output_path = "/tmp/drive_doc.pdf"
    try:
        result = gdown.download(url=url, output=output_path, quiet=True, fuzzy=True)
    except Exception as e:
        st.error(f"Download failed: {e}")
        return None
    # gdown signals some failures by returning None instead of raising.
    if result is None or not os.path.exists(output_path):
        st.error("Download failed: gdown could not fetch the file.")
        return None
    return output_path
22
 
23
# Extract text from PDF
def extract_text(pdf_path):
    """Return the concatenated text of every extractable page in *pdf_path*."""
    parts = []
    for page in PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        # Image-only pages can yield None/empty; skip them.
        if extracted:
            parts.append(extracted)
    return "".join(parts)
32
 
33
# Split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks sized for embedding.

    chunk_size / chunk_overlap default to the previous hard-coded values
    (500 / 50), so existing callers behave identically while new callers
    can tune the chunking.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)
 
 
37
 
38
# Create embeddings and store in FAISS
def build_vector_db(chunks):
    """Embed *chunks* with MiniLM and index them in an in-memory FAISS store."""
    encoder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(chunks, embedding=encoder)
 
 
 
 
42
 
43
# Query the vector DB and get response from Groq
def query_groq(query, vector_db):
    """Answer *query* with the Groq LLM, grounded on the top-3 similar chunks."""
    matches = vector_db.similarity_search(query, k=3)
    context = "\n".join(m.page_content for m in matches)
    messages = [
        {"role": "system", "content": f"Use the following context:\n{context}"},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return completion.choices[0].message.content
55
 
56
# --- Streamlit App ---

st.title("📄 RAG QA from Google Drive PDF")

link = "https://drive.google.com/file/d/1SGXNLO841VyHnGiX81oo6x2RHIrTmP5S/view?usp=sharing"


# Streamlit re-runs this whole script on every widget interaction, so without
# caching the PDF would be re-downloaded and re-embedded on every query.
# st.cache_resource keeps one vector DB per URL for the session lifetime.
@st.cache_resource(show_spinner="📥 Downloading and processing document...")
def _load_vector_db(url):
    """Download, extract, chunk, and index the PDF; None if download fails."""
    pdf_path = download_pdf(url)
    if not pdf_path:
        return None
    text = extract_text(pdf_path)
    chunks = chunk_text(text)
    return build_vector_db(chunks)


try:
    vector_db = _load_vector_db(link)
    if vector_db:
        st.success("✅ Document processed successfully.")
except Exception as e:
    st.error(f"❌ Error processing PDF: {e}")
    vector_db = None

query = st.text_input("🔍 Enter your query:")
if query and vector_db:
    answer = query_groq(query, vector_db)
    st.subheader("💬 Answer:")
    st.write(answer)
elif query:
    st.warning("⚠️ Document not ready yet.")