TANVEERMAKHDOOM committed on
Commit
96abe9d
·
verified ·
1 Parent(s): ffa725f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -19
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import os
2
  import gdown
3
  import streamlit as st
4
- import requests
5
  from PyPDF2 import PdfReader
6
- from tempfile import NamedTemporaryFile
7
-
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import FAISS
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,22 +10,11 @@ from groq import Groq
13
  # Initialize Groq client
14
  client = Groq(api_key=os.environ['GROQ_API_KEY'])
15
 
16
- # Function to extract file ID from Google Drive URL
17
- def extract_drive_file_id(url):
18
- if "drive.google.com" in url:
19
- parts = url.split("/file/d/")
20
- if len(parts) > 1:
21
- return parts[1].split("/")[0]
22
- return None
23
-
24
- # Download and save PDF from Google Drive using gdown
25
- def download_pdf_from_url(url):
26
- file_id = extract_drive_file_id(url)
27
- if not file_id:
28
- return None
29
- output_path = f"/tmp/{file_id}.pdf"
30
  try:
31
- gdown.download(id=file_id, output=output_path, quiet=False)
32
  return output_path
33
  except Exception as e:
34
  print(f"Download failed: {e}")
@@ -87,7 +73,7 @@ vector_db = None
87
  # Auto-fetch and process each PDF
88
  for idx, link in enumerate(doc_links):
89
  st.write(f"📥 Fetching and processing PDF {idx + 1}...")
90
- pdf_path = download_pdf_from_url(link)
91
  if pdf_path:
92
  try:
93
  text = extract_text_from_pdf(pdf_path)
@@ -107,3 +93,4 @@ if user_query and vector_db:
107
  st.write(response)
108
  elif user_query:
109
  st.warning("⚠️ No documents available to query.")
 
 
1
  import os
2
  import gdown
3
  import streamlit as st
 
4
  from PyPDF2 import PdfReader
 
 
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
10
  # Initialize Groq client
11
  client = Groq(api_key=os.environ['GROQ_API_KEY'])
12
 
13
+ # Download and save PDF using gdown (fuzzy handles complex links)
14
+ def download_pdf_from_url(url, idx):
15
+ output_path = f"/tmp/doc_{idx}.pdf"
 
 
 
 
 
 
 
 
 
 
 
16
  try:
17
+ gdown.download(url=url, output=output_path, quiet=False, fuzzy=True)
18
  return output_path
19
  except Exception as e:
20
  print(f"Download failed: {e}")
 
73
  # Auto-fetch and process each PDF
74
  for idx, link in enumerate(doc_links):
75
  st.write(f"📥 Fetching and processing PDF {idx + 1}...")
76
+ pdf_path = download_pdf_from_url(link, idx)
77
  if pdf_path:
78
  try:
79
  text = extract_text_from_pdf(pdf_path)
 
93
  st.write(response)
94
  elif user_query:
95
  st.warning("⚠️ No documents available to query.")
96
+