Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import gdown
|
| 3 |
import streamlit as st
|
| 4 |
-
import requests
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
-
from tempfile import NamedTemporaryFile
|
| 7 |
-
|
| 8 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 9 |
from langchain_community.vectorstores import FAISS
|
| 10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -13,22 +10,11 @@ from groq import Groq
|
|
| 13 |
# Initialize Groq client
|
| 14 |
client = Groq(api_key=os.environ['GROQ_API_KEY'])
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
def
|
| 18 |
-
|
| 19 |
-
parts = url.split("/file/d/")
|
| 20 |
-
if len(parts) > 1:
|
| 21 |
-
return parts[1].split("/")[0]
|
| 22 |
-
return None
|
| 23 |
-
|
| 24 |
-
# Download and save PDF from Google Drive using gdown
|
| 25 |
-
def download_pdf_from_url(url):
|
| 26 |
-
file_id = extract_drive_file_id(url)
|
| 27 |
-
if not file_id:
|
| 28 |
-
return None
|
| 29 |
-
output_path = f"/tmp/{file_id}.pdf"
|
| 30 |
try:
|
| 31 |
-
gdown.download(
|
| 32 |
return output_path
|
| 33 |
except Exception as e:
|
| 34 |
print(f"Download failed: {e}")
|
|
@@ -87,7 +73,7 @@ vector_db = None
|
|
| 87 |
# Auto-fetch and process each PDF
|
| 88 |
for idx, link in enumerate(doc_links):
|
| 89 |
st.write(f"📥 Fetching and processing PDF {idx + 1}...")
|
| 90 |
-
pdf_path = download_pdf_from_url(link)
|
| 91 |
if pdf_path:
|
| 92 |
try:
|
| 93 |
text = extract_text_from_pdf(pdf_path)
|
|
@@ -107,3 +93,4 @@ if user_query and vector_db:
|
|
| 107 |
st.write(response)
|
| 108 |
elif user_query:
|
| 109 |
st.warning("⚠️ No documents available to query.")
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import gdown
|
| 3 |
import streamlit as st
|
|
|
|
| 4 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 10 |
# Initialize Groq client
|
| 11 |
client = Groq(api_key=os.environ['GROQ_API_KEY'])
|
| 12 |
|
| 13 |
+
# Download and save PDF using gdown (fuzzy handles complex links)
|
| 14 |
+
def download_pdf_from_url(url, idx):
|
| 15 |
+
output_path = f"/tmp/doc_{idx}.pdf"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
try:
|
| 17 |
+
gdown.download(url=url, output=output_path, quiet=False, fuzzy=True)
|
| 18 |
return output_path
|
| 19 |
except Exception as e:
|
| 20 |
print(f"Download failed: {e}")
|
|
|
|
| 73 |
# Auto-fetch and process each PDF
|
| 74 |
for idx, link in enumerate(doc_links):
|
| 75 |
st.write(f"📥 Fetching and processing PDF {idx + 1}...")
|
| 76 |
+
pdf_path = download_pdf_from_url(link, idx)
|
| 77 |
if pdf_path:
|
| 78 |
try:
|
| 79 |
text = extract_text_from_pdf(pdf_path)
|
|
|
|
| 93 |
st.write(response)
|
| 94 |
elif user_query:
|
| 95 |
st.warning("⚠️ No documents available to query.")
|
| 96 |
+
|