telzho / src /libs /google_drive.py
LONGYKING
update
ed3fa91
import json
from google.oauth2 import service_account
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
def get_pdf_urls_with_root_url(root_url, timeout=30):
    """
    Return the URLs of the PDF files linked from a remote directory listing.

    The page at ``root_url`` is downloaded and parsed as HTML. Every ``<a>``
    element whose ``href`` ends in ``.pdf`` (case-insensitive, ignoring any
    query string or fragment) is resolved against ``root_url`` and collected.

    Parameters:
    root_url (str): The root URL to the directory listing.
    timeout (float): Timeout in seconds passed to ``requests.get`` so a
        stalled server cannot block the caller forever.

    Returns:
    list: A list of URLs to the PDF files in the directory. Empty when the
    listing does not answer with status 200 (a message is printed instead).
    """
    # Imported locally so this function stays self-contained and does not
    # rely on the file-level import section.
    from urllib.parse import urljoin

    # Make a request to get the content of the directory
    response = requests.get(root_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to access {root_url}, status code: {response.status_code}")
        return []

    # A trailing slash makes urljoin treat root_url as a directory, so a
    # relative href is appended to it instead of replacing its last segment.
    base_url = root_url.rstrip('/') + '/'

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_urls = []
    # Iterate over all links in the directory listing
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        # Drop fragment and query before testing the extension so links such
        # as "report.pdf?download=1" are still recognised as PDFs.
        href_path = href.split('#', 1)[0].split('?', 1)[0]
        if not href_path.lower().endswith('.pdf'):
            continue
        try:
            # urljoin handles relative, root-relative ("/x.pdf") and absolute
            # ("https://host/x.pdf") hrefs correctly; plain string
            # concatenation mangled the latter two.
            file_url = urljoin(base_url, href)
        except ValueError:
            # A single malformed href should not abort the whole listing.
            continue
        pdf_urls.append(file_url)
    return pdf_urls
def get_drive_file_urls(service_account_file, folder_id):
    """
    Return the ``webViewLink`` of every file listed under a Drive folder.

    Credentials are loaded from a service-account file with the read-only
    Drive scope, and the Drive v3 ``files().list`` endpoint is queried for
    items whose parents include ``folder_id``. All result pages are fetched.

    Parameters:
    service_account_file (str): Path to the service-account credentials file.
    folder_id (str): ID of the Drive folder whose children are listed.

    Returns:
    list: The ``webViewLink`` value of each file returned by the API.
    """
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file,
        scopes=['https://www.googleapis.com/auth/drive.readonly'],
    )
    service = build('drive', 'v3', credentials=credentials)
    query = f"'{folder_id}' in parents"

    file_urls = []
    page_token = None
    # files.list returns results one page at a time; keep following
    # nextPageToken so larger folders are not silently truncated.
    while True:
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, webViewLink)",
            pageToken=page_token,
        ).execute()
        file_urls.extend(item['webViewLink'] for item in results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return file_urls