import json
from urllib.parse import urljoin

from google.oauth2 import service_account
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup


def get_pdf_urls_with_root_url(root_url, timeout=30):
    """
    Returns an array of URLs of PDF files in the specified remote directory.

    Every ``<a href>`` in the page whose href ends with ``.pdf``
    (case-insensitive) is resolved against ``root_url`` using standard URL
    resolution, so relative, root-relative (``/x/a.pdf``), parent-relative
    (``../a.pdf``) and absolute (``https://host/a.pdf``) links all produce a
    valid URL.

    Parameters:
    root_url (str): The root URL to the directory listing.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    list: A list of URLs to the PDF files in the directory. Empty when the
        server answers with a status code other than 200.

    Raises:
    requests.exceptions.RequestException: If the request cannot be completed
        (connection error, timeout, ...).
    """
    # Without a timeout, requests may block forever on an unresponsive host.
    response = requests.get(root_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to access {root_url}, status code: {response.status_code}")
        return []

    # A trailing slash makes urljoin treat root_url as a directory, so relative
    # hrefs are appended to it rather than replacing its last path segment.
    base_url = root_url.rstrip('/') + '/'

    soup = BeautifulSoup(response.content, 'html.parser')
    pdf_urls = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.lower().endswith('.pdf'):
            pdf_urls.append(urljoin(base_url, href))
    return pdf_urls


def get_drive_file_urls(service_account_file, folder_id):
    """
    Returns the web view links of all files directly inside a Drive folder.

    Authenticates with a service account using the read-only Drive scope and
    follows ``nextPageToken`` until every page of results has been fetched, so
    folders with more entries than a single response page are not truncated.

    Parameters:
    service_account_file (str): Path to the service account JSON key file.
    folder_id (str): ID of the Drive folder whose children are listed.

    Returns:
    list: The ``webViewLink`` of each file returned by the Drive API.

    Raises:
    KeyError: If a returned file entry has no ``webViewLink`` field.
    """
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file,
        scopes=['https://www.googleapis.com/auth/drive.readonly'],
    )
    service = build('drive', 'v3', credentials=credentials)

    query = f"'{folder_id}' in parents"
    file_urls = []
    page_token = None
    while True:
        # nextPageToken must be requested explicitly because `fields` limits
        # the response to the listed keys.
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, webViewLink)",
            pageToken=page_token,
        ).execute()
        file_urls.extend(item['webViewLink'] for item in results.get('files', []))

        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return file_urls