|
|
import json |
|
|
from google.oauth2 import service_account |
|
|
from googleapiclient.discovery import build |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
def get_pdf_urls_with_root_url(root_url, timeout=30):
    """
    Returns an array of URLs of PDF files in the specified remote directory.

    Parameters:
    root_url (str): The root URL to the directory listing.
    timeout (float or tuple): Seconds to wait for the server; forwarded to
        requests.get. Defaults to 30.

    Returns:
    list: A list of absolute URLs to the PDF files linked from the listing.
        Empty when the listing responds with a status code other than 200.
    """
    # Local import keeps this function self-contained (stdlib only).
    from urllib.parse import urljoin

    # A timeout prevents the call from blocking forever on an unresponsive host.
    response = requests.get(root_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to access {root_url}, status code: {response.status_code}")
        return []

    # Treat root_url as a directory: with a trailing slash, urljoin resolves
    # plain relative hrefs ("file.pdf") underneath it instead of replacing
    # the last path segment.
    base_url = root_url.rstrip('/') + '/'
    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_urls = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.lower().endswith('.pdf'):
            # urljoin leaves absolute hrefs untouched and resolves
            # host-root-relative hrefs ("/files/x.pdf") against the host,
            # which plain string concatenation gets wrong.
            pdf_urls.append(urljoin(base_url, href))

    return pdf_urls
|
|
|
|
|
|
|
|
def get_drive_file_urls(service_account_file, folder_id):
    """
    Returns the web view links of the files directly inside a Google Drive folder.

    Parameters:
    service_account_file (str): Path to the service account JSON key file.
    folder_id (str): ID of the Drive folder whose children are listed.

    Returns:
    list: The 'webViewLink' value of every file returned for the folder,
        collected across all result pages.
    """
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file,
        scopes=['https://www.googleapis.com/auth/drive.readonly'],
    )
    service = build('drive', 'v3', credentials=credentials)

    query = f"'{folder_id}' in parents"

    file_urls = []
    page_token = None
    while True:
        # files().list returns results one page at a time; nextPageToken must
        # be requested in `fields` and fed back as pageToken, otherwise every
        # file beyond the first page is silently dropped.
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, webViewLink)",
            pageToken=page_token,
        ).execute()

        file_urls.extend(item['webViewLink'] for item in results.get('files', []))

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    return file_urls
|
|
|