from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from typing import List, Dict
class GoogleDriveService:
    """Read-only helper around the Google Drive v3 and Google Docs v1 APIs.

    The service authenticates with a service account and requests only the
    ``drive.readonly`` and ``documents.readonly`` scopes, so it can list and
    read documents but never modify them.
    """

    def __init__(self, credentials_dict: dict):
        """
        Initialize with credentials dictionary (works with both file and env variable)

        Args:
            credentials_dict: Google service account credentials as dictionary
        """
        self.credentials = service_account.Credentials.from_service_account_info(
            credentials_dict,
            scopes=[
                'https://www.googleapis.com/auth/drive.readonly',
                'https://www.googleapis.com/auth/documents.readonly',
            ],
        )
        self.drive_service = build('drive', 'v3', credentials=self.credentials)
        self.docs_service = build('docs', 'v1', credentials=self.credentials)

    def list_documents_in_folder(self, folder_id: str) -> List[Dict[str, str]]:
        """List all Google Docs in a folder, most recently modified first.

        Every result page is fetched by following ``nextPageToken`` until the
        API stops returning one, so large folders are not truncated.

        Args:
            folder_id: ID of the Drive folder to search.

        Returns:
            One dict per document with the keys ``id``, ``name`` and
            ``modified`` (``'Unknown'`` when the API omits ``modifiedTime``).

        Raises:
            Exception: If the folder is missing (404), access is denied (403),
                or any other error occurs while listing. The original error is
                attached as ``__cause__``.
        """
        try:
            # Backslashes and single quotes must be escaped inside a Drive
            # query string literal; otherwise a stray quote breaks the query.
            escaped_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")
            query = (
                f"'{escaped_id}' in parents"
                " and mimeType='application/vnd.google-apps.document'"
                " and trashed=false"
            )

            files = []
            page_token = None
            while True:
                results = self.drive_service.files().list(
                    q=query,
                    # nextPageToken must be requested explicitly because a
                    # fields mask is in use.
                    fields="nextPageToken, files(id, name, modifiedTime)",
                    orderBy="modifiedTime desc",
                    pageToken=page_token,
                ).execute()
                files.extend(results.get('files', []))
                page_token = results.get('nextPageToken')
                if not page_token:
                    break

            return [
                {
                    'id': file['id'],
                    'name': file['name'],
                    'modified': file.get('modifiedTime', 'Unknown'),
                }
                for file in files
            ]
        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(
                    "Folder not found. Please check:\n"
                    "1. The folder ID is correct\n"
                    "2. The folder exists in Google Drive"
                ) from e
            elif e.resp.status == 403:
                raise Exception(
                    "Permission denied. Please ensure:\n"
                    "1. The folder is shared with your service account\n"
                    "2. Service account email has at least 'Viewer' access\n"
                    "3. Check your GOOGLE_DRIVE_FOLDER_ID in environment variables"
                ) from e
            else:
                raise Exception(f"Error accessing Google Drive: {e}") from e
        except Exception as e:
            raise Exception(f"Error listing documents in folder: {e}") from e

    def get_document_content(self, document_id: str) -> str:
        """Get the plain-text content of a specific document.

        Args:
            document_id: ID of the Google Doc to read.

        Returns:
            The document text as produced by ``_extract_text``.

        Raises:
            Exception: If the document is missing (404), access is denied
                (403), or any other error occurs while reading. The original
                error is attached as ``__cause__``.
        """
        try:
            document = self.docs_service.documents().get(documentId=document_id).execute()
            return self._extract_text(document)
        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(
                    f"Document not found. Please check the document ID: {document_id}"
                ) from e
            elif e.resp.status == 403:
                raise Exception(
                    "Permission denied. Please ensure:\n"
                    "1. The document is shared with your service account\n"
                    "2. The service account has at least 'Viewer' access\n"
                    "3. The document is not private/restricted"
                ) from e
            else:
                raise Exception(f"Error reading document: {e}") from e
        except Exception as e:
            raise Exception(f"Error reading document: {e}") from e

    def _extract_text(self, document: dict) -> str:
        """Extract plain text from a Docs API document structure.

        Text runs are concatenated in document order from paragraphs and from
        tables, including tables nested inside table cells. Other structural
        elements are ignored.

        Args:
            document: Document resource dict; text is read from
                ``document['body']['content']``.

        Returns:
            The concatenated text with leading/trailing whitespace stripped,
            or an empty string when the document has no body content.
        """
        text_parts = []
        self._collect_text(document.get('body', {}).get('content', []), text_parts)
        return ''.join(text_parts).strip()

    def _collect_text(self, elements: list, text_parts: list) -> None:
        """Append the text of *elements* (and nested tables) to *text_parts*."""
        for element in elements:
            if 'paragraph' in element:
                for text_element in element['paragraph'].get('elements', []):
                    if 'textRun' in text_element:
                        # A text run without 'content' contributes nothing
                        # instead of raising KeyError.
                        text_parts.append(text_element['textRun'].get('content', ''))
            elif 'table' in element:
                for row in element['table'].get('tableRows', []):
                    for cell in row.get('tableCells', []):
                        # Cell content is itself a list of structural
                        # elements, so recurse to pick up nested tables.
                        self._collect_text(cell.get('content', []), text_parts)

    def get_document_metadata(self, document_id: str) -> Dict[str, str]:
        """Get Drive metadata for a document.

        Args:
            document_id: ID of the file to look up.

        Returns:
            Dict with the keys ``id``, ``name``, ``modified``, ``created``
            (both ``'Unknown'`` when absent) and ``url`` (empty string when
            absent).

        Raises:
            Exception: If the document is missing (404), access is denied
                (403), or any other error occurs. The original error is
                attached as ``__cause__``.
        """
        try:
            file = self.drive_service.files().get(
                fileId=document_id,
                fields="id, name, modifiedTime, createdTime, webViewLink",
            ).execute()
            return {
                'id': file['id'],
                'name': file['name'],
                'modified': file.get('modifiedTime', 'Unknown'),
                'created': file.get('createdTime', 'Unknown'),
                'url': file.get('webViewLink', ''),
            }
        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(f"Document not found: {document_id}") from e
            elif e.resp.status == 403:
                raise Exception(f"Permission denied for document: {document_id}") from e
            else:
                raise Exception(f"Error getting document metadata: {e}") from e
        except Exception as e:
            raise Exception(f"Error getting document metadata: {e}") from e