File size: 5,892 Bytes
49adc11
 
 
 
 
 
9376d2c
 
5c5f6f8
 
 
 
9376d2c
 
 
49adc11
 
 
 
 
 
5c5f6f8
49adc11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c5f6f8
49adc11
 
 
 
 
 
 
 
 
 
 
 
5c5f6f8
49adc11
 
 
 
 
 
 
 
5c5f6f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49adc11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c5f6f8
49adc11
 
 
 
 
 
 
 
5c5f6f8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from typing import List, Dict

class GoogleDriveService:
    """Read-only helper around the Google Drive v3 and Google Docs v1 APIs.

    The service authenticates with a service account and exposes helpers to
    list the Google Docs inside a folder, fetch a document's plain text and
    fetch basic file metadata. API failures are re-raised as ``Exception``
    with a human-readable message; the underlying error is attached as
    ``__cause__``.
    """

    def __init__(self, credentials_dict: dict):
        """
        Initialize with credentials dictionary (works with both file and env variable)

        Args:
            credentials_dict: Google service account credentials as dictionary
        """
        self.credentials = service_account.Credentials.from_service_account_info(
            credentials_dict,
            scopes=[
                'https://www.googleapis.com/auth/drive.readonly',
                'https://www.googleapis.com/auth/documents.readonly'
            ]
        )
        self.drive_service = build('drive', 'v3', credentials=self.credentials)
        self.docs_service = build('docs', 'v1', credentials=self.credentials)

    @staticmethod
    def _escape_query_value(value: str) -> str:
        """Escape a value for use inside a single-quoted Drive query literal.

        Backslashes are escaped first so that the backslash introduced for a
        quote is not itself escaped again.
        """
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    def list_documents_in_folder(self, folder_id: str) -> List[Dict[str, str]]:
        """List all Google Docs in a folder.

        Every result page is fetched (following ``nextPageToken``), so folders
        holding more documents than a single page are returned in full.

        Args:
            folder_id: ID of the Drive folder to search.

        Returns:
            One dict per non-trashed Google Doc, most recently modified first,
            with keys ``id``, ``name`` and ``modified`` (``'Unknown'`` when the
            API response carries no ``modifiedTime``).

        Raises:
            Exception: If the folder is missing (404), not accessible (403),
                or any other error occurs while listing.
        """
        try:
            safe_folder_id = self._escape_query_value(folder_id)
            query = (
                f"'{safe_folder_id}' in parents "
                "and mimeType='application/vnd.google-apps.document' "
                "and trashed=false"
            )

            files = []
            page_token = None
            while True:
                # A None pageToken requests the first page.
                response = self.drive_service.files().list(
                    q=query,
                    fields="nextPageToken, files(id, name, modifiedTime)",
                    orderBy="modifiedTime desc",
                    pageToken=page_token,
                ).execute()

                files.extend(response.get('files', []))

                page_token = response.get('nextPageToken')
                if not page_token:
                    break

            return [
                {
                    'id': item['id'],
                    'name': item['name'],
                    'modified': item.get('modifiedTime', 'Unknown')
                }
                for item in files
            ]

        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(
                    "Folder not found. Please check:\n"
                    "1. The folder ID is correct\n"
                    "2. The folder exists in Google Drive"
                ) from e
            elif e.resp.status == 403:
                raise Exception(
                    "Permission denied. Please ensure:\n"
                    "1. The folder is shared with your service account\n"
                    "2. Service account email has at least 'Viewer' access\n"
                    "3. Check your GOOGLE_DRIVE_FOLDER_ID in environment variables"
                ) from e
            else:
                raise Exception(f"Error accessing Google Drive: {e!s}") from e
        except Exception as e:
            raise Exception(f"Error listing documents in folder: {e!s}") from e

    def get_document_content(self, document_id: str) -> str:
        """Get the plain-text content of a specific document.

        Args:
            document_id: ID of the Google Doc to read.

        Returns:
            The document text as produced by ``_extract_text``.

        Raises:
            Exception: If the document is missing (404), not accessible (403),
                or any other error occurs while reading.
        """
        try:
            document = self.docs_service.documents().get(documentId=document_id).execute()
            return self._extract_text(document)
        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(
                    f"Document not found. Please check the document ID: {document_id}"
                ) from e
            elif e.resp.status == 403:
                raise Exception(
                    "Permission denied. Please ensure:\n"
                    "1. The document is shared with your service account\n"
                    "2. The service account has at least 'Viewer' access\n"
                    "3. The document is not private/restricted"
                ) from e
            else:
                raise Exception(f"Error reading document: {e!s}") from e
        except Exception as e:
            raise Exception(f"Error reading document: {e!s}") from e

    def _extract_text(self, document: dict) -> str:
        """Extract plain text from a Docs API document structure.

        Text runs from paragraphs and from table cells (including tables
        nested inside cells) are concatenated in document order. Other
        element kinds are ignored.

        Args:
            document: Document resource dict; text is read from
                ``document['body']['content']``.

        Returns:
            The concatenated text with leading/trailing whitespace stripped,
            or an empty string when the document has no body content.
        """
        text_parts: List[str] = []
        self._collect_text(document.get('body', {}).get('content', []), text_parts)
        return ''.join(text_parts).strip()

    def _collect_text(self, elements: list, text_parts: list) -> None:
        """Append the text of ``elements`` to ``text_parts``, recursing into tables."""
        for element in elements:
            if 'paragraph' in element:
                for text_element in element['paragraph'].get('elements', []):
                    text_run = text_element.get('textRun')
                    if text_run:
                        # A text run without 'content' contributes nothing.
                        text_parts.append(text_run.get('content', ''))

            elif 'table' in element:
                for row in element['table'].get('tableRows', []):
                    for cell in row.get('tableCells', []):
                        # Cell content is itself a list of structural
                        # elements, so nested tables are handled here too.
                        self._collect_text(cell.get('content', []), text_parts)

    def get_document_metadata(self, document_id: str) -> Dict[str, str]:
        """Get metadata for a document.

        Args:
            document_id: ID of the Drive file to look up.

        Returns:
            Dict with keys ``id``, ``name``, ``modified``, ``created`` (both
            ``'Unknown'`` when absent from the response) and ``url`` (empty
            string when absent).

        Raises:
            Exception: If the document is missing (404), not accessible (403),
                or any other error occurs while fetching metadata.
        """
        try:
            file_info = self.drive_service.files().get(
                fileId=document_id,
                fields="id, name, modifiedTime, createdTime, webViewLink"
            ).execute()

            return {
                'id': file_info['id'],
                'name': file_info['name'],
                'modified': file_info.get('modifiedTime', 'Unknown'),
                'created': file_info.get('createdTime', 'Unknown'),
                'url': file_info.get('webViewLink', '')
            }

        except HttpError as e:
            if e.resp.status == 404:
                raise Exception(f"Document not found: {document_id}") from e
            elif e.resp.status == 403:
                raise Exception(f"Permission denied for document: {document_id}") from e
            else:
                raise Exception(f"Error getting document metadata: {e!s}") from e
        except Exception as e:
            raise Exception(f"Error getting document metadata: {e!s}") from e