Redfire-1234 committed on
Commit
b04d4ab
·
verified ·
1 Parent(s): 7cc17cd

Update app/services/google_docs.py

Browse files
Files changed (1) hide show
  1. app/services/google_docs.py +42 -33
app/services/google_docs.py CHANGED
@@ -1,7 +1,7 @@
1
  from google.oauth2 import service_account
2
  from googleapiclient.discovery import build
3
  from googleapiclient.errors import HttpError
4
- from typing import Optional
5
 
6
  class GoogleDocsReader:
7
  def __init__(self, credentials_dict: dict):
@@ -12,51 +12,60 @@ class GoogleDocsReader:
12
  credentials_dict,
13
  scopes=['https://www.googleapis.com/auth/documents.readonly']
14
  )
15
- self.docs_service = build('docs', 'v1', credentials=self.credentials)
16
 
17
  def read_document(self, document_id: str) -> str:
18
- """Read and extract text from Google Doc"""
 
 
19
  try:
 
20
  document = self.service.documents().get(documentId=document_id).execute()
21
- return self._extract_text(document)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  except HttpError as e:
23
- # Handle specific Google API errors
24
  if e.resp.status == 404:
25
- raise Exception(f"Document not found. Please check the document ID: {document_id}")
26
  elif e.resp.status == 403:
27
  raise Exception(
28
  f"Permission denied. Please ensure:\n"
29
  f"1. The document is shared with your service account\n"
30
- f"2. The service account has at least 'Viewer' access\n"
31
- f"3. The document is not private/restricted"
32
  )
33
  else:
34
  raise Exception(f"Error reading document: {str(e)}")
35
  except Exception as e:
36
  raise Exception(f"Error reading document: {str(e)}")
37
 
38
- def _extract_text(self, document: dict) -> str:
39
- """Extract plain text from document structure"""
40
- text_parts = []
41
-
42
- content = document.get('body', {}).get('content', [])
43
-
44
- for element in content:
45
- if 'paragraph' in element:
46
- paragraph = element['paragraph']
47
- for text_element in paragraph.get('elements', []):
48
- if 'textRun' in text_element:
49
- text_parts.append(text_element['textRun']['content'])
50
-
51
- elif 'table' in element:
52
- table = element['table']
53
- for row in table.get('tableRows', []):
54
- for cell in row.get('tableCells', []):
55
- for cell_content in cell.get('content', []):
56
- if 'paragraph' in cell_content:
57
- paragraph = cell_content['paragraph']
58
- for text_element in paragraph.get('elements', []):
59
- if 'textRun' in text_element:
60
- text_parts.append(text_element['textRun']['content'])
61
-
62
- return ''.join(text_parts).strip()
 
1
  from google.oauth2 import service_account
2
  from googleapiclient.discovery import build
3
  from googleapiclient.errors import HttpError
4
+ from typing import Dict
5
 
6
  class GoogleDocsReader:
7
  def __init__(self, credentials_dict: dict):
 
12
  credentials_dict,
13
  scopes=['https://www.googleapis.com/auth/documents.readonly']
14
  )
15
+ self.service = build('docs', 'v1', credentials=self.credentials)
16
 
17
  def read_document(self, document_id: str) -> str:
18
+ """
19
+ Read the content of a Google Doc and return as plain text
20
+ """
21
  try:
22
+ # Get the document
23
  document = self.service.documents().get(documentId=document_id).execute()
24
+
25
+ # Extract text content
26
+ content = document.get('body', {}).get('content', [])
27
+ text_parts = []
28
+
29
+ for element in content:
30
+ if 'paragraph' in element:
31
+ paragraph = element['paragraph']
32
+ for text_element in paragraph.get('elements', []):
33
+ if 'textRun' in text_element:
34
+ text_parts.append(text_element['textRun'].get('content', ''))
35
+ elif 'table' in element:
36
+ # Handle tables
37
+ table = element['table']
38
+ for row in table.get('tableRows', []):
39
+ for cell in row.get('tableCells', []):
40
+ for cell_element in cell.get('content', []):
41
+ if 'paragraph' in cell_element:
42
+ paragraph = cell_element['paragraph']
43
+ for text_element in paragraph.get('elements', []):
44
+ if 'textRun' in text_element:
45
+ text_parts.append(text_element['textRun'].get('content', ''))
46
+
47
+ return ''.join(text_parts)
48
+
49
  except HttpError as e:
 
50
  if e.resp.status == 404:
51
+ raise Exception(f"Document not found: {document_id}")
52
  elif e.resp.status == 403:
53
  raise Exception(
54
  f"Permission denied. Please ensure:\n"
55
  f"1. The document is shared with your service account\n"
56
+ f"2. Service account has at least 'Viewer' access"
 
57
  )
58
  else:
59
  raise Exception(f"Error reading document: {str(e)}")
60
  except Exception as e:
61
  raise Exception(f"Error reading document: {str(e)}")
62
 
63
+ def get_document_title(self, document_id: str) -> str:
64
+ """
65
+ Get the title of a Google Doc
66
+ """
67
+ try:
68
+ document = self.service.documents().get(documentId=document_id).execute()
69
+ return document.get('title', 'Untitled')
70
+ except Exception as e:
71
+ raise Exception(f"Error getting document title: {str(e)}")