jebin2 committed on
Commit
34471b9
·
1 Parent(s): 05b481f

data flow via google sheet

Browse files
requirements.txt CHANGED
@@ -15,6 +15,7 @@ google-cloud-speech==2.34.0
15
  google-api-python-client==2.184.0
16
  google-auth-oauthlib==1.2.3
17
  librosa==0.11.0
 
18
 
19
  # aiosignal==1.4.0
20
  # annotated-types==0.7.0
 
15
  google-api-python-client==2.184.0
16
  google-auth-oauthlib==1.2.3
17
  librosa==0.11.0
18
+ gspread
19
 
20
  # aiosignal==1.4.0
21
  # annotated-types==0.7.0
src/automation.py CHANGED
@@ -381,6 +381,7 @@ class ContentAutomation:
381
  if os.getenv("INFLOXA", "false").lower() == "true":
382
  from video_downloader import VideoDownloader
383
  download_path="testData/infloxa"
 
384
 
385
  allowed_videos = []
386
 
 
381
  if os.getenv("INFLOXA", "false").lower() == "true":
382
  from video_downloader import VideoDownloader
383
  download_path="testData/infloxa"
384
+ Path(download_path).mkdir(parents=True, exist_ok=True)
385
 
386
  allowed_videos = []
387
 
src/get_refresh_token.py CHANGED
@@ -1,43 +1,24 @@
1
- """
2
- get_refresh_token.py
3
- ---------------------
4
- Use this script to generate a YouTube OAuth refresh token.
5
- Run locally, log in with the YouTube account you want to upload to.
6
- Then copy the printed REFRESH_TOKEN into your GitHub Secrets.
7
- """
8
-
9
  from google_auth_oauthlib.flow import InstalledAppFlow
 
 
 
10
 
11
- SCOPES = ['https://www.googleapis.com/auth/youtube.upload']
 
12
 
13
  def main():
14
- print("\n🔑 Starting YouTube OAuth flow...")
15
- print("A browser window will open — log in with the YouTube account you want to upload to.\n")
16
 
17
- flow = InstalledAppFlow.from_client_config(
18
- {
19
- "installed": {
20
- "client_id": input("Enter your CLIENT_ID: ").strip(),
21
- "client_secret": input("Enter your CLIENT_SECRET: ").strip(),
22
- "redirect_uris": ["http://localhost"],
23
- "auth_uri": "https://accounts.google.com/o/oauth2/auth",
24
- "token_uri": "https://oauth2.googleapis.com/token"
25
- }
26
- },
27
  SCOPES
28
  )
29
 
30
  creds = flow.run_local_server(port=0)
31
- print("\n✅ Login successful!")
32
- print("------------------------------------------------------")
33
- print("📘 REFRESH TOKEN (save this in GitHub Secrets):")
34
  print(creds.refresh_token)
35
- print("------------------------------------------------------")
36
- print("\nExample GitHub Secrets setup:")
37
- print(" YOUTUBE_CLIENT_ID=<your client id>")
38
- print(" YOUTUBE_CLIENT_SECRET=<your client secret>")
39
- print(" YOUTUBE_REFRESH_TOKEN=<this printed value>")
40
- print("------------------------------------------------------")
41
 
42
  if __name__ == "__main__":
43
  main()
 
 
 
 
 
 
 
 
 
1
  from google_auth_oauthlib.flow import InstalledAppFlow
2
+ import os
3
+ from dotenv import load_dotenv
4
+ load_dotenv()
5
 
6
+ # Read-only Google Drive scope (this script no longer requests the YouTube upload scope)
7
+ SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
8
 
9
  def main():
10
+ print("🔑 Starting OAuth flow...")
 
11
 
12
+ flow = InstalledAppFlow.from_client_secrets_file(
13
+ os.getenv("SERVER_GOOGLE_CLIENT_FILE"),
 
 
 
 
 
 
 
 
14
  SCOPES
15
  )
16
 
17
  creds = flow.run_local_server(port=0)
18
+
19
+ print("\n✅ AUTH SUCCESS")
20
+ print("REFRESH TOKEN:\n")
21
  print(creds.refresh_token)
 
 
 
 
 
 
22
 
23
  if __name__ == "__main__":
24
  main()
src/google_sheet_reader.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ import gspread
7
+ from google.auth import default
8
+ from google.auth.credentials import Credentials as BaseCredentials
9
+
10
+ from utils import logger
11
+
12
+
13
+ class GoogleSheetReader:
14
+ """
15
+ Utility class to read & write Google Sheets using ADC (Local) / WIF (CI)
16
+ """
17
+
18
+ SCOPES = [
19
+ "https://www.googleapis.com/auth/spreadsheets",
20
+ "https://www.googleapis.com/auth/drive.file",
21
+ ]
22
+
23
+ def __init__(self):
24
+ logger.info("Initializing GoogleSheetReader")
25
+
26
+ # Support both name and ID
27
+ self.sheet_name = os.getenv("GSHEET_NAME")
28
+ self.sheet_id = os.getenv("GSHEET_ID")
29
+
30
+ if not self.sheet_name and not self.sheet_id:
31
+ logger.error("Must provide either GSHEET_NAME or GSHEET_ID")
32
+ raise RuntimeError("Must provide either GSHEET_NAME or GSHEET_ID")
33
+
34
+ self.worksheet_name = os.getenv("GSHEET_WORKSHEET")
35
+
36
+ logger.debug(
37
+ "Sheet config | name=%s | id=%s | worksheet=%s",
38
+ self.sheet_name or "N/A",
39
+ self.sheet_id or "N/A",
40
+ self.worksheet_name or "sheet1",
41
+ )
42
+
43
+ self.client = self._authorize()
44
+ self.sheet = self._open_sheet()
45
+
46
+ # -------- Temp directory handling --------
47
+ self.temp_dir = self._init_temp_dir()
48
+ logger.info("Using temp directory: %s", self.temp_dir)
49
+
50
+ # ------------------ Internal helpers ------------------
51
+
52
+ def _get_env(self, key: str) -> str:
53
+ value = os.getenv(key)
54
+ if not value:
55
+ logger.error("Missing required env variable: %s", key)
56
+ raise RuntimeError(f"Missing required env variable: {key}")
57
+ return value
58
+
59
+ def _authorize(self):
60
+ logger.info("Authorizing with Application Default Credentials (ADC)")
61
+ creds, _ = default()
62
+
63
+ # IMPORTANT: re-scope user ADC credentials
64
+ if isinstance(creds, BaseCredentials) and hasattr(creds, "with_scopes"):
65
+ creds = creds.with_scopes(self.SCOPES)
66
+
67
+ return gspread.authorize(creds)
68
+
69
+ def _open_sheet(self):
70
+ """Open spreadsheet by ID (preferred) or name"""
71
+ try:
72
+ if self.sheet_id:
73
+ logger.info("Opening spreadsheet by ID: %s", self.sheet_id)
74
+ spreadsheet = self.client.open_by_key(self.sheet_id)
75
+ else:
76
+ logger.info("Opening spreadsheet by name: %s", self.sheet_name)
77
+ spreadsheet = self.client.open(self.sheet_name)
78
+ except gspread.SpreadsheetNotFound:
79
+ logger.error("Spreadsheet not found!")
80
+ logger.error("Available spreadsheets:")
81
+ try:
82
+ for sheet in self.client.openall():
83
+ logger.error(" - %s (ID: %s)", sheet.title, sheet.id)
84
+ except Exception as e:
85
+ logger.error("Could not list spreadsheets: %s", e)
86
+ raise
87
+
88
+ if self.worksheet_name:
89
+ logger.info("Opening worksheet: %s", self.worksheet_name)
90
+ return spreadsheet.worksheet(self.worksheet_name)
91
+
92
+ logger.info("Opening default worksheet (sheet1)")
93
+ return spreadsheet.sheet1
94
+
95
+ def _init_temp_dir(self) -> Path:
96
+ """
97
+ Creates a temp directory.
98
+ Uses fixed path during test automation if configured.
99
+ """
100
+ if os.getenv("TEST_AUTOMATION", "").lower() == "true":
101
+ base_dir = os.getenv("TEST_DATA_DIRECTORY")
102
+ if not base_dir:
103
+ raise RuntimeError("TEST_DATA_DIRECTORY must be set when TEST_AUTOMATION=true")
104
+
105
+ path = Path(base_dir) / "output"
106
+ path.mkdir(parents=True, exist_ok=True)
107
+ return path
108
+
109
+ return Path(tempfile.mkdtemp(prefix="gsheet_"))
110
+
111
+ # ------------------ Sheet creation helpers ------------------
112
+
113
+ def _get_or_create_spreadsheet(self, sheet_name: str):
114
+ try:
115
+ logger.info("Opening spreadsheet: %s", sheet_name)
116
+ return self.client.open(sheet_name)
117
+ except gspread.SpreadsheetNotFound:
118
+ logger.warning("Spreadsheet not found. Creating new one: %s", sheet_name)
119
+ return self.client.create(sheet_name)
120
+
121
+ def _get_or_create_worksheet(
122
+ self,
123
+ spreadsheet,
124
+ worksheet_name: str,
125
+ rows: int = 1000,
126
+ cols: int = 26,
127
+ ):
128
+ try:
129
+ logger.info("Opening worksheet: %s", worksheet_name)
130
+ return spreadsheet.worksheet(worksheet_name)
131
+ except gspread.WorksheetNotFound:
132
+ logger.warning("Worksheet not found. Creating: %s", worksheet_name)
133
+ return spreadsheet.add_worksheet(
134
+ title=worksheet_name,
135
+ rows=rows,
136
+ cols=cols,
137
+ )
138
+
139
+ def _ensure_header(self, worksheet, header: list[str]) -> list[str]:
140
+ """
141
+ Ensures header row exists and contains all required columns.
142
+ Returns final header order.
143
+ """
144
+ existing = worksheet.row_values(1)
145
+
146
+ # Case 1: Empty sheet
147
+ if not existing:
148
+ logger.info("Sheet empty. Writing header.")
149
+ worksheet.insert_row(header, index=1)
150
+ return header
151
+
152
+ # Case 2: Header exists, check for missing columns
153
+ updated = False
154
+ final_header = existing.copy()
155
+
156
+ for col in header:
157
+ if col not in final_header:
158
+ logger.info("Adding missing header column: %s", col)
159
+ final_header.append(col)
160
+ updated = True
161
+
162
+ if updated:
163
+ worksheet.update(values=[final_header], range_name="1:1")
164
+
165
+ return final_header
166
+
167
+
168
+ # ------------------ Core APIs ------------------
169
+
170
+ def get_all_values(self) -> list:
171
+ logger.debug("Fetching all values from sheet")
172
+ return self.sheet.get_all_values()
173
+
174
+ def get_dataframe(self):
175
+ logger.debug("Converting sheet to DataFrame")
176
+ import pandas as pd
177
+
178
+ data = self.get_all_values()
179
+ return pd.DataFrame(data[1:], columns=data[0])
180
+
181
+ # ------------------ Filtering APIs ------------------
182
+
183
+ def filter_rows(
184
+ self,
185
+ column_name: str = "STATUS",
186
+ match_value: str = "SELECTED",
187
+ case_insensitive: bool = True,
188
+ ) -> list:
189
+ logger.info(
190
+ "Filtering rows | column=%s | value=%s | case_insensitive=%s",
191
+ column_name,
192
+ match_value,
193
+ case_insensitive,
194
+ )
195
+
196
+ rows = self.get_all_values()
197
+ if not rows:
198
+ logger.warning("Sheet is empty")
199
+ return []
200
+
201
+ header = rows[0]
202
+
203
+ try:
204
+ col_idx = header.index(column_name)
205
+ except ValueError:
206
+ logger.error("Column '%s' not found in sheet", column_name)
207
+ raise RuntimeError(f"Column '{column_name}' not found in sheet")
208
+
209
+ def _match(val: str) -> bool:
210
+ if case_insensitive:
211
+ return val.strip().lower() == match_value.lower()
212
+ return val == match_value
213
+
214
+ filtered = [
215
+ row for row in rows[1:]
216
+ if len(row) > col_idx and _match(row[col_idx])
217
+ ]
218
+
219
+ logger.info(
220
+ "Filtered rows: %d / %d",
221
+ len(filtered),
222
+ max(len(rows) - 1, 0),
223
+ )
224
+
225
+ return [header] + filtered
226
+
227
+ def get_filtered_dataframe(
228
+ self,
229
+ column_name: str = "STATUS",
230
+ match_value: str = "SELECTED",
231
+ case_insensitive: bool = True,
232
+ ):
233
+ logger.debug("Creating filtered DataFrame")
234
+ import pandas as pd
235
+
236
+ rows = self.filter_rows(
237
+ column_name=column_name,
238
+ match_value=match_value,
239
+ case_insensitive=case_insensitive,
240
+ )
241
+
242
+ if len(rows) <= 1:
243
+ logger.warning("No matching rows found")
244
+ return pd.DataFrame(columns=rows[0] if rows else [])
245
+
246
+ return pd.DataFrame(rows[1:], columns=rows[0])
247
+
248
+ def create_or_update_sheet(
249
+ self,
250
+ sheet_name: str | None = None,
251
+ worksheet_name: str | None = None,
252
+ header: list[str] = None,
253
+ values: list[dict] = None,
254
+ ):
255
+ """
256
+ Create or update a sheet + worksheet.
257
+ Ensures headers exist and appends values.
258
+
259
+ If sheet_name or worksheet_name not provided, uses instance's sheet.
260
+
261
+ values: List of dicts where keys match header names
262
+ """
263
+
264
+ if not values:
265
+ logger.warning("No values provided. Nothing to append.")
266
+ return
267
+
268
+ # Determine which spreadsheet to use
269
+ if sheet_name is None:
270
+ # No sheet_name provided - use instance's sheet
271
+ if self.sheet_id:
272
+ # Prefer ID over name (more reliable)
273
+ logger.info("Using instance sheet by ID: %s", self.sheet_id)
274
+ spreadsheet = self.client.open_by_key(self.sheet_id)
275
+ elif self.sheet_name:
276
+ logger.info("Using instance sheet name: %s", self.sheet_name)
277
+ spreadsheet = self._get_or_create_spreadsheet(self.sheet_name)
278
+ else:
279
+ raise ValueError("No sheet_name provided and no instance sheet configured")
280
+ else:
281
+ # sheet_name provided - check if it matches instance's sheet
282
+ if self.sheet_name and sheet_name == self.sheet_name and self.sheet_id:
283
+ # Same sheet name and we have an ID - use the ID for reliability
284
+ logger.info("Sheet name matches instance. Using ID: %s", self.sheet_id)
285
+ spreadsheet = self.client.open_by_key(self.sheet_id)
286
+ else:
287
+ # Different sheet or no ID - open/create by name
288
+ logger.info("Opening/creating sheet by name: %s", sheet_name)
289
+ spreadsheet = self._get_or_create_spreadsheet(sheet_name)
290
+
291
+ # Use instance's worksheet if not specified
292
+ if worksheet_name is None:
293
+ worksheet_name = self.worksheet_name or "Sheet1"
294
+ logger.info("Using worksheet name: %s", worksheet_name)
295
+
296
+ worksheet = self._get_or_create_worksheet(spreadsheet, worksheet_name)
297
+
298
+ final_header = self._ensure_header(worksheet, header)
299
+
300
+ # Convert dict rows -> ordered list rows
301
+ rows_to_append = []
302
+ for item in values:
303
+ row = [item.get(col, "") for col in final_header]
304
+ rows_to_append.append(row)
305
+
306
+ logger.info(
307
+ "Appending %d rows to %s / %s",
308
+ len(rows_to_append),
309
+ sheet_name or f"ID:{self.sheet_id}" if self.sheet_id else self.sheet_name,
310
+ worksheet_name,
311
+ )
312
+
313
+ worksheet.append_rows(
314
+ rows_to_append,
315
+ value_input_option="USER_ENTERED",
316
+ )
317
+
318
+
319
+ # ------------------ CSV Export ------------------
320
+
321
+ def export_csv(
322
+ self,
323
+ output_path: str | None = None,
324
+ column_name: str | None = None,
325
+ match_value: str = "SELECTED",
326
+ case_insensitive: bool = True,
327
+ ) -> Path:
328
+ """
329
+ Export sheet (or filtered rows) to CSV.
330
+ If output_path is None, file is written inside temp_dir.
331
+ """
332
+ if output_path:
333
+ output_path = Path(output_path)
334
+ else:
335
+ output_path = self.temp_dir / "sheet_export.csv"
336
+
337
+ logger.info("Exporting CSV to %s", output_path)
338
+
339
+ if column_name:
340
+ rows = self.filter_rows(
341
+ column_name=column_name,
342
+ match_value=match_value,
343
+ case_insensitive=case_insensitive,
344
+ )
345
+ else:
346
+ rows = self.get_all_values()
347
+
348
+ output_path.parent.mkdir(parents=True, exist_ok=True)
349
+
350
+ with output_path.open("w", newline="", encoding="utf-8") as f:
351
+ writer = csv.writer(f)
352
+ writer.writerows(rows)
353
+
354
+ logger.info("CSV export completed | rows=%d", len(rows) - 1)
355
+ return output_path
356
+
357
+
358
+ # ------------------ CLI entrypoint ------------------
359
+
360
def main():
    """
    CLI entry point demonstrating GoogleSheetReader.

    Examples are controlled via env flags:
      - EXAMPLE_WRITE=true          append two demo rows to the configured sheet
      - EXAMPLE_FILTER_EXPORT=true  export rows whose STATUS is SELECTED to CSV

    When neither flag is set, the full sheet is exported to CSV. OUTPUT_CSV
    optionally names the CSV destination; without it the reader's temp
    directory is used.
    """

    # python-dotenv is optional: load a .env file only when it is installed.
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()

    def _flag(name: str) -> bool:
        return os.getenv(name, "").lower() == "true"

    reader = GoogleSheetReader()
    ran_example = False

    # ------------------ EXAMPLE 1: CREATE / UPDATE SHEET ------------------
    if _flag("EXAMPLE_WRITE"):
        ran_example = True
        logger.info("Running EXAMPLE_WRITE")

        reader.create_or_update_sheet(
            header=["ID", "STATUS", "MESSAGE", "DURATION"],
            values=[
                {"ID": "1", "STATUS": "SUCCESS", "MESSAGE": "Video done", "DURATION": "10s"},
                {"ID": "2", "STATUS": "FAILED", "MESSAGE": "Render error"},
            ],
        )

    # ------------------ EXAMPLE 2: FILTER + EXPORT CSV ------------------
    if _flag("EXAMPLE_FILTER_EXPORT"):
        ran_example = True
        logger.info("Running EXAMPLE_FILTER_EXPORT")

        csv_path = reader.export_csv(
            output_path=os.getenv("OUTPUT_CSV"),
            column_name="STATUS",
            match_value="SELECTED",
        )

        logger.info("Filtered CSV written to %s", csv_path)

    # ------------------ DEFAULT: EXPORT FULL SHEET ------------------
    if not ran_example:
        csv_path = reader.export_csv(os.getenv("OUTPUT_CSV"))
        logger.info("Full sheet exported to %s", csv_path)
402
+
403
+
404
+ if __name__ == "__main__":
405
+ main()