VyLala committed on
Commit
46054b9
·
verified ·
1 Parent(s): 792f983

Update mtdna_backend.py

Browse files
Files changed (1) hide show
  1. mtdna_backend.py +151 -52
mtdna_backend.py CHANGED
@@ -151,6 +151,12 @@ def get_incomplete_accessions(file_path):
151
 
152
  # GOOGLE_SHEET_NAME = "known_samples"
153
  # USAGE_DRIVE_FILENAME = "user_usage_log.json"
 
 
 
 
 
 
154
 
155
  async def summarize_results(accession, stop_flag=None):
156
  # Early bail
@@ -234,30 +240,52 @@ async def summarize_results(accession, stop_flag=None):
234
  label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
235
  else: label = key
236
  if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
 
 
 
 
 
 
 
 
 
237
  row = {
238
- "Sample ID": label or "unknown",
239
- "Predicted Country": pred_country or "unknown",
240
- "Country Explanation": country_explanation or "unknown",
241
- "Predicted Sample Type":pred_sample or "unknown",
242
- "Sample Type Explanation":sample_explanation or "unknown",
243
- "Sources": "\n".join(outputs[key]["source"]) or "No Links",
244
- "Time cost": outputs[key]["time_cost"]
245
  }
246
  #row_score.append(row)
247
  rows.append(list(row.values()))
248
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  save_row = {
250
- "Sample ID": label or "unknown",
251
- "Predicted Country": pred_country or "unknown",
252
- "Country Explanation": country_explanation or "unknown",
253
- "Predicted Sample Type":pred_sample or "unknown",
254
- "Sample Type Explanation":sample_explanation or "unknown",
255
- "Sources": "\n".join(outputs[key]["source"]) or "No Links",
256
  "Query_cost": outputs[key]["query_cost"] or "",
257
  "Time cost": outputs[key]["time_cost"] or "",
258
- "file_chunk":outputs[key]["file_chunk"] or "",
259
- "file_all_output":outputs[key]["file_all_output"] or ""
260
  }
 
261
  #row_score.append(row)
262
  save_rows.append(list(save_row.values()))
263
 
@@ -311,6 +339,79 @@ async def summarize_results(accession, stop_flag=None):
311
 
312
  # except Exception as e:
313
  # print(f"⚠️ Failed to save known output to Google Sheets: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  try:
315
  # Prepare as DataFrame
316
  df_new = pd.DataFrame(save_rows, columns=[
@@ -318,7 +419,7 @@ async def summarize_results(accession, stop_flag=None):
318
  "Predicted Sample Type", "Sample Type Explanation",
319
  "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
320
  ])
321
-
322
  # βœ… Setup Google Sheets
323
  creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
324
  scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
@@ -326,44 +427,42 @@ async def summarize_results(accession, stop_flag=None):
326
  client = gspread.authorize(creds)
327
  spreadsheet = client.open("known_samples")
328
  sheet = spreadsheet.sheet1
329
-
330
- # βœ… Read existing data
331
  existing_data = sheet.get_all_values()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- if existing_data:
334
- df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
335
-
336
- else:
337
-
338
- df_old = pd.DataFrame(columns=[
339
- "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
340
- "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
341
- "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
342
- ])
343
-
344
-
345
- # βœ… Index by Sample ID
346
- df_old.set_index("Sample ID", inplace=True)
347
- df_new.set_index("Sample ID", inplace=True)
348
-
349
- # βœ… Update only matching fields
350
- update_columns = [
351
- "Predicted Country", "Predicted Sample Type", "Country Explanation",
352
- "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
353
- ]
354
- for idx, row in df_new.iterrows():
355
- if idx not in df_old.index:
356
- df_old.loc[idx] = "" # new row, fill empty first
357
- for col in update_columns:
358
- if pd.notna(row[col]) and row[col] != "":
359
- df_old.at[idx, col] = row[col]
360
-
361
- # βœ… Reset and write back
362
- df_old.reset_index(inplace=True)
363
- sheet.clear()
364
- sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
365
- print("βœ… Match results saved to known_samples.")
366
-
367
  except Exception as e:
368
  print(f"❌ Failed to update known_samples: {e}")
369
 
 
151
 
152
  # GOOGLE_SHEET_NAME = "known_samples"
153
  # USAGE_DRIVE_FILENAME = "user_usage_log.json"
154
def truncate_cell(value, max_len=49000):
    """Clamp a value to a Sheets-safe string.

    Google Sheets rejects cells longer than 50,000 characters; this coerces
    *value* to ``str`` and, when it exceeds ``max_len``, cuts it at ``max_len``
    and appends a truncation marker (total stays under the 50k cap with the
    default ``max_len``).
    """
    text = value if isinstance(value, str) else str(value)
    # Marker is appended only when something was actually cut off.
    marker = "... [TRUNCATED]" if len(text) > max_len else ""
    return text[:max_len] + marker
159
+
160
 
161
  async def summarize_results(accession, stop_flag=None):
162
  # Early bail
 
240
  label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
241
  else: label = key
242
  if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
243
+ # row = {
244
+ # "Sample ID": label or "unknown",
245
+ # "Predicted Country": pred_country or "unknown",
246
+ # "Country Explanation": country_explanation or "unknown",
247
+ # "Predicted Sample Type":pred_sample or "unknown",
248
+ # "Sample Type Explanation":sample_explanation or "unknown",
249
+ # "Sources": "\n".join(outputs[key]["source"]) or "No Links",
250
+ # "Time cost": outputs[key]["time_cost"]
251
+ # }
252
  row = {
253
+ "Sample ID": truncate_cell(label or "unknown"),
254
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
255
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
256
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
257
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
258
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
259
+ "Time cost": truncate_cell(outputs[key]["time_cost"])
260
  }
261
  #row_score.append(row)
262
  rows.append(list(row.values()))
263
 
264
+ # save_row = {
265
+ # "Sample ID": label or "unknown",
266
+ # "Predicted Country": pred_country or "unknown",
267
+ # "Country Explanation": country_explanation or "unknown",
268
+ # "Predicted Sample Type":pred_sample or "unknown",
269
+ # "Sample Type Explanation":sample_explanation or "unknown",
270
+ # "Sources": "\n".join(outputs[key]["source"]) or "No Links",
271
+ # "Query_cost": outputs[key]["query_cost"] or "",
272
+ # "Time cost": outputs[key]["time_cost"] or "",
273
+ # "file_chunk":outputs[key]["file_chunk"] or "",
274
+ # "file_all_output":outputs[key]["file_all_output"] or ""
275
+ # }
276
  save_row = {
277
+ "Sample ID": truncate_cell(label or "unknown"),
278
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
279
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
280
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
281
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
282
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
283
  "Query_cost": outputs[key]["query_cost"] or "",
284
  "Time cost": outputs[key]["time_cost"] or "",
285
+ "file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
286
+ "file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
287
  }
288
+
289
  #row_score.append(row)
290
  save_rows.append(list(save_row.values()))
291
 
 
339
 
340
  # except Exception as e:
341
  # print(f"⚠️ Failed to save known output to Google Sheets: {e}")
342
+ # try:
343
+ # # Prepare as DataFrame
344
+ # df_new = pd.DataFrame(save_rows, columns=[
345
+ # "Sample ID", "Predicted Country", "Country Explanation",
346
+ # "Predicted Sample Type", "Sample Type Explanation",
347
+ # "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
348
+ # ])
349
+
350
+ # # βœ… Setup Google Sheets
351
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
352
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
353
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
354
+ # client = gspread.authorize(creds)
355
+ # spreadsheet = client.open("known_samples")
356
+ # sheet = spreadsheet.sheet1
357
+
358
+ # # βœ… Read existing data
359
+ # existing_data = sheet.get_all_values()
360
+ # headers = existing_data[0]
361
+
362
+ # if existing_data:
363
+ # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
364
+
365
+ # else:
366
+
367
+ # df_old = pd.DataFrame(columns=[
368
+ # "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
369
+ # "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
370
+ # "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
371
+ # ])
372
+
373
+
374
+ # # βœ… Index by Sample ID
375
+ # df_old.set_index("Sample ID", inplace=True)
376
+ # df_new.set_index("Sample ID", inplace=True)
377
+
378
+ # # βœ… Update only matching fields
379
+ # update_columns = [
380
+ # "Predicted Country", "Predicted Sample Type", "Country Explanation",
381
+ # "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
382
+ # ]
383
+ # for idx, row in df_new.iterrows():
384
+ # if idx not in df_old.index:
385
+ # df_old.loc[idx] = "" # new row, fill empty first
386
+ # for col in update_columns:
387
+ # if pd.notna(row[col]) and row[col] != "":
388
+ # df_old.at[idx, col] = row[col]
389
+
390
+ # # βœ… Reset and write back
391
+ # EXPECTED_COLUMNS = [
392
+ # "Sample ID", "Predicted Country", "Country Explanation",
393
+ # "Predicted Sample Type", "Sample Type Explanation",
394
+ # "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
395
+ # ]
396
+
397
+ # # Force schema
398
+ # for col in EXPECTED_COLUMNS:
399
+ # if col not in df_old.columns:
400
+ # df_old[col] = ""
401
+
402
+ # df_old = df_old[EXPECTED_COLUMNS].reset_index(inplace=True) # reorder + drop unexpected
403
+
404
+ # # βœ… Safe update
405
+ # sheet.clear()
406
+ # sheet.update([EXPECTED_COLUMNS] + df_old.astype(str).values.tolist())
407
+
408
+ # # df_old.reset_index(inplace=True)
409
+ # # sheet.clear()
410
+ # # sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
411
+ # print("βœ… Match results saved to known_samples.")
412
+
413
+ # except Exception as e:
414
+ # print(f"❌ Failed to update known_samples: {e}")
415
  try:
416
  # Prepare as DataFrame
417
  df_new = pd.DataFrame(save_rows, columns=[
 
419
  "Predicted Sample Type", "Sample Type Explanation",
420
  "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
421
  ])
422
+
423
  # βœ… Setup Google Sheets
424
  creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
425
  scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
 
427
  client = gspread.authorize(creds)
428
  spreadsheet = client.open("known_samples")
429
  sheet = spreadsheet.sheet1
430
+
431
+ # βœ… Load existing data
432
  existing_data = sheet.get_all_values()
433
+ headers = existing_data[0]
434
+ existing_df = pd.DataFrame(existing_data[1:], columns=headers)
435
+
436
+ # βœ… Build lookup: Sample ID β†’ row index
437
+ id_to_row = {sid: i+2 for i, sid in enumerate(existing_df["Sample ID"])}
438
+ # +2 because gspread is 1-based and row 1 is headers
439
+
440
+ for _, row in df_new.iterrows():
441
+ sid = row["Sample ID"]
442
+
443
+ # Row values in correct schema order
444
+ row_values = [
445
+ row.get("Sample ID", ""),
446
+ row.get("Predicted Country", ""),
447
+ row.get("Country Explanation", ""),
448
+ row.get("Predicted Sample Type", ""),
449
+ row.get("Sample Type Explanation", ""),
450
+ row.get("Sources", ""),
451
+ row.get("Query_cost", ""),
452
+ row.get("Time cost", ""),
453
+ row.get("file_chunk", ""),
454
+ row.get("file_all_output", "")
455
+ ]
456
+
457
+ if sid in id_to_row:
458
+ # βœ… Update existing row
459
+ sheet.update(f"A{id_to_row[sid]}:J{id_to_row[sid]}", [row_values])
460
+ else:
461
+ # βœ… Append new row
462
+ sheet.append_row(row_values)
463
+
464
+ print("βœ… Match results safely saved to known_samples.")
465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
  except Exception as e:
467
  print(f"❌ Failed to update known_samples: {e}")
468