Spaces:
Running
Running
Update mtdna_backend.py
Browse files- mtdna_backend.py +151 -52
mtdna_backend.py
CHANGED
|
@@ -151,6 +151,12 @@ def get_incomplete_accessions(file_path):
|
|
| 151 |
|
| 152 |
# GOOGLE_SHEET_NAME = "known_samples"
|
| 153 |
# USAGE_DRIVE_FILENAME = "user_usage_log.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
async def summarize_results(accession, stop_flag=None):
|
| 156 |
# Early bail
|
|
@@ -234,30 +240,52 @@ async def summarize_results(accession, stop_flag=None):
|
|
| 234 |
label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
|
| 235 |
else: label = key
|
| 236 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
row = {
|
| 238 |
-
"Sample ID": label or "unknown",
|
| 239 |
-
"Predicted Country": pred_country or "unknown",
|
| 240 |
-
"Country Explanation": country_explanation or "unknown",
|
| 241 |
-
"Predicted Sample Type":pred_sample or "unknown",
|
| 242 |
-
"Sample Type Explanation":sample_explanation or "unknown",
|
| 243 |
-
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 244 |
-
"Time cost": outputs[key]["time_cost"]
|
| 245 |
}
|
| 246 |
#row_score.append(row)
|
| 247 |
rows.append(list(row.values()))
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
save_row = {
|
| 250 |
-
"Sample ID": label or "unknown",
|
| 251 |
-
"Predicted Country": pred_country or "unknown",
|
| 252 |
-
"Country Explanation": country_explanation or "unknown",
|
| 253 |
-
"Predicted Sample Type":pred_sample or "unknown",
|
| 254 |
-
"Sample Type Explanation":sample_explanation or "unknown",
|
| 255 |
-
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 256 |
"Query_cost": outputs[key]["query_cost"] or "",
|
| 257 |
"Time cost": outputs[key]["time_cost"] or "",
|
| 258 |
-
"file_chunk":outputs[key]["file_chunk"] or "",
|
| 259 |
-
"file_all_output":outputs[key]["file_all_output"] or ""
|
| 260 |
}
|
|
|
|
| 261 |
#row_score.append(row)
|
| 262 |
save_rows.append(list(save_row.values()))
|
| 263 |
|
|
@@ -311,6 +339,79 @@ async def summarize_results(accession, stop_flag=None):
|
|
| 311 |
|
| 312 |
# except Exception as e:
|
| 313 |
# print(f"β οΈ Failed to save known output to Google Sheets: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
try:
|
| 315 |
# Prepare as DataFrame
|
| 316 |
df_new = pd.DataFrame(save_rows, columns=[
|
|
@@ -318,7 +419,7 @@ async def summarize_results(accession, stop_flag=None):
|
|
| 318 |
"Predicted Sample Type", "Sample Type Explanation",
|
| 319 |
"Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 320 |
])
|
| 321 |
-
|
| 322 |
# β
Setup Google Sheets
|
| 323 |
creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
| 324 |
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
|
|
@@ -326,44 +427,42 @@ async def summarize_results(accession, stop_flag=None):
|
|
| 326 |
client = gspread.authorize(creds)
|
| 327 |
spreadsheet = client.open("known_samples")
|
| 328 |
sheet = spreadsheet.sheet1
|
| 329 |
-
|
| 330 |
-
# β
|
| 331 |
existing_data = sheet.get_all_values()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
if existing_data:
|
| 334 |
-
df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
|
| 335 |
-
|
| 336 |
-
else:
|
| 337 |
-
|
| 338 |
-
df_old = pd.DataFrame(columns=[
|
| 339 |
-
"Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
|
| 340 |
-
"Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
|
| 341 |
-
"Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
|
| 342 |
-
])
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
# β
Index by Sample ID
|
| 346 |
-
df_old.set_index("Sample ID", inplace=True)
|
| 347 |
-
df_new.set_index("Sample ID", inplace=True)
|
| 348 |
-
|
| 349 |
-
# β
Update only matching fields
|
| 350 |
-
update_columns = [
|
| 351 |
-
"Predicted Country", "Predicted Sample Type", "Country Explanation",
|
| 352 |
-
"Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 353 |
-
]
|
| 354 |
-
for idx, row in df_new.iterrows():
|
| 355 |
-
if idx not in df_old.index:
|
| 356 |
-
df_old.loc[idx] = "" # new row, fill empty first
|
| 357 |
-
for col in update_columns:
|
| 358 |
-
if pd.notna(row[col]) and row[col] != "":
|
| 359 |
-
df_old.at[idx, col] = row[col]
|
| 360 |
-
|
| 361 |
-
# β
Reset and write back
|
| 362 |
-
df_old.reset_index(inplace=True)
|
| 363 |
-
sheet.clear()
|
| 364 |
-
sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
|
| 365 |
-
print("β
Match results saved to known_samples.")
|
| 366 |
-
|
| 367 |
except Exception as e:
|
| 368 |
print(f"β Failed to update known_samples: {e}")
|
| 369 |
|
|
|
|
| 151 |
|
| 152 |
# GOOGLE_SHEET_NAME = "known_samples"
|
| 153 |
# USAGE_DRIVE_FILENAME = "user_usage_log.json"
|
| 154 |
+
def truncate_cell(value, max_len=49000):
    """Return ``value`` as a string that is at most ``max_len`` characters long.

    Used to keep cell content under the Google Sheets 50k-character limit.

    Args:
        value: Cell content. Non-string values are converted with ``str()``.
        max_len: Maximum length of the returned string, *including* the
            truncation marker.

    Returns:
        The text unchanged when it already fits; otherwise the text cut short
        and suffixed with ``"... [TRUNCATED]"`` so the total length equals
        ``max_len``.
    """
    marker = "... [TRUNCATED]"
    text = value if isinstance(value, str) else str(value)

    if len(text) <= max_len:
        return text

    # Not enough room for the marker itself: fall back to a hard cut so the
    # length guarantee still holds.
    if max_len <= len(marker):
        return text[:max(max_len, 0)]

    # Reserve space for the marker so the result never exceeds max_len.
    return text[:max_len - len(marker)] + marker
|
| 159 |
+
|
| 160 |
|
| 161 |
async def summarize_results(accession, stop_flag=None):
|
| 162 |
# Early bail
|
|
|
|
| 240 |
label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
|
| 241 |
else: label = key
|
| 242 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
| 243 |
+
# row = {
|
| 244 |
+
# "Sample ID": label or "unknown",
|
| 245 |
+
# "Predicted Country": pred_country or "unknown",
|
| 246 |
+
# "Country Explanation": country_explanation or "unknown",
|
| 247 |
+
# "Predicted Sample Type":pred_sample or "unknown",
|
| 248 |
+
# "Sample Type Explanation":sample_explanation or "unknown",
|
| 249 |
+
# "Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 250 |
+
# "Time cost": outputs[key]["time_cost"]
|
| 251 |
+
# }
|
| 252 |
row = {
|
| 253 |
+
"Sample ID": truncate_cell(label or "unknown"),
|
| 254 |
+
"Predicted Country": truncate_cell(pred_country or "unknown"),
|
| 255 |
+
"Country Explanation": truncate_cell(country_explanation or "unknown"),
|
| 256 |
+
"Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
|
| 257 |
+
"Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
|
| 258 |
+
"Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
|
| 259 |
+
"Time cost": truncate_cell(outputs[key]["time_cost"])
|
| 260 |
}
|
| 261 |
#row_score.append(row)
|
| 262 |
rows.append(list(row.values()))
|
| 263 |
|
| 264 |
+
# save_row = {
|
| 265 |
+
# "Sample ID": label or "unknown",
|
| 266 |
+
# "Predicted Country": pred_country or "unknown",
|
| 267 |
+
# "Country Explanation": country_explanation or "unknown",
|
| 268 |
+
# "Predicted Sample Type":pred_sample or "unknown",
|
| 269 |
+
# "Sample Type Explanation":sample_explanation or "unknown",
|
| 270 |
+
# "Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 271 |
+
# "Query_cost": outputs[key]["query_cost"] or "",
|
| 272 |
+
# "Time cost": outputs[key]["time_cost"] or "",
|
| 273 |
+
# "file_chunk":outputs[key]["file_chunk"] or "",
|
| 274 |
+
# "file_all_output":outputs[key]["file_all_output"] or ""
|
| 275 |
+
# }
|
| 276 |
save_row = {
|
| 277 |
+
"Sample ID": truncate_cell(label or "unknown"),
|
| 278 |
+
"Predicted Country": truncate_cell(pred_country or "unknown"),
|
| 279 |
+
"Country Explanation": truncate_cell(country_explanation or "unknown"),
|
| 280 |
+
"Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
|
| 281 |
+
"Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
|
| 282 |
+
"Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
|
| 283 |
"Query_cost": outputs[key]["query_cost"] or "",
|
| 284 |
"Time cost": outputs[key]["time_cost"] or "",
|
| 285 |
+
"file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
|
| 286 |
+
"file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
|
| 287 |
}
|
| 288 |
+
|
| 289 |
#row_score.append(row)
|
| 290 |
save_rows.append(list(save_row.values()))
|
| 291 |
|
|
|
|
| 339 |
|
| 340 |
# except Exception as e:
|
| 341 |
# print(f"β οΈ Failed to save known output to Google Sheets: {e}")
|
| 342 |
+
# try:
|
| 343 |
+
# # Prepare as DataFrame
|
| 344 |
+
# df_new = pd.DataFrame(save_rows, columns=[
|
| 345 |
+
# "Sample ID", "Predicted Country", "Country Explanation",
|
| 346 |
+
# "Predicted Sample Type", "Sample Type Explanation",
|
| 347 |
+
# "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 348 |
+
# ])
|
| 349 |
+
|
| 350 |
+
# # β
Setup Google Sheets
|
| 351 |
+
# creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
| 352 |
+
# scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
|
| 353 |
+
# creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
|
| 354 |
+
# client = gspread.authorize(creds)
|
| 355 |
+
# spreadsheet = client.open("known_samples")
|
| 356 |
+
# sheet = spreadsheet.sheet1
|
| 357 |
+
|
| 358 |
+
# # β
Read existing data
|
| 359 |
+
# existing_data = sheet.get_all_values()
|
| 360 |
+
# headers = existing_data[0]
|
| 361 |
+
|
| 362 |
+
# if existing_data:
|
| 363 |
+
# df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
|
| 364 |
+
|
| 365 |
+
# else:
|
| 366 |
+
|
| 367 |
+
# df_old = pd.DataFrame(columns=[
|
| 368 |
+
# "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
|
| 369 |
+
# "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
|
| 370 |
+
# "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
|
| 371 |
+
# ])
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# # β
Index by Sample ID
|
| 375 |
+
# df_old.set_index("Sample ID", inplace=True)
|
| 376 |
+
# df_new.set_index("Sample ID", inplace=True)
|
| 377 |
+
|
| 378 |
+
# # β
Update only matching fields
|
| 379 |
+
# update_columns = [
|
| 380 |
+
# "Predicted Country", "Predicted Sample Type", "Country Explanation",
|
| 381 |
+
# "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 382 |
+
# ]
|
| 383 |
+
# for idx, row in df_new.iterrows():
|
| 384 |
+
# if idx not in df_old.index:
|
| 385 |
+
# df_old.loc[idx] = "" # new row, fill empty first
|
| 386 |
+
# for col in update_columns:
|
| 387 |
+
# if pd.notna(row[col]) and row[col] != "":
|
| 388 |
+
# df_old.at[idx, col] = row[col]
|
| 389 |
+
|
| 390 |
+
# # β
Reset and write back
|
| 391 |
+
# EXPECTED_COLUMNS = [
|
| 392 |
+
# "Sample ID", "Predicted Country", "Country Explanation",
|
| 393 |
+
# "Predicted Sample Type", "Sample Type Explanation",
|
| 394 |
+
# "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 395 |
+
# ]
|
| 396 |
+
|
| 397 |
+
# # Force schema
|
| 398 |
+
# for col in EXPECTED_COLUMNS:
|
| 399 |
+
# if col not in df_old.columns:
|
| 400 |
+
# df_old[col] = ""
|
| 401 |
+
|
| 402 |
+
# df_old = df_old[EXPECTED_COLUMNS].reset_index(inplace=True) # reorder + drop unexpected
|
| 403 |
+
|
| 404 |
+
# # β
Safe update
|
| 405 |
+
# sheet.clear()
|
| 406 |
+
# sheet.update([EXPECTED_COLUMNS] + df_old.astype(str).values.tolist())
|
| 407 |
+
|
| 408 |
+
# # df_old.reset_index(inplace=True)
|
| 409 |
+
# # sheet.clear()
|
| 410 |
+
# # sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
|
| 411 |
+
# print("β
Match results saved to known_samples.")
|
| 412 |
+
|
| 413 |
+
# except Exception as e:
|
| 414 |
+
# print(f"β Failed to update known_samples: {e}")
|
| 415 |
try:
|
| 416 |
# Prepare as DataFrame
|
| 417 |
df_new = pd.DataFrame(save_rows, columns=[
|
|
|
|
| 419 |
"Predicted Sample Type", "Sample Type Explanation",
|
| 420 |
"Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
|
| 421 |
])
|
| 422 |
+
|
| 423 |
# ✅ Setup Google Sheets
|
| 424 |
creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
|
| 425 |
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
|
|
|
|
| 427 |
client = gspread.authorize(creds)
|
| 428 |
spreadsheet = client.open("known_samples")
|
| 429 |
sheet = spreadsheet.sheet1
|
| 430 |
+
|
| 431 |
+
# ✅ Load existing data
|
| 432 |
existing_data = sheet.get_all_values()
|
| 433 |
+
headers = existing_data[0]
|
| 434 |
+
existing_df = pd.DataFrame(existing_data[1:], columns=headers)
|
| 435 |
+
|
| 436 |
+
# ✅ Build lookup: Sample ID → row index
|
| 437 |
+
id_to_row = {sid: i+2 for i, sid in enumerate(existing_df["Sample ID"])}
|
| 438 |
+
# +2 because gspread is 1-based and row 1 is headers
|
| 439 |
+
|
| 440 |
+
for _, row in df_new.iterrows():
|
| 441 |
+
sid = row["Sample ID"]
|
| 442 |
+
|
| 443 |
+
# Row values in correct schema order
|
| 444 |
+
row_values = [
|
| 445 |
+
row.get("Sample ID", ""),
|
| 446 |
+
row.get("Predicted Country", ""),
|
| 447 |
+
row.get("Country Explanation", ""),
|
| 448 |
+
row.get("Predicted Sample Type", ""),
|
| 449 |
+
row.get("Sample Type Explanation", ""),
|
| 450 |
+
row.get("Sources", ""),
|
| 451 |
+
row.get("Query_cost", ""),
|
| 452 |
+
row.get("Time cost", ""),
|
| 453 |
+
row.get("file_chunk", ""),
|
| 454 |
+
row.get("file_all_output", "")
|
| 455 |
+
]
|
| 456 |
+
|
| 457 |
+
if sid in id_to_row:
|
| 458 |
+
# ✅ Update existing row
|
| 459 |
+
sheet.update(f"A{id_to_row[sid]}:J{id_to_row[sid]}", [row_values])
|
| 460 |
+
else:
|
| 461 |
+
# ✅ Append new row
|
| 462 |
+
sheet.append_row(row_values)
|
| 463 |
+
|
| 464 |
+
print("β
Match results safely saved to known_samples.")
|
| 465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
except Exception as e:
|
| 467 |
print(f"β Failed to update known_samples: {e}")
|
| 468 |
|