Spaces:

VyLala
/

BioMetadataAudit

Running

App Files Community

VyLala committed on Sep 22, 2025

Commit

46054b9

verified ·

1 Parent(s): 792f983

Update mtdna_backend.py

Browse files

Files changed (1) hide show

mtdna_backend.py +151 -52

mtdna_backend.py CHANGED Viewed

@@ -151,6 +151,12 @@ def get_incomplete_accessions(file_path):
 # GOOGLE_SHEET_NAME = "known_samples"
 # USAGE_DRIVE_FILENAME = "user_usage_log.json"
 async def summarize_results(accession, stop_flag=None):
     # Early bail
@@ -234,30 +240,52 @@ async def summarize_results(accession, stop_flag=None):
             label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
           else: label = key
         if len(outputs[key]["source"]) == 0:  outputs[key]["source"] = ["No Links"]
         row = {
-            "Sample ID": label or "unknown",
-            "Predicted Country": pred_country or "unknown",
-            "Country Explanation": country_explanation or "unknown",
-            "Predicted Sample Type":pred_sample or "unknown",
-            "Sample Type Explanation":sample_explanation or "unknown",
-            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
-            "Time cost": outputs[key]["time_cost"]
         }
         #row_score.append(row)
         rows.append(list(row.values()))
         save_row = {
-            "Sample ID": label or "unknown",
-            "Predicted Country": pred_country or "unknown",
-            "Country Explanation": country_explanation or "unknown",
-            "Predicted Sample Type":pred_sample or "unknown",
-            "Sample Type Explanation":sample_explanation or "unknown",
-            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Query_cost": outputs[key]["query_cost"] or "",
             "Time cost": outputs[key]["time_cost"] or "",
-            "file_chunk":outputs[key]["file_chunk"] or "",
-            "file_all_output":outputs[key]["file_all_output"] or ""
         }
         #row_score.append(row)
         save_rows.append(list(save_row.values()))
@@ -311,6 +339,79 @@ async def summarize_results(accession, stop_flag=None):
     # except Exception as e:
     #     print(f"⚠️ Failed to save known output to Google Sheets: {e}")
     try:
         # Prepare as DataFrame
         df_new = pd.DataFrame(save_rows, columns=[
@@ -318,7 +419,7 @@ async def summarize_results(accession, stop_flag=None):
             "Predicted Sample Type", "Sample Type Explanation",
             "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
         ])
         # ✅ Setup Google Sheets
         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
@@ -326,44 +427,42 @@ async def summarize_results(accession, stop_flag=None):
         client = gspread.authorize(creds)
         spreadsheet = client.open("known_samples")
         sheet = spreadsheet.sheet1
-        # ✅ Read existing data
         existing_data = sheet.get_all_values()
-        if existing_data:
-            df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
-        else:
-            df_old = pd.DataFrame(columns=[
-                "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
-                "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
-                "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
-            ])
-        # ✅ Index by Sample ID
-        df_old.set_index("Sample ID", inplace=True)
-        df_new.set_index("Sample ID", inplace=True)
-        # ✅ Update only matching fields
-        update_columns = [
-            "Predicted Country", "Predicted Sample Type", "Country Explanation",
-            "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
-        ]
-        for idx, row in df_new.iterrows():
-            if idx not in df_old.index:
-                df_old.loc[idx] = ""  # new row, fill empty first
-            for col in update_columns:
-                if pd.notna(row[col]) and row[col] != "":
-                    df_old.at[idx, col] = row[col]
-        # ✅ Reset and write back
-        df_old.reset_index(inplace=True)
-        sheet.clear()
-        sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
-        print("✅ Match results saved to known_samples.")
     except Exception as e:
         print(f"❌ Failed to update known_samples: {e}")

 # GOOGLE_SHEET_NAME = "known_samples"
 # USAGE_DRIVE_FILENAME = "user_usage_log.json"
+def truncate_cell(value, max_len=49000):
+    """Ensure cell content never exceeds Google Sheets 50k char limit."""
+    if not isinstance(value, str):  # non-string inputs are stringified first, so the return value is always a str
+        value = str(value)
+    return value[:max_len] + ("... [TRUNCATED]" if len(value) > max_len else "")  # keeps the first max_len chars; the 15-char marker is appended on top, so a truncated result is max_len + 15 chars long
 async def summarize_results(accession, stop_flag=None):
     # Early bail
             label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
           else: label = key
         if len(outputs[key]["source"]) == 0:  outputs[key]["source"] = ["No Links"]
+        # row = {
+        #     "Sample ID": label or "unknown",
+        #     "Predicted Country": pred_country or "unknown",
+        #     "Country Explanation": country_explanation or "unknown",
+        #     "Predicted Sample Type":pred_sample or "unknown",
+        #     "Sample Type Explanation":sample_explanation or "unknown",
+        #     "Sources": "\n".join(outputs[key]["source"]) or "No Links",
+        #     "Time cost": outputs[key]["time_cost"]
+        # }
         row = {
+            "Sample ID": truncate_cell(label or "unknown"),
+            "Predicted Country": truncate_cell(pred_country or "unknown"),
+            "Country Explanation": truncate_cell(country_explanation or "unknown"),
+            "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
+            "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
+            "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
+            "Time cost": truncate_cell(outputs[key]["time_cost"])
         }
         #row_score.append(row)
         rows.append(list(row.values()))
+        # save_row = {
+        #     "Sample ID": label or "unknown",
+        #     "Predicted Country": pred_country or "unknown",
+        #     "Country Explanation": country_explanation or "unknown",
+        #     "Predicted Sample Type":pred_sample or "unknown",
+        #     "Sample Type Explanation":sample_explanation or "unknown",
+        #     "Sources": "\n".join(outputs[key]["source"]) or "No Links",
+        #     "Query_cost": outputs[key]["query_cost"] or "",
+        #     "Time cost": outputs[key]["time_cost"] or "",
+        #     "file_chunk":outputs[key]["file_chunk"] or "",
+        #     "file_all_output":outputs[key]["file_all_output"] or ""
+        # }
         save_row = {
+            "Sample ID": truncate_cell(label or "unknown"),
+            "Predicted Country": truncate_cell(pred_country or "unknown"),
+            "Country Explanation": truncate_cell(country_explanation or "unknown"),
+            "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
+            "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
+            "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
             "Query_cost": outputs[key]["query_cost"] or "",
             "Time cost": outputs[key]["time_cost"] or "",
+            "file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
+            "file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
         }
         #row_score.append(row)
         save_rows.append(list(save_row.values()))
     # except Exception as e:
     #     print(f"⚠️ Failed to save known output to Google Sheets: {e}")
+    # try:
+    #     # Prepare as DataFrame
+    #     df_new = pd.DataFrame(save_rows, columns=[
+    #         "Sample ID", "Predicted Country", "Country Explanation",
+    #         "Predicted Sample Type", "Sample Type Explanation",
+    #         "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
+    #     ])
+    #     # ✅ Setup Google Sheets
+    #     creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
+    #     scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+    #     creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
+    #     client = gspread.authorize(creds)
+    #     spreadsheet = client.open("known_samples")
+    #     sheet = spreadsheet.sheet1
+    #     # ✅ Read existing data
+    #     existing_data = sheet.get_all_values()
+    #     headers = existing_data[0]
+    #     if existing_data:
+    #         df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
+    #     else:
+    #         df_old = pd.DataFrame(columns=[
+    #             "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
+    #             "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
+    #             "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
+    #         ])
+    #     # ✅ Index by Sample ID
+    #     df_old.set_index("Sample ID", inplace=True)
+    #     df_new.set_index("Sample ID", inplace=True)
+    #     # ✅ Update only matching fields
+    #     update_columns = [
+    #         "Predicted Country", "Predicted Sample Type", "Country Explanation",
+    #         "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
+    #     ]
+    #     for idx, row in df_new.iterrows():
+    #         if idx not in df_old.index:
+    #             df_old.loc[idx] = ""  # new row, fill empty first
+    #         for col in update_columns:
+    #             if pd.notna(row[col]) and row[col] != "":
+    #                 df_old.at[idx, col] = row[col]
+    #     # ✅ Reset and write back
+    #     EXPECTED_COLUMNS = [
+    #     "Sample ID", "Predicted Country", "Country Explanation",
+    #     "Predicted Sample Type", "Sample Type Explanation",
+    #     "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
+    #     ]
+    #     # Force schema
+    #     for col in EXPECTED_COLUMNS:
+    #         if col not in df_old.columns:
+    #             df_old[col] = ""
+    #     df_old = df_old[EXPECTED_COLUMNS].reset_index(inplace=True)  # reorder + drop unexpected
+    #     # ✅ Safe update
+    #     sheet.clear()
+    #     sheet.update([EXPECTED_COLUMNS] + df_old.astype(str).values.tolist())
+    #     # df_old.reset_index(inplace=True)
+    #     # sheet.clear()
+    #     # sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
+    #     print("✅ Match results saved to known_samples.")
+    # except Exception as e:
+    #     print(f"❌ Failed to update known_samples: {e}")
     try:
         # Prepare as DataFrame
         df_new = pd.DataFrame(save_rows, columns=[
             "Predicted Sample Type", "Sample Type Explanation",
             "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
         ])
         # ✅ Setup Google Sheets
         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
         client = gspread.authorize(creds)
         spreadsheet = client.open("known_samples")
         sheet = spreadsheet.sheet1
+        # ✅ Load existing data
         existing_data = sheet.get_all_values()
+        headers = existing_data[0]
+        existing_df = pd.DataFrame(existing_data[1:], columns=headers)
+        # ✅ Build lookup: Sample ID → row index
+        id_to_row = {sid: i+2 for i, sid in enumerate(existing_df["Sample ID"])}
+        # +2 because gspread is 1-based and row 1 is headers
+        for _, row in df_new.iterrows():
+            sid = row["Sample ID"]
+            # Row values in correct schema order
+            row_values = [
+                row.get("Sample ID", ""),
+                row.get("Predicted Country", ""),
+                row.get("Country Explanation", ""),
+                row.get("Predicted Sample Type", ""),
+                row.get("Sample Type Explanation", ""),
+                row.get("Sources", ""),
+                row.get("Query_cost", ""),
+                row.get("Time cost", ""),
+                row.get("file_chunk", ""),
+                row.get("file_all_output", "")
+            ]
+            if sid in id_to_row:
+                # ✅ Update existing row
+                sheet.update(f"A{id_to_row[sid]}:J{id_to_row[sid]}", [row_values])
+            else:
+                # ✅ Append new row
+                sheet.append_row(row_values)
+        print("✅ Match results safely saved to known_samples.")
     except Exception as e:
         print(f"❌ Failed to update known_samples: {e}")