Spaces:

uc-ctds
/

GDC-QAG

Sleeping

App Files Community

Michael committed on Aug 13, 2025

Commit

9b4901c

1 Parent(s): efc14ff

remove gpustat calls

Browse files

Files changed (3) hide show

app.py +21 -14
gdc_pipeline.py +20 -21
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -18,7 +18,6 @@ from transformers import (
 from methods import gdc_api_calls, utilities
 # set up various tokens
 hf_TOKEN = os.environ.get("hf_svc_ctds", False)
@@ -123,7 +122,11 @@ def infer_user_intent(query):
 # function to combine entities, intent and API call
 def construct_and_execute_api_call(query):
-    print("\nStep 1: Starting GDC-QAG on input natural language query:\n{}\n".format(query))
     # Infer entities
     initial_cancer_entities = utilities.return_initial_cancer_entities(
         query, model="en_ner_bc5cdr_md"
@@ -151,7 +154,7 @@ def construct_and_execute_api_call(query):
         query=query,
         gdc_genes_mutations=gdc_genes_mutations,
     )
-    print('\nStep 2: Entity Extraction\n')
     print("gene entities {}".format(gene_entities))
     print("mutation entities {}".format(mutation_entities))
     print("cancer entities {}".format(cancer_entities))
@@ -160,7 +163,7 @@ def construct_and_execute_api_call(query):
     intent = infer_user_intent(query)
     print("\nStep 3: Intent Inference:\n{}\n".format(intent))
     try:
-        print('\nStep 4: API call builder for intent {}\n'.format(intent))
         api_call_result, cancer_entities = execute_api_call(
             intent, gene_entities, mutation_entities, cancer_entities, query
         )
@@ -230,19 +233,19 @@ def get_prefinal_response(row):
         helper_output = row["helper_output"]
     except Exception as e:
         print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
-    print('\nStep 6: Augment LLM prompt for llama-3B\n')
     modified_query = utilities.construct_modified_query(query, helper_output)
-    print('{}'.format(modified_query))
-    print('\nStep 7: Generate LLM response R on query augmented prompt\n')
     prefinal_llama_with_helper_output = generate_response(modified_query)
-    print('{}'.format(prefinal_llama_with_helper_output))
     return pd.Series([modified_query, prefinal_llama_with_helper_output])
 def execute_pipeline(question: str):
     df = pd.DataFrame({"questions": [question]})
     print(f"\n\nQuestion received: {question}\n")
-    print("CUDA device name:", torch.cuda.get_device_name(0))
     # queries input file
     df[
@@ -295,11 +298,13 @@ def execute_pipeline(question: str):
     )
     result.index = ["GDC-QAG results"] * len(result)
-    print('Query Augmented Generation final response {}'.format(
-        '\n'.join(result['Query augmented generation'].astype(str))
-    ))
     print("completed")
     print("\nWriting result string now\n")
     result = result.T.to_dict()
@@ -309,7 +314,9 @@ def execute_pipeline(question: str):
     result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
     result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
-    result_string += f"Query augmented prompt: {result['GDC-QAG results']['Query augmented prompt']}"
     result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
     return result_string

 from methods import gdc_api_calls, utilities
 # set up various tokens
 hf_TOKEN = os.environ.get("hf_svc_ctds", False)
 # function to combine entities, intent and API call
 def construct_and_execute_api_call(query):
+    print(
+        "\nStep 1: Starting GDC-QAG on input natural language query:\n{}\n".format(
+            query
+        )
+    )
     # Infer entities
     initial_cancer_entities = utilities.return_initial_cancer_entities(
         query, model="en_ner_bc5cdr_md"
         query=query,
         gdc_genes_mutations=gdc_genes_mutations,
     )
+    print("\nStep 2: Entity Extraction\n")
     print("gene entities {}".format(gene_entities))
     print("mutation entities {}".format(mutation_entities))
     print("cancer entities {}".format(cancer_entities))
     intent = infer_user_intent(query)
     print("\nStep 3: Intent Inference:\n{}\n".format(intent))
     try:
+        print("\nStep 4: API call builder for intent {}\n".format(intent))
         api_call_result, cancer_entities = execute_api_call(
             intent, gene_entities, mutation_entities, cancer_entities, query
         )
         helper_output = row["helper_output"]
     except Exception as e:
         print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
+    print("\nStep 6: Augment LLM prompt for llama-3B\n")
     modified_query = utilities.construct_modified_query(query, helper_output)
+    print("{}".format(modified_query))
+    print("\nStep 7: Generate LLM response R on query augmented prompt\n")
     prefinal_llama_with_helper_output = generate_response(modified_query)
+    print("{}".format(prefinal_llama_with_helper_output))
     return pd.Series([modified_query, prefinal_llama_with_helper_output])
 def execute_pipeline(question: str):
     df = pd.DataFrame({"questions": [question]})
     print(f"\n\nQuestion received: {question}\n")
+    # print("CUDA device name:", torch.cuda.get_device_name(0))
     # queries input file
     df[
     )
     result.index = ["GDC-QAG results"] * len(result)
+    print(
+        "Query Augmented Generation final response {}".format(
+            "\n".join(result["Query augmented generation"].astype(str))
+        )
+    )
     print("completed")
     print("\nWriting result string now\n")
     result = result.T.to_dict()
     result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
     result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
+    result_string += (
+        f"Query augmented prompt: {result['GDC-QAG results']['Query augmented prompt']}"
+    )
     result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
     return result_string

gdc_pipeline.py CHANGED Viewed

@@ -151,13 +151,7 @@ def generate_response(modified_query, model, tok):
 def batch_test(
-    query,
-    model,
-    tok,
-    gdc_genes_mutations,
-    project_mappings,
-    intent_model,
-    intent_tok
 ):
     modified_query = utilities.construct_modified_query_base_llm(query)
     print(f"modified_query is: {modified_query}")
@@ -229,7 +223,7 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
     # retrieve and load GDC project mappings
     project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
-    print('loading intent model')
     intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
     print("loading gdc genes and mutations")
@@ -244,19 +238,24 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
         model=model,
         tok=tok,
         intent_model=intent_model,
-        intent_tok=intent_tok
     )
 @utilities.timeit
 def execute_pipeline(
-    df, gdc_genes_mutations, model,
-    tok, intent_model, intent_tok,
-    project_mappings, output_file_prefix
 ):
     print("starting pipeline")
-    print("CUDA available:", torch.cuda.is_available())
-    print("CUDA device name:", torch.cuda.get_device_name(0))
     # queries input file
     print(f"running test on input {df}")
@@ -277,7 +276,7 @@ def execute_pipeline(
             gdc_genes_mutations,
             project_mappings,
             intent_model,
-            intent_tok
         )
     )
@@ -326,8 +325,8 @@ def execute_pipeline(
         result = df_filtered_exploded
     else:
         result = df_filtered_exploded[final_columns].T
-        print('result {}'.format(result))
-    print('completed')
     return result
@@ -342,26 +341,26 @@ def main():
         df = pd.read_csv(input_file)
         output_file_prefix = os.path.basename(input_file).split(".")[0]
         execute_pipeline(
-            df,
             qag_requirements.gdc_genes_mutations,
             qag_requirements.model,
             qag_requirements.tok,
             qag_requirements.intent_model,
             qag_requirements.intent_tok,
             qag_requirements.project_mappings,
-            output_file_prefix
         )
     elif question:
         df = pd.DataFrame({"questions": [question]})
         execute_pipeline(
-            df,
             qag_requirements.gdc_genes_mutations,
             qag_requirements.model,
             qag_requirements.tok,
             qag_requirements.intent_model,
             qag_requirements.intent_tok,
             qag_requirements.project_mappings,
-            output_file_prefix=None
         )

 def batch_test(
+    query, model, tok, gdc_genes_mutations, project_mappings, intent_model, intent_tok
 ):
     modified_query = utilities.construct_modified_query_base_llm(query)
     print(f"modified_query is: {modified_query}")
     # retrieve and load GDC project mappings
     project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
+    print("loading intent model")
     intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
     print("loading gdc genes and mutations")
         model=model,
         tok=tok,
         intent_model=intent_model,
+        intent_tok=intent_tok,
     )
 @utilities.timeit
 def execute_pipeline(
+    df,
+    gdc_genes_mutations,
+    model,
+    tok,
+    intent_model,
+    intent_tok,
+    project_mappings,
+    output_file_prefix,
 ):
     print("starting pipeline")
+    # print("CUDA available:", torch.cuda.is_available())
+    # print("CUDA device name:", torch.cuda.get_device_name(0))
     # queries input file
     print(f"running test on input {df}")
             gdc_genes_mutations,
             project_mappings,
             intent_model,
+            intent_tok,
         )
     )
         result = df_filtered_exploded
     else:
         result = df_filtered_exploded[final_columns].T
+        print("result {}".format(result))
+    print("completed")
     return result
         df = pd.read_csv(input_file)
         output_file_prefix = os.path.basename(input_file).split(".")[0]
         execute_pipeline(
+            df,
             qag_requirements.gdc_genes_mutations,
             qag_requirements.model,
             qag_requirements.tok,
             qag_requirements.intent_model,
             qag_requirements.intent_tok,
             qag_requirements.project_mappings,
+            output_file_prefix,
         )
     elif question:
         df = pd.DataFrame({"questions": [question]})
         execute_pipeline(
+            df,
             qag_requirements.gdc_genes_mutations,
             qag_requirements.model,
             qag_requirements.tok,
             qag_requirements.intent_model,
             qag_requirements.intent_tok,
             qag_requirements.project_mappings,
+            output_file_prefix=None,
         )

requirements.txt CHANGED Viewed

@@ -1,7 +1,6 @@
 accelerate (>=1.9.0,<2.0.0)
 en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
 en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
-gpustat
 gradio (>=5.35.0,<6.0.0)
 guidance (>=0.2.4,<0.3.0)
 huggingface-hub (>=0.33.2,<0.34.0)

 accelerate (>=1.9.0,<2.0.0)
 en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
 en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
 gradio (>=5.35.0,<6.0.0)
 guidance (>=0.2.4,<0.3.0)
 huggingface-hub (>=0.33.2,<0.34.0)