Michael committed on
Commit
9b4901c
·
1 Parent(s): efc14ff

remove gpustat calls

Browse files
Files changed (3) hide show
  1. app.py +21 -14
  2. gdc_pipeline.py +20 -21
  3. requirements.txt +0 -1
app.py CHANGED
@@ -18,7 +18,6 @@ from transformers import (
18
 
19
  from methods import gdc_api_calls, utilities
20
 
21
-
22
  # set up various tokens
23
  hf_TOKEN = os.environ.get("hf_svc_ctds", False)
24
 
@@ -123,7 +122,11 @@ def infer_user_intent(query):
123
 
124
  # function to combine entities, intent and API call
125
  def construct_and_execute_api_call(query):
126
- print("\nStep 1: Starting GDC-QAG on input natural language query:\n{}\n".format(query))
 
 
 
 
127
  # Infer entities
128
  initial_cancer_entities = utilities.return_initial_cancer_entities(
129
  query, model="en_ner_bc5cdr_md"
@@ -151,7 +154,7 @@ def construct_and_execute_api_call(query):
151
  query=query,
152
  gdc_genes_mutations=gdc_genes_mutations,
153
  )
154
- print('\nStep 2: Entity Extraction\n')
155
  print("gene entities {}".format(gene_entities))
156
  print("mutation entities {}".format(mutation_entities))
157
  print("cancer entities {}".format(cancer_entities))
@@ -160,7 +163,7 @@ def construct_and_execute_api_call(query):
160
  intent = infer_user_intent(query)
161
  print("\nStep 3: Intent Inference:\n{}\n".format(intent))
162
  try:
163
- print('\nStep 4: API call builder for intent {}\n'.format(intent))
164
  api_call_result, cancer_entities = execute_api_call(
165
  intent, gene_entities, mutation_entities, cancer_entities, query
166
  )
@@ -230,19 +233,19 @@ def get_prefinal_response(row):
230
  helper_output = row["helper_output"]
231
  except Exception as e:
232
  print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
233
- print('\nStep 6: Augment LLM prompt for llama-3B\n')
234
  modified_query = utilities.construct_modified_query(query, helper_output)
235
- print('{}'.format(modified_query))
236
- print('\nStep 7: Generate LLM response R on query augmented prompt\n')
237
  prefinal_llama_with_helper_output = generate_response(modified_query)
238
- print('{}'.format(prefinal_llama_with_helper_output))
239
  return pd.Series([modified_query, prefinal_llama_with_helper_output])
240
 
241
 
242
  def execute_pipeline(question: str):
243
  df = pd.DataFrame({"questions": [question]})
244
  print(f"\n\nQuestion received: {question}\n")
245
- print("CUDA device name:", torch.cuda.get_device_name(0))
246
 
247
  # queries input file
248
  df[
@@ -295,11 +298,13 @@ def execute_pipeline(question: str):
295
  )
296
  result.index = ["GDC-QAG results"] * len(result)
297
 
298
- print('Query Augmented Generation final response {}'.format(
299
- '\n'.join(result['Query augmented generation'].astype(str))
300
- ))
 
 
301
  print("completed")
302
-
303
  print("\nWriting result string now\n")
304
 
305
  result = result.T.to_dict()
@@ -309,7 +314,9 @@ def execute_pipeline(question: str):
309
 
310
  result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
311
  result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
312
- result_string += f"Query augmented prompt: {result['GDC-QAG results']['Query augmented prompt']}"
 
 
313
  result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
314
 
315
  return result_string
 
18
 
19
  from methods import gdc_api_calls, utilities
20
 
 
21
  # set up various tokens
22
  hf_TOKEN = os.environ.get("hf_svc_ctds", False)
23
 
 
122
 
123
  # function to combine entities, intent and API call
124
  def construct_and_execute_api_call(query):
125
+ print(
126
+ "\nStep 1: Starting GDC-QAG on input natural language query:\n{}\n".format(
127
+ query
128
+ )
129
+ )
130
  # Infer entities
131
  initial_cancer_entities = utilities.return_initial_cancer_entities(
132
  query, model="en_ner_bc5cdr_md"
 
154
  query=query,
155
  gdc_genes_mutations=gdc_genes_mutations,
156
  )
157
+ print("\nStep 2: Entity Extraction\n")
158
  print("gene entities {}".format(gene_entities))
159
  print("mutation entities {}".format(mutation_entities))
160
  print("cancer entities {}".format(cancer_entities))
 
163
  intent = infer_user_intent(query)
164
  print("\nStep 3: Intent Inference:\n{}\n".format(intent))
165
  try:
166
+ print("\nStep 4: API call builder for intent {}\n".format(intent))
167
  api_call_result, cancer_entities = execute_api_call(
168
  intent, gene_entities, mutation_entities, cancer_entities, query
169
  )
 
233
  helper_output = row["helper_output"]
234
  except Exception as e:
235
  print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
236
+ print("\nStep 6: Augment LLM prompt for llama-3B\n")
237
  modified_query = utilities.construct_modified_query(query, helper_output)
238
+ print("{}".format(modified_query))
239
+ print("\nStep 7: Generate LLM response R on query augmented prompt\n")
240
  prefinal_llama_with_helper_output = generate_response(modified_query)
241
+ print("{}".format(prefinal_llama_with_helper_output))
242
  return pd.Series([modified_query, prefinal_llama_with_helper_output])
243
 
244
 
245
  def execute_pipeline(question: str):
246
  df = pd.DataFrame({"questions": [question]})
247
  print(f"\n\nQuestion received: {question}\n")
248
+ # print("CUDA device name:", torch.cuda.get_device_name(0))
249
 
250
  # queries input file
251
  df[
 
298
  )
299
  result.index = ["GDC-QAG results"] * len(result)
300
 
301
+ print(
302
+ "Query Augmented Generation final response {}".format(
303
+ "\n".join(result["Query augmented generation"].astype(str))
304
+ )
305
+ )
306
  print("completed")
307
+
308
  print("\nWriting result string now\n")
309
 
310
  result = result.T.to_dict()
 
314
 
315
  result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
316
  result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
317
+ result_string += (
318
+ f"Query augmented prompt: {result['GDC-QAG results']['Query augmented prompt']}"
319
+ )
320
  result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
321
 
322
  return result_string
gdc_pipeline.py CHANGED
@@ -151,13 +151,7 @@ def generate_response(modified_query, model, tok):
151
 
152
 
153
  def batch_test(
154
- query,
155
- model,
156
- tok,
157
- gdc_genes_mutations,
158
- project_mappings,
159
- intent_model,
160
- intent_tok
161
  ):
162
  modified_query = utilities.construct_modified_query_base_llm(query)
163
  print(f"modified_query is: {modified_query}")
@@ -229,7 +223,7 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
229
  # retrieve and load GDC project mappings
230
  project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
231
 
232
- print('loading intent model')
233
  intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
234
 
235
  print("loading gdc genes and mutations")
@@ -244,19 +238,24 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
244
  model=model,
245
  tok=tok,
246
  intent_model=intent_model,
247
- intent_tok=intent_tok
248
  )
249
 
250
 
251
  @utilities.timeit
252
  def execute_pipeline(
253
- df, gdc_genes_mutations, model,
254
- tok, intent_model, intent_tok,
255
- project_mappings, output_file_prefix
 
 
 
 
 
256
  ):
257
  print("starting pipeline")
258
- print("CUDA available:", torch.cuda.is_available())
259
- print("CUDA device name:", torch.cuda.get_device_name(0))
260
 
261
  # queries input file
262
  print(f"running test on input {df}")
@@ -277,7 +276,7 @@ def execute_pipeline(
277
  gdc_genes_mutations,
278
  project_mappings,
279
  intent_model,
280
- intent_tok
281
  )
282
  )
283
 
@@ -326,8 +325,8 @@ def execute_pipeline(
326
  result = df_filtered_exploded
327
  else:
328
  result = df_filtered_exploded[final_columns].T
329
- print('result {}'.format(result))
330
- print('completed')
331
  return result
332
 
333
 
@@ -342,26 +341,26 @@ def main():
342
  df = pd.read_csv(input_file)
343
  output_file_prefix = os.path.basename(input_file).split(".")[0]
344
  execute_pipeline(
345
- df,
346
  qag_requirements.gdc_genes_mutations,
347
  qag_requirements.model,
348
  qag_requirements.tok,
349
  qag_requirements.intent_model,
350
  qag_requirements.intent_tok,
351
  qag_requirements.project_mappings,
352
- output_file_prefix
353
  )
354
  elif question:
355
  df = pd.DataFrame({"questions": [question]})
356
  execute_pipeline(
357
- df,
358
  qag_requirements.gdc_genes_mutations,
359
  qag_requirements.model,
360
  qag_requirements.tok,
361
  qag_requirements.intent_model,
362
  qag_requirements.intent_tok,
363
  qag_requirements.project_mappings,
364
- output_file_prefix=None
365
  )
366
 
367
 
 
151
 
152
 
153
  def batch_test(
154
+ query, model, tok, gdc_genes_mutations, project_mappings, intent_model, intent_tok
 
 
 
 
 
 
155
  ):
156
  modified_query = utilities.construct_modified_query_base_llm(query)
157
  print(f"modified_query is: {modified_query}")
 
223
  # retrieve and load GDC project mappings
224
  project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
225
 
226
+ print("loading intent model")
227
  intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
228
 
229
  print("loading gdc genes and mutations")
 
238
  model=model,
239
  tok=tok,
240
  intent_model=intent_model,
241
+ intent_tok=intent_tok,
242
  )
243
 
244
 
245
  @utilities.timeit
246
  def execute_pipeline(
247
+ df,
248
+ gdc_genes_mutations,
249
+ model,
250
+ tok,
251
+ intent_model,
252
+ intent_tok,
253
+ project_mappings,
254
+ output_file_prefix,
255
  ):
256
  print("starting pipeline")
257
+ # print("CUDA available:", torch.cuda.is_available())
258
+ # print("CUDA device name:", torch.cuda.get_device_name(0))
259
 
260
  # queries input file
261
  print(f"running test on input {df}")
 
276
  gdc_genes_mutations,
277
  project_mappings,
278
  intent_model,
279
+ intent_tok,
280
  )
281
  )
282
 
 
325
  result = df_filtered_exploded
326
  else:
327
  result = df_filtered_exploded[final_columns].T
328
+ print("result {}".format(result))
329
+ print("completed")
330
  return result
331
 
332
 
 
341
  df = pd.read_csv(input_file)
342
  output_file_prefix = os.path.basename(input_file).split(".")[0]
343
  execute_pipeline(
344
+ df,
345
  qag_requirements.gdc_genes_mutations,
346
  qag_requirements.model,
347
  qag_requirements.tok,
348
  qag_requirements.intent_model,
349
  qag_requirements.intent_tok,
350
  qag_requirements.project_mappings,
351
+ output_file_prefix,
352
  )
353
  elif question:
354
  df = pd.DataFrame({"questions": [question]})
355
  execute_pipeline(
356
+ df,
357
  qag_requirements.gdc_genes_mutations,
358
  qag_requirements.model,
359
  qag_requirements.tok,
360
  qag_requirements.intent_model,
361
  qag_requirements.intent_tok,
362
  qag_requirements.project_mappings,
363
+ output_file_prefix=None,
364
  )
365
 
366
 
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
  accelerate (>=1.9.0,<2.0.0)
2
  en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
3
  en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
4
- gpustat
5
  gradio (>=5.35.0,<6.0.0)
6
  guidance (>=0.2.4,<0.3.0)
7
  huggingface-hub (>=0.33.2,<0.34.0)
 
1
  accelerate (>=1.9.0,<2.0.0)
2
  en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
3
  en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
 
4
  gradio (>=5.35.0,<6.0.0)
5
  guidance (>=0.2.4,<0.3.0)
6
  huggingface-hub (>=0.33.2,<0.34.0)