Spaces:
Sleeping
Sleeping
Michael committed on
Commit ·
9b4901c
1
Parent(s): efc14ff
remove gpustat calls
Browse files
- app.py +21 -14
- gdc_pipeline.py +20 -21
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -18,7 +18,6 @@ from transformers import (
|
|
| 18 |
|
| 19 |
from methods import gdc_api_calls, utilities
|
| 20 |
|
| 21 |
-
|
| 22 |
# set up various tokens
|
| 23 |
hf_TOKEN = os.environ.get("hf_svc_ctds", False)
|
| 24 |
|
|
@@ -123,7 +122,11 @@ def infer_user_intent(query):
|
|
| 123 |
|
| 124 |
# function to combine entities, intent and API call
|
| 125 |
def construct_and_execute_api_call(query):
|
| 126 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Infer entities
|
| 128 |
initial_cancer_entities = utilities.return_initial_cancer_entities(
|
| 129 |
query, model="en_ner_bc5cdr_md"
|
|
@@ -151,7 +154,7 @@ def construct_and_execute_api_call(query):
|
|
| 151 |
query=query,
|
| 152 |
gdc_genes_mutations=gdc_genes_mutations,
|
| 153 |
)
|
| 154 |
-
print(
|
| 155 |
print("gene entities {}".format(gene_entities))
|
| 156 |
print("mutation entities {}".format(mutation_entities))
|
| 157 |
print("cancer entities {}".format(cancer_entities))
|
|
@@ -160,7 +163,7 @@ def construct_and_execute_api_call(query):
|
|
| 160 |
intent = infer_user_intent(query)
|
| 161 |
print("\nStep 3: Intent Inference:\n{}\n".format(intent))
|
| 162 |
try:
|
| 163 |
-
print(
|
| 164 |
api_call_result, cancer_entities = execute_api_call(
|
| 165 |
intent, gene_entities, mutation_entities, cancer_entities, query
|
| 166 |
)
|
|
@@ -230,19 +233,19 @@ def get_prefinal_response(row):
|
|
| 230 |
helper_output = row["helper_output"]
|
| 231 |
except Exception as e:
|
| 232 |
print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
|
| 233 |
-
print(
|
| 234 |
modified_query = utilities.construct_modified_query(query, helper_output)
|
| 235 |
-
print(
|
| 236 |
-
print(
|
| 237 |
prefinal_llama_with_helper_output = generate_response(modified_query)
|
| 238 |
-
print(
|
| 239 |
return pd.Series([modified_query, prefinal_llama_with_helper_output])
|
| 240 |
|
| 241 |
|
| 242 |
def execute_pipeline(question: str):
|
| 243 |
df = pd.DataFrame({"questions": [question]})
|
| 244 |
print(f"\n\nQuestion received: {question}\n")
|
| 245 |
-
print("CUDA device name:", torch.cuda.get_device_name(0))
|
| 246 |
|
| 247 |
# queries input file
|
| 248 |
df[
|
|
@@ -295,11 +298,13 @@ def execute_pipeline(question: str):
|
|
| 295 |
)
|
| 296 |
result.index = ["GDC-QAG results"] * len(result)
|
| 297 |
|
| 298 |
-
print(
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
| 301 |
print("completed")
|
| 302 |
-
|
| 303 |
print("\nWriting result string now\n")
|
| 304 |
|
| 305 |
result = result.T.to_dict()
|
|
@@ -309,7 +314,9 @@ def execute_pipeline(question: str):
|
|
| 309 |
|
| 310 |
result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
|
| 311 |
result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
|
| 312 |
-
result_string +=
|
|
|
|
|
|
|
| 313 |
result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
|
| 314 |
|
| 315 |
return result_string
|
|
|
|
| 18 |
|
| 19 |
from methods import gdc_api_calls, utilities
|
| 20 |
|
|
|
|
| 21 |
# set up various tokens
|
| 22 |
hf_TOKEN = os.environ.get("hf_svc_ctds", False)
|
| 23 |
|
|
|
|
| 122 |
|
| 123 |
# function to combine entities, intent and API call
|
| 124 |
def construct_and_execute_api_call(query):
|
| 125 |
+
print(
|
| 126 |
+
"\nStep 1: Starting GDC-QAG on input natural language query:\n{}\n".format(
|
| 127 |
+
query
|
| 128 |
+
)
|
| 129 |
+
)
|
| 130 |
# Infer entities
|
| 131 |
initial_cancer_entities = utilities.return_initial_cancer_entities(
|
| 132 |
query, model="en_ner_bc5cdr_md"
|
|
|
|
| 154 |
query=query,
|
| 155 |
gdc_genes_mutations=gdc_genes_mutations,
|
| 156 |
)
|
| 157 |
+
print("\nStep 2: Entity Extraction\n")
|
| 158 |
print("gene entities {}".format(gene_entities))
|
| 159 |
print("mutation entities {}".format(mutation_entities))
|
| 160 |
print("cancer entities {}".format(cancer_entities))
|
|
|
|
| 163 |
intent = infer_user_intent(query)
|
| 164 |
print("\nStep 3: Intent Inference:\n{}\n".format(intent))
|
| 165 |
try:
|
| 166 |
+
print("\nStep 4: API call builder for intent {}\n".format(intent))
|
| 167 |
api_call_result, cancer_entities = execute_api_call(
|
| 168 |
intent, gene_entities, mutation_entities, cancer_entities, query
|
| 169 |
)
|
|
|
|
| 233 |
helper_output = row["helper_output"]
|
| 234 |
except Exception as e:
|
| 235 |
print(f"unable to retrieve query: {query} or helper_output: {helper_output}")
|
| 236 |
+
print("\nStep 6: Augment LLM prompt for llama-3B\n")
|
| 237 |
modified_query = utilities.construct_modified_query(query, helper_output)
|
| 238 |
+
print("{}".format(modified_query))
|
| 239 |
+
print("\nStep 7: Generate LLM response R on query augmented prompt\n")
|
| 240 |
prefinal_llama_with_helper_output = generate_response(modified_query)
|
| 241 |
+
print("{}".format(prefinal_llama_with_helper_output))
|
| 242 |
return pd.Series([modified_query, prefinal_llama_with_helper_output])
|
| 243 |
|
| 244 |
|
| 245 |
def execute_pipeline(question: str):
|
| 246 |
df = pd.DataFrame({"questions": [question]})
|
| 247 |
print(f"\n\nQuestion received: {question}\n")
|
| 248 |
+
# print("CUDA device name:", torch.cuda.get_device_name(0))
|
| 249 |
|
| 250 |
# queries input file
|
| 251 |
df[
|
|
|
|
| 298 |
)
|
| 299 |
result.index = ["GDC-QAG results"] * len(result)
|
| 300 |
|
| 301 |
+
print(
|
| 302 |
+
"Query Augmented Generation final response {}".format(
|
| 303 |
+
"\n".join(result["Query augmented generation"].astype(str))
|
| 304 |
+
)
|
| 305 |
+
)
|
| 306 |
print("completed")
|
| 307 |
+
|
| 308 |
print("\nWriting result string now\n")
|
| 309 |
|
| 310 |
result = result.T.to_dict()
|
|
|
|
| 314 |
|
| 315 |
result_string += f"Question: {result['GDC-QAG results']['Question']}\n"
|
| 316 |
result_string += f"llama-3B baseline output: {result['GDC-QAG results']['llama-3B baseline frequency']}%\n"
|
| 317 |
+
result_string += (
|
| 318 |
+
f"Query augmented prompt: {result['GDC-QAG results']['Query augmented prompt']}"
|
| 319 |
+
)
|
| 320 |
result_string += f"Query augmented generation: {result['GDC-QAG results']['Query augmented generation']}"
|
| 321 |
|
| 322 |
return result_string
|
gdc_pipeline.py
CHANGED
|
@@ -151,13 +151,7 @@ def generate_response(modified_query, model, tok):
|
|
| 151 |
|
| 152 |
|
| 153 |
def batch_test(
|
| 154 |
-
query,
|
| 155 |
-
model,
|
| 156 |
-
tok,
|
| 157 |
-
gdc_genes_mutations,
|
| 158 |
-
project_mappings,
|
| 159 |
-
intent_model,
|
| 160 |
-
intent_tok
|
| 161 |
):
|
| 162 |
modified_query = utilities.construct_modified_query_base_llm(query)
|
| 163 |
print(f"modified_query is: {modified_query}")
|
|
@@ -229,7 +223,7 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
|
|
| 229 |
# retrieve and load GDC project mappings
|
| 230 |
project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
|
| 231 |
|
| 232 |
-
print(
|
| 233 |
intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
|
| 234 |
|
| 235 |
print("loading gdc genes and mutations")
|
|
@@ -244,19 +238,24 @@ def setup_models_and_data(AUTH_TOKEN, llama_token, intent_token):
|
|
| 244 |
model=model,
|
| 245 |
tok=tok,
|
| 246 |
intent_model=intent_model,
|
| 247 |
-
intent_tok=intent_tok
|
| 248 |
)
|
| 249 |
|
| 250 |
|
| 251 |
@utilities.timeit
|
| 252 |
def execute_pipeline(
|
| 253 |
-
df,
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
):
|
| 257 |
print("starting pipeline")
|
| 258 |
-
print("CUDA available:", torch.cuda.is_available())
|
| 259 |
-
print("CUDA device name:", torch.cuda.get_device_name(0))
|
| 260 |
|
| 261 |
# queries input file
|
| 262 |
print(f"running test on input {df}")
|
|
@@ -277,7 +276,7 @@ def execute_pipeline(
|
|
| 277 |
gdc_genes_mutations,
|
| 278 |
project_mappings,
|
| 279 |
intent_model,
|
| 280 |
-
intent_tok
|
| 281 |
)
|
| 282 |
)
|
| 283 |
|
|
@@ -326,8 +325,8 @@ def execute_pipeline(
|
|
| 326 |
result = df_filtered_exploded
|
| 327 |
else:
|
| 328 |
result = df_filtered_exploded[final_columns].T
|
| 329 |
-
print(
|
| 330 |
-
print(
|
| 331 |
return result
|
| 332 |
|
| 333 |
|
|
@@ -342,26 +341,26 @@ def main():
|
|
| 342 |
df = pd.read_csv(input_file)
|
| 343 |
output_file_prefix = os.path.basename(input_file).split(".")[0]
|
| 344 |
execute_pipeline(
|
| 345 |
-
df,
|
| 346 |
qag_requirements.gdc_genes_mutations,
|
| 347 |
qag_requirements.model,
|
| 348 |
qag_requirements.tok,
|
| 349 |
qag_requirements.intent_model,
|
| 350 |
qag_requirements.intent_tok,
|
| 351 |
qag_requirements.project_mappings,
|
| 352 |
-
output_file_prefix
|
| 353 |
)
|
| 354 |
elif question:
|
| 355 |
df = pd.DataFrame({"questions": [question]})
|
| 356 |
execute_pipeline(
|
| 357 |
-
df,
|
| 358 |
qag_requirements.gdc_genes_mutations,
|
| 359 |
qag_requirements.model,
|
| 360 |
qag_requirements.tok,
|
| 361 |
qag_requirements.intent_model,
|
| 362 |
qag_requirements.intent_tok,
|
| 363 |
qag_requirements.project_mappings,
|
| 364 |
-
output_file_prefix=None
|
| 365 |
)
|
| 366 |
|
| 367 |
|
|
|
|
| 151 |
|
| 152 |
|
| 153 |
def batch_test(
|
| 154 |
+
query, model, tok, gdc_genes_mutations, project_mappings, intent_model, intent_tok
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
):
|
| 156 |
modified_query = utilities.construct_modified_query_base_llm(query)
|
| 157 |
print(f"modified_query is: {modified_query}")
|
|
|
|
| 223 |
# retrieve and load GDC project mappings
|
| 224 |
project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
|
| 225 |
|
| 226 |
+
print("loading intent model")
|
| 227 |
intent_model, intent_tok = utilities.load_intent_model_hf(intent_token)
|
| 228 |
|
| 229 |
print("loading gdc genes and mutations")
|
|
|
|
| 238 |
model=model,
|
| 239 |
tok=tok,
|
| 240 |
intent_model=intent_model,
|
| 241 |
+
intent_tok=intent_tok,
|
| 242 |
)
|
| 243 |
|
| 244 |
|
| 245 |
@utilities.timeit
|
| 246 |
def execute_pipeline(
|
| 247 |
+
df,
|
| 248 |
+
gdc_genes_mutations,
|
| 249 |
+
model,
|
| 250 |
+
tok,
|
| 251 |
+
intent_model,
|
| 252 |
+
intent_tok,
|
| 253 |
+
project_mappings,
|
| 254 |
+
output_file_prefix,
|
| 255 |
):
|
| 256 |
print("starting pipeline")
|
| 257 |
+
# print("CUDA available:", torch.cuda.is_available())
|
| 258 |
+
# print("CUDA device name:", torch.cuda.get_device_name(0))
|
| 259 |
|
| 260 |
# queries input file
|
| 261 |
print(f"running test on input {df}")
|
|
|
|
| 276 |
gdc_genes_mutations,
|
| 277 |
project_mappings,
|
| 278 |
intent_model,
|
| 279 |
+
intent_tok,
|
| 280 |
)
|
| 281 |
)
|
| 282 |
|
|
|
|
| 325 |
result = df_filtered_exploded
|
| 326 |
else:
|
| 327 |
result = df_filtered_exploded[final_columns].T
|
| 328 |
+
print("result {}".format(result))
|
| 329 |
+
print("completed")
|
| 330 |
return result
|
| 331 |
|
| 332 |
|
|
|
|
| 341 |
df = pd.read_csv(input_file)
|
| 342 |
output_file_prefix = os.path.basename(input_file).split(".")[0]
|
| 343 |
execute_pipeline(
|
| 344 |
+
df,
|
| 345 |
qag_requirements.gdc_genes_mutations,
|
| 346 |
qag_requirements.model,
|
| 347 |
qag_requirements.tok,
|
| 348 |
qag_requirements.intent_model,
|
| 349 |
qag_requirements.intent_tok,
|
| 350 |
qag_requirements.project_mappings,
|
| 351 |
+
output_file_prefix,
|
| 352 |
)
|
| 353 |
elif question:
|
| 354 |
df = pd.DataFrame({"questions": [question]})
|
| 355 |
execute_pipeline(
|
| 356 |
+
df,
|
| 357 |
qag_requirements.gdc_genes_mutations,
|
| 358 |
qag_requirements.model,
|
| 359 |
qag_requirements.tok,
|
| 360 |
qag_requirements.intent_model,
|
| 361 |
qag_requirements.intent_tok,
|
| 362 |
qag_requirements.project_mappings,
|
| 363 |
+
output_file_prefix=None,
|
| 364 |
)
|
| 365 |
|
| 366 |
|
requirements.txt
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
accelerate (>=1.9.0,<2.0.0)
|
| 2 |
en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
|
| 3 |
en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
|
| 4 |
-
gpustat
|
| 5 |
gradio (>=5.35.0,<6.0.0)
|
| 6 |
guidance (>=0.2.4,<0.3.0)
|
| 7 |
huggingface-hub (>=0.33.2,<0.34.0)
|
|
|
|
| 1 |
accelerate (>=1.9.0,<2.0.0)
|
| 2 |
en-core-sci-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz#sha256=7c8fc52542dd1452ffce00b045c1298e2c185b7cf84793f8e0ec941987c09808
|
| 3 |
en-ner-bc5cdr-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz#sha256=ffc73130a710edf851206199720cb2c744a043e032f5da6ba4bb36863deca778
|
|
|
|
| 4 |
gradio (>=5.35.0,<6.0.0)
|
| 5 |
guidance (>=0.2.4,<0.3.0)
|
| 6 |
huggingface-hub (>=0.33.2,<0.34.0)
|