Roland Ding committed on
Commit
667bfca
·
1 Parent(s): 20fc5ea

2.3.11.32 Updated features: completed process_study as the one-stroke process for the clinical report; completed create_overview and create_details for populating the Markdown UI content; completed the select_prompts process to align with the prompt selection logic agreed in prior meetings (notion to be added to the spec at a later date).

Browse files
Files changed (1) hide show
  1. features.py +199 -84
features.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
3
  from operator import mul
4
  from functools import reduce
5
  from sys import stdout
 
6
 
7
  # external packages
8
  import gradio as gr
@@ -15,72 +16,121 @@ from supplier import *
15
 
16
  encoding = tiktoken.get_encoding("cl100k_base")
17
 
 
 
 
 
 
 
 
 
 
 
18
  def process_study(
19
  study_file_obj,
 
20
  performance_metric_1,
21
  performance_metric_2,
22
  safety_metric_1,
23
  safety_metric_2,
24
  device=default_device
25
  ):
26
- if study_file_obj is None:
27
- return "", "", ""
28
 
29
- article = add_article(device,study_file_obj)
30
- content = extract_key_content(article["content"],["abstract","objective"],["discussion"])
31
- assessments = select_prompts(content)
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  output = {
34
  "domain":article["domain"],
35
  "article":article["name"],
36
- "output":{}
37
  }
38
 
39
- n_assessments = len(assessments)
40
- c = 1
 
 
41
 
42
- for a, prompts in assessments.items():
43
- # run prompts with the content
44
- output["output"][a] = []
45
- n_prompts = len(prompts)
46
- for i,p in enumerate(prompts):
47
- # run prompt on content and append it to the outputs[output][assessment]
48
- prompt_text = f"{content}\n\n {p}\n"
49
- # print(len(encoding.encode(prompt_text)))
50
- feedback = execute_prompt(prompt_text)
51
- # print(feedback)
52
- output["output"][a].append(process_feedback(feedback))
53
- stdout.write(f"{c}/{n_assessments} - {i+1}/{n_prompts}\r")
54
 
55
- c += 1
56
 
57
- add_output(output)
 
 
 
 
 
58
 
59
- overview = create_overview(output)
60
- performance = create_performance(output)
61
- safety = create_safety(output)
 
62
 
63
- return overview, performance, safety
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- def create_overview(output):
66
- # raw_text = output["output"]["Clinical Overview"]
67
- raw_text = "work in progress"
68
- overview = f"<hr /><p>{raw_text}</p>"
69
- return gr.update(value=overview)
70
 
71
- def create_performance(output):
72
- performances = output["output"]["Clinical Performance"]
73
  md_text = ""
74
- for p in performances:
75
- md_text += f"<hr /><p>{p}</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  return gr.update(value=md_text)
77
 
78
- def create_safety(output):
79
- raw_text = output["output"]["Safety"]
80
- safety = f"<hr /><p>{raw_text}</p>"
81
- return gr.update(value=safety)
82
 
83
- def extract_key_content(text,start,end,case_sensitive=False):
84
  '''
85
  this function extract the content between start and end
86
  and return the content in between. The function will find
@@ -112,13 +162,20 @@ def extract_key_content(text,start,end,case_sensitive=False):
112
  start_index = 0
113
  for s in start:
114
  start_index = max(start_index,text.find(s))
115
-
 
 
116
  end_index = 0
117
  for e in end:
118
- end_index = max(end_index,text[start_index:].find(e)) if start_index!=-1 else max(end_index,text.find(e))
119
 
120
- content = origin[start_index:start_index+end_index] if start_index!=-1 else origin[:end_index]
121
- return content
 
 
 
 
 
122
 
123
  def get_articles(update_local=True):
124
  '''
@@ -160,7 +217,7 @@ def get_article(domain,name):
160
 
161
  return article
162
 
163
- def add_article(domain,file_obj,add_to_s3=True, add_to_local=True):
164
  '''
165
  this function receive the domain name and file obj
166
  and add the article to the cloud, s3 and local memory
@@ -181,17 +238,31 @@ def add_article(domain,file_obj,add_to_s3=True, add_to_local=True):
181
  dict
182
  article object
183
  '''
184
- content, meta = read_pdf(file_obj)
 
 
 
 
 
 
 
 
 
 
 
 
185
  article ={
186
  "domain":domain,
187
- "name":file_obj.name.split("\\")[-1].split(".")[0],
188
  "content":content,
189
- # "meta":meta,
 
 
190
  "upload_time":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
191
  }
192
 
193
- if add_to_s3:
194
- s3_path = upload_fileobj(file_obj,domain,article["name"])
195
  article["s3_path"] = s3_path
196
 
197
  if add_to_local:
@@ -264,37 +335,6 @@ def update_article(article,file_obj=None,update_local=True):
264
 
265
  return article
266
 
267
- def process_feedback(text):
268
- return text
269
-
270
- def select_prompts(content):
271
- '''
272
- select the prompts based on the content and the search terms
273
- that was included in the content
274
-
275
- Parameters
276
- ----------
277
- content : str
278
- content of the article
279
-
280
- Returns
281
- -------
282
- dict
283
- prompts
284
- '''
285
-
286
- prompts = {}
287
- for a in assessments:
288
- prompts[a] = set()
289
-
290
-
291
- for p in app_data["terms"]:
292
- p["terms"] = p["term"].split(",")
293
- if reduce(mul, [s in content for s in p["terms"]], 1):
294
- prompts[p["assessment_step"]].add(p["command"])
295
-
296
- return prompts
297
-
298
  def add_output(output):
299
  '''
300
  this function add the output to the cloud
@@ -336,7 +376,7 @@ def update_output(output):
336
  return False
337
  return True
338
 
339
- def add_device():
340
  pass
341
 
342
  def get_device():
@@ -346,4 +386,79 @@ def remove_device():
346
  pass
347
 
348
  def update_device():
349
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from operator import mul
4
  from functools import reduce
5
  from sys import stdout
6
+ from collections import defaultdict
7
 
8
  # external packages
9
  import gradio as gr
 
16
 
17
  encoding = tiktoken.get_encoding("cl100k_base")
18
 
19
+ # get prompts, terms, outputs from the cloud
20
def init_app_data():
    '''
    load the application tables from the cloud backend into app_data

    every table is fetched with get_table and stored in app_data
    under its own table name
    '''
    # fetch order is unchanged: prompts, terms, outputs, articles
    for table_name in ("prompts", "terms", "outputs", "articles"):
        app_data[table_name] = get_table(table_name)
28
+
29
def process_study(
    study_file_obj,
    study_content,
    performance_metric_1,
    performance_metric_2,
    safety_metric_1,
    safety_metric_2,
    device=default_device
    ):
    '''
    this function run the whole clinical report process for one study:
    register the article, select the prompts, execute every prompt and
    build the overview and details content for the ui

    Parameters
    ----------
    study_file_obj : file object
        uploaded study file, used when provided
    study_content : str
        raw study text, used only when no file is provided
    performance_metric_1, performance_metric_2 : str
        performance metric selections, forwarded to select_prompts
    safety_metric_1, safety_metric_2 : str
        safety metric selections, forwarded to select_prompts
    device : str
        passed to add_article as the domain of the article

    Returns
    -------
    tuple
        (overview, details) as returned by create_overview and
        create_details, or the same message twice when neither a file
        nor content was provided
    '''
    if study_file_obj:
        article = add_article(device,study_file_obj)
    elif study_content:
        article = add_article(device,study_content,file_object=False)
    else:
        # return two values so this path has the same shape as the
        # (overview, details) pair of the normal path
        message = "No file or content provided"
        return message, message

    prompts = select_prompts( # need to identify how the app will know which prompts to use
        article,
        performance_metric_1,
        performance_metric_2,
        safety_metric_1,
        safety_metric_2
    )

    output = {
        "domain":article["domain"],
        "article":article["name"],
        "output":defaultdict(list)
    }

    for p in prompts:
        # "sections" is a comma separated list of article keys; strip the
        # names so "abstract, methods" does not fail on the key " methods"
        section_text = "".join(
            f"{article[s.strip()]}" for s in p["sections"].split(",")
        )
        prompt_string = f"{section_text}\n {p['prompt']}"

        # debug dump of the prompt; explicit utf-8 so article text outside
        # the platform default code page does not raise on write
        with open(f"prompt_{p['template_name']}.txt","w",encoding="utf-8") as f:
            f.write(prompt_string)

        res = execute_prompt(prompt_string)

        # debug dump of the response, same encoding as the prompt dump
        with open(f"output_{p['template_name']}.txt","w",encoding="utf-8") as f:
            f.write(res)

        # responses are grouped by the assessment step of their prompt
        output["output"][p["assessment_step"]].append(res)

    overview = create_overview(output["output"]["overview"])
    details = create_details(output["output"])

    add_output(output)

    return overview, details
86
+
87
def refresh():
    '''
    reload the application data from the cloud backend

    Returns
    -------
    tuple
        the text "refreshed" twice
    '''
    init_app_data()
    status = "refreshed"
    return status, status
93
+
94
def create_overview(overview_list):
    '''
    this function build the markdown overview table from the overview
    responses

    Parameters
    ----------
    overview_list : list
        list of str; every non blank line is expected to look like
        "attribute: detail" and becomes one table row

    Returns
    -------
    the result of gr.update with the markdown text as value
    '''
    parts = ["## Overview\n\n", "| attributes | detail |\n|:---|:---|\n"]
    for response in overview_list:
        # split per line and drop blank ones; removing "\n\n" from the text
        # would glue the lines around an empty line into a single row
        for row in response.splitlines():
            if not row.strip():
                continue
            # only the first ": " separates attribute from detail, later
            # ones belong to the detail and must not add extra cells
            cells = row.replace(": "," | ",1)
            parts.append(f"| {cells} |\n")
    return gr.update(value="".join(parts))
108
 
109
def create_details(output):
    '''
    this function build the markdown details content, one titled
    section per assessment step with its result tables

    Parameters
    ----------
    output : dict
        assessment step -> list of str; every str is expected to be a
        table with one row per line and tab separated cells, the first
        row being the header

    Returns
    -------
    the result of gr.update with the markdown text as value
    '''
    # NOTE(review): section keys presumably have to match the
    # assessment_step values of the prompts, so the spelling is left as is
    sections = ["clinical", "radiographic", "fussion assessment", "other","safety"]
    titles = ["Clinical Outcomes", "Radiological Outcomes", "Fussion Assessment", "Other Outcomes","Safety Outcomes"]

    parts = []
    for section, title in zip(sections, titles):
        parts.append(f"## {title}\n\n")

        for table in output[section]:
            # drop blank lines instead of deleting "\n\n", which would merge
            # the rows around an empty line
            rows = [r for r in table.splitlines() if r.strip()]
            for row_number, r in enumerate(rows):
                cells = r.split("\t")
                parts.append(f"| {' | '.join(cells)} |\n")
                if row_number == 0:
                    # alignment row right after the header row
                    parts.append("|:---"*len(cells)+"|\n")
            # blank line ends the table so the next one in the same
            # section starts a new table with its own header
            parts.append("\n")

        parts.append("\n\n")
    return gr.update(value="".join(parts))
131
 
 
 
 
 
132
 
133
+ def extract_key_content(text,start,end,before = None,case_sensitive=False):
134
  '''
135
  this function extract the content between start and end
136
  and return the content in between. The function will find
 
162
  start_index = 0
163
  for s in start:
164
  start_index = max(start_index,text.find(s))
165
+
166
+ if start_index ==-1: start_index = 0
167
+
168
  end_index = 0
169
  for e in end:
170
+ end_index = max(end_index,text[start_index:].find(e))
171
 
172
+ if before:
173
+ for b in before:
174
+ before_index = text[start_index:start_index+end_index].find(b)
175
+ end_index = min(end_index,before_index) if before_index != -1 and before_index >=800 else end_index # 800 is a magic number for the length of the abstract
176
+
177
+ content = origin[start_index:start_index+end_index]
178
+ return content, start_index, start_index+end_index
179
 
180
  def get_articles(update_local=True):
181
  '''
 
217
 
218
  return article
219
 
220
+ def add_article(domain,file,add_to_s3=True, add_to_local=True, file_object=True):
221
  '''
222
  this function receive the domain name and file obj
223
  and add the article to the cloud, s3 and local memory
 
238
  dict
239
  article object
240
  '''
241
+ if file_object:
242
+ content, _ = read_pdf(file)
243
+ name = file.name.split("\\")[-1].split(".")[0]
244
+ else:
245
+ content = file
246
+ name = f"temp_{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
247
+
248
+ abstract,_,end_abstract = extract_key_content(content,["objective","abstract"],["key","words:","methods"],["introduction"])
249
+ methods,_,end_methods = extract_key_content(content[end_abstract:],["methods"],["results"])
250
+ if not methods:
251
+ methods,_,end_methods = extract_key_content(content[end_abstract:],["methods"],["discussion"])
252
+ results,_,_ = extract_key_content(content[end_methods:],["results"],["discussion"])
253
+
254
  article ={
255
  "domain":domain,
256
+ "name":name,
257
  "content":content,
258
+ "abstract":abstract,
259
+ "methods":methods,
260
+ "results":results,
261
  "upload_time":datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
262
  }
263
 
264
+ if add_to_s3 and file_object:
265
+ s3_path = upload_fileobj(file,domain,article["name"])
266
  article["s3_path"] = s3_path
267
 
268
  if add_to_local:
 
335
 
336
  return article
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  def add_output(output):
339
  '''
340
  this function add the output to the cloud
 
376
  return False
377
  return True
378
 
379
def add_device(*args):
    '''
    placeholder, adding a device is not implemented yet

    any positional arguments are accepted and ignored
    '''
    return None
381
 
382
  def get_device():
 
386
  pass
387
 
388
def update_device():
    '''
    placeholder, updating a device is not implemented yet
    '''
    return None
390
+
391
def process_feedback(text):
    '''
    pass-through hook for a response text, the text is handed back
    untouched

    Parameters
    ----------
    text : str
        response text

    Returns
    -------
    str
        the same text
    '''
    feedback = text
    return feedback
393
+
394
def select_prompts(article,*args):
    '''
    select the prompts based on the content and the search terms
    that was included in the content

    Parameters
    ----------
    article : dict
        article object, its "content" is searched for the terms
    *args
        performance and safety metric selections; accepted for the
        caller but not used for filtering yet

    Returns
    -------
    list
        prompt dicts; prompts selected through a term are copies with
        the term placeholders filled in
    '''
    placeholders = (
        "<--clinical term-->",
        "<--radiologic term-->",
        "<--other term-->",
    )

    # get template names based on the search terms
    memory = set()
    prompts = []
    for t in app_data["terms"]:
        # kept on the term entry as before; other code may read it
        t["terms"] = t["term"].split(",")
        if t["template_name"] in memory:
            continue
        # every comma separated search term has to be in the content
        if not all(s in article["content"] for s in t["terms"]):
            continue

        # get prompts based from templates
        for tn in t["template_name"].split(","):
            for p in app_data["prompts"]:
                if p["template_name"] != tn:
                    continue
                # str.replace returns a new string, so the result has to be
                # kept; it is stored on a copy so the shared template in
                # app_data keeps its placeholders for the next term
                text = p["prompt"]
                for placeholder in placeholders:
                    text = text.replace(placeholder,t["clinical term"])
                prompts.append({**p,"prompt":text})

        memory.add(t["template_name"])

    # add overview prompts
    prompts.extend([ov for ov in app_data["prompts"] if ov["assessment_step"]=="overview"])

    # check if groups, levels and preopratives are in the article
    article_logic = {
        k: any(kw in article["content"] for kw in value)
        for k,value in logic_keywords.items()
    }

    # use article_logic to filter prompts, a None flag on the prompt
    # means the prompt applies either way
    prompts = [p for p in prompts
        if (p["groups"] == article_logic["groups"] or p["groups"] is None)
        and (p["levels"] == article_logic["levels"] or p["levels"] is None)
        and (p["preoperatives"] == article_logic["preoperatives"] or p["preoperatives"] is None)]

    # TODO: performance metrics and safety metrics filter based on args;
    # until then args are not inspected, so empty or None selections
    # cannot raise here
    return prompts
456
+
457
def keyword_search(keywords,full_text):
    '''
    this function check which keywords appear in the full text

    Parameters
    ----------
    keywords : iterable
        every item is a str keyword or a tuple of alternative keywords;
        a tuple counts as found when any of its members is found
    full_text : str
        text to search in

    Returns
    -------
    dict
        keyword (str or tuple) -> bool, True when found in full_text
    '''
    def found(keyword):
        # a plain keyword is a substring test; recursing on a str would
        # iterate its characters and never terminate
        if isinstance(keyword, tuple):
            return any(found(alternative) for alternative in keyword)
        return keyword in full_text

    return {k: found(k) for k in keywords}