Roland Ding committed on
Commit
fdccd1a
·
1 Parent(s): 352e336

Major backend_update

Browse files

UI update:
+ Revised ui_studies to:
- show all study articles
- add a control for uploading multiple files
+ Revised ui_study to:
- update the article list in real time
- realigned the layout to make it more compact

Features:
+ added backend methods to upload multiple files
+ added post_process function to execute Futable commands
+ added retry_decorator for retrying failed openai inquiries
+ touchup for including futable results in extraction list for display
+ changed extraction collections to set
+ moved the llm instance to module-level (global) scope in chains.py
+ added root message for agent declaration before the first inquiry

Cloud:
+ updated get_table to handle pagination.

Others:
+ touched up remove_symbols in utility.py to better filter article content and characters.

cleanup:
+ removed cloud_textract.py

On branch main
Changes to be committed:
modified: chains.py
modified: cloud_db.py
deleted: cloud_textract.py
modified: features.py
modified: ui_studies.py
modified: ui_study.py
modified: utility.py

Files changed (7) hide show
  1. chains.py +22 -19
  2. cloud_db.py +6 -2
  3. cloud_textract.py +0 -230
  4. features.py +29 -8
  5. ui_studies.py +22 -20
  6. ui_study.py +5 -6
  7. utility.py +24 -281
chains.py CHANGED
@@ -2,16 +2,23 @@ import asyncio
2
  import openai
3
 
4
  from langchain.chat_models import ChatOpenAI
 
5
  from langchain.prompts.chat import ChatPromptTemplate
6
  from langchain.schema import BaseOutputParser
7
  from application import *
8
 
9
  from utility import read_pdf,aterminal_print
10
 
11
- class Replacement(BaseOutputParser):
12
- """Parse the output of an LLM call to a comma-separated list."""
 
 
 
 
13
 
14
 
 
 
15
  def parse(self, text: str, **kwargs):
16
  """Parse the output of an LLM call."""
17
  if kwargs:
@@ -21,24 +28,23 @@ class Replacement(BaseOutputParser):
21
  @aterminal_print # need to review this.
22
  async def async_generate(article,name,chain,replacement_term=None):
23
  if replacement_term:
24
- resp = await chain.ainvoke({"term":replacement_term})
25
  else:
26
- resp = await chain.ainvoke({"term":""})
27
- article[name] = resp.content
 
 
28
 
29
  @aterminal_print # need to review this.
 
30
  async def execute_concurrent(article,prompts):
31
- llm = ChatOpenAI(
32
- temperature=0.0,
33
- model_name="gpt-3.5-turbo-16k",
34
- openai_api_key=openai.api_key)
35
  tasks = []
36
 
37
  prompt_type = article["logic"]
38
  prompt_list = list(prompts.keys())
39
- print(prompt_list)
40
 
41
- # for name,p in prompts.items():
42
  while prompt_list:
43
  name = prompt_list.pop(0)
44
  p = prompts[name]
@@ -49,8 +55,11 @@ async def execute_concurrent(article,prompts):
49
 
50
  print("executing",p["assessment_step"],name)
51
  input_text = "".join([article[s] for s in p["input_list"]])
52
-
 
 
53
  chat_prompt = ChatPromptTemplate.from_messages([
 
54
  ("human",input_text),
55
  ("system",p[prompt_type]),
56
  ])
@@ -85,16 +94,10 @@ if __name__ == "__main__":
85
  sample_content,_ = read_pdf(sample_artice)
86
 
87
  llm = ChatOpenAI(temperature=0.0,model_name="gpt-3.5-turbo-16k")
88
- # with open(".prompts/other/Need for ICU.txt") as f:
89
- # prompt = f.read()
90
- # name = "Need for ICU"
91
  with open(".prompts/other/Operation Time.txt") as f:
92
  prompt = f.read()
93
  name = "Operation Time"
94
- # with open(".prompts/other/Blood Loss.txt") as f:
95
- # prompt = f.read()
96
- # name = "Blood Loss"
97
-
98
  post_prompt_maping = {}
99
  post_replace_term = lambda res,map=post_prompt_maping:replace_term(res,map=map)
100
 
 
2
  import openai
3
 
4
  from langchain.chat_models import ChatOpenAI
5
+ from langchain.chat_models.openai import _create_retry_decorator
6
  from langchain.prompts.chat import ChatPromptTemplate
7
  from langchain.schema import BaseOutputParser
8
  from application import *
9
 
10
  from utility import read_pdf,aterminal_print
11
 
12
+ llm = ChatOpenAI(
13
+ temperature=0.0,
14
+ model_name="gpt-3.5-turbo-16k",
15
+ openai_api_key=openai.api_key)
16
+
17
+ retry_decorator = _create_retry_decorator(llm)
18
 
19
 
20
+ class Replacement(BaseOutputParser):
21
+ """Parse the output of an LLM call to a comma-separated list."""
22
  def parse(self, text: str, **kwargs):
23
  """Parse the output of an LLM call."""
24
  if kwargs:
 
28
  @aterminal_print # need to review this.
29
  async def async_generate(article,name,chain,replacement_term=None):
30
  if replacement_term:
31
+ res = await chain.ainvoke({"term":replacement_term})
32
  else:
33
+ res = await chain.ainvoke({"term":""})
34
+
35
+ print("completed",name)
36
+ article[name] = res.content
37
 
38
  @aterminal_print # need to review this.
39
+ @retry_decorator
40
  async def execute_concurrent(article,prompts):
41
+
 
 
 
42
  tasks = []
43
 
44
  prompt_type = article["logic"]
45
  prompt_list = list(prompts.keys())
 
46
 
47
+ i = 0
48
  while prompt_list:
49
  name = prompt_list.pop(0)
50
  p = prompts[name]
 
55
 
56
  print("executing",p["assessment_step"],name)
57
  input_text = "".join([article[s] for s in p["input_list"]])
58
+ # with open(f".outputs/{i}_{name}.txt","w+") as f:
59
+ # f.write(input_text)
60
+ # f.write(p[prompt_type])
61
  chat_prompt = ChatPromptTemplate.from_messages([
62
+ ("system","You are a helpful AI that can answer questions about clinical trail and operation studies."),
63
  ("human",input_text),
64
  ("system",p[prompt_type]),
65
  ])
 
94
  sample_content,_ = read_pdf(sample_artice)
95
 
96
  llm = ChatOpenAI(temperature=0.0,model_name="gpt-3.5-turbo-16k")
 
 
 
97
  with open(".prompts/other/Operation Time.txt") as f:
98
  prompt = f.read()
99
  name = "Operation Time"
100
+
 
 
 
101
  post_prompt_maping = {}
102
  post_replace_term = lambda res,map=post_prompt_maping:replace_term(res,map=map)
103
 
cloud_db.py CHANGED
@@ -17,8 +17,12 @@ dynamodb data operations
17
  # get the list of articles from articles table in dynamodb
18
  @terminal_print
19
  def get_table(table_name:str):
20
- result = db_client.scan(TableName = table_name)#,AttributesToGet = data_structure[table_name]["fields"])
21
- return [db_map_to_py_dict(r) for r in result["Items"]]
 
 
 
 
22
 
23
  # add a new article to table articles in dynamodb, return error if failed
24
  def post_item(table_name:str,item:dict):
 
17
  # get the list of articles from articles table in dynamodb
18
  @terminal_print
19
  def get_table(table_name:str):
20
+ result = db_client.scan(TableName = table_name)
21
+ items = result["Items"]
22
+ while "LastEvaluatedKey" in result:
23
+ result = db_client.scan(TableName = table_name,ExclusiveStartKey = result["LastEvaluatedKey"])
24
+ items.extend(result["Items"])
25
+ return [db_map_to_py_dict(r) for r in items]
26
 
27
  # add a new article to table articles in dynamodb, return error if failed
28
  def post_item(table_name:str,item:dict):
cloud_textract.py DELETED
@@ -1,230 +0,0 @@
1
- import boto3
2
-
3
- from utility import terminal_print, create_md_table
4
- from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket
5
-
6
- textract = boto3.client(
7
- 'textract',
8
- aws_access_key_id=aws_access_key_id,
9
- aws_secret_access_key=aws_secret_access_key,
10
- region_name='us-east-1')
11
-
12
- @terminal_print
13
- def textract_get_tables(res_tables,textract=textract):
14
- '''
15
- This function is used to get the tables from the textract output
16
-
17
- Parameters:
18
- res_tables: the output from the textract.get_document_analysis function
19
- textract: the boto3 client for textract
20
-
21
- Returns:
22
- result: the cascaded output with blocks from the textract.get_document_analysis function
23
- '''
24
- job_id = res_tables["JobId"]
25
- temp = result = res_tables.copy()
26
-
27
- while "NextToken" in temp:
28
- temp = textract.get_document_analysis(JobId=job_id,NextToken=temp["NextToken"])
29
- result["Blocks"].extend(temp["Blocks"])
30
-
31
- return result
32
-
33
- @terminal_print
34
- def textract_get_text(res_text,textract=textract):
35
- '''
36
- This function is used to get the text from the textract output
37
-
38
- Parameters:
39
- res_text: the output from the textract.get_document_text_detection function
40
- textract: the boto3 client for textract
41
-
42
- Returns:
43
- result: the cascaded output with blocks from the textract.get_document_text_detection function
44
- '''
45
- job_id = res_text["JobId"]
46
- temp = result = res_text.copy()
47
-
48
- while "NextToken" in temp:
49
- temp = textract.get_document_text_detection(JobId=job_id,NextToken=temp["NextToken"])
50
- result["Blocks"].extend(temp["Blocks"])
51
-
52
- return result
53
-
54
- @terminal_print
55
- def get_article_tables(file_name:str,bucket:str,delay:int=5):
56
- '''
57
- This function is used to get the tables from the textract output
58
-
59
- Parameters:
60
- file_name: the name of the file in the bucket
61
- bucket: the name of the bucket
62
- delay: the delay time for the textract.get_document_analysis function
63
-
64
- Returns:
65
- res_tables: the output from the textract.get_document_analysis function with initial blocks
66
- '''
67
- import time
68
- # need to use async method to process the files
69
- job_tables = textract.start_document_analysis(
70
- DocumentLocation={
71
- "S3Object":{
72
- "Bucket":bucket,
73
- "Name": file_name
74
- }
75
- },
76
- FeatureTypes=["TABLES"]
77
- )
78
-
79
- table_job_id = job_tables["JobId"]
80
- res_tables = {"JobStatus":"IN_PROGRESS"}
81
-
82
- while res_tables["JobStatus"] == "IN_PROGRESS":
83
- time.sleep(delay)
84
- res_tables = textract.get_document_analysis(JobId=table_job_id)
85
-
86
- res_tables["JobId"] = table_job_id
87
-
88
- return res_tables
89
-
90
- @terminal_print
91
- def get_article_text(file_name:str,bucket:str,delay:int=5):
92
- '''
93
- This function is used to get the text from the textract output
94
-
95
- Parameters:
96
- file_name: the name of the file in the bucket
97
- bucket: the name of the bucket
98
- delay: the delay time for the textract.get_document_text_detection function
99
-
100
- Returns:
101
- res_text: the output from the textract.get_document_text_detection function with initial blocks
102
- '''
103
- import time
104
- job_text = textract.start_document_text_detection(
105
- DocumentLocation={
106
- "S3Object":{
107
- "Bucket":bucket,
108
- "Name": file_name
109
- }
110
- }
111
- )
112
-
113
- text_job_id = job_text["JobId"]
114
- res_text = {"JobStatus":"IN_PROGRESS"}
115
-
116
- while res_text["JobStatus"] == "IN_PROGRESS":
117
- time.sleep(delay)
118
- if res_text["JobStatus"] == "IN_PROGRESS":
119
- res_text = textract.get_document_text_detection(JobId=text_job_id)
120
-
121
-
122
- res_text["JobId"] = text_job_id
123
-
124
- return res_text
125
-
126
- @terminal_print
127
- def construct_tables(tables):
128
- '''
129
- This function is used to construct the tables from the textract output
130
-
131
- Parameters:
132
- tables: the output from the textract.get_document_analysis function
133
-
134
- Returns:
135
- table_blocks: the list of tables with the blocks
136
- blocks_dict: the dictionary of blocks with the block id as the key
137
- '''
138
- blocks = tables["Blocks"]
139
-
140
- blocks_dict = {}
141
- table_blocks = []
142
-
143
- for b in blocks:
144
-
145
- blocks_dict[b["Id"]] = b
146
-
147
- if b["BlockType"] == "TABLE":
148
- temp = {
149
- "id":b["Id"],
150
- "relationship":b["Relationships"],
151
- "confidence":b["Confidence"],
152
- "page":b["Page"],
153
- "map":{}
154
- }
155
- table_blocks.append(temp)
156
-
157
- for t in table_blocks:
158
- for e in t["relationship"]:
159
- t["map"].update({id:{"Type":e["Type"]} for id in e["Ids"]})
160
-
161
- for id in t["map"]:
162
- component = blocks_dict[id]
163
- if component["BlockType"] not in t:
164
- t[component["BlockType"]] = []
165
- t[component["BlockType"]].append(component)
166
-
167
- # table_blocks.append(t)
168
-
169
- return table_blocks, blocks_dict
170
-
171
- # Transfer the table blocks from aws textract into a table
172
- @terminal_print
173
- def textract_output_to_table(table,blocks_dict):
174
- '''
175
- This function is used to transfer the table blocks from aws textract into a table
176
-
177
- Parameters:
178
- table: the table block from the textract output
179
- blocks_dict: the dictionary of blocks with the block id as the key
180
-
181
- Returns:
182
- array: the table array with the text from the table blocks
183
- '''
184
- array = [[]]
185
- cur_row = 1
186
- for c in table["CELL"]:
187
- r_id = c["RowIndex"]
188
-
189
- if r_id > cur_row:
190
- array.append([])
191
- cur_row = r_id
192
- if "Relationships" in c:
193
- words = [blocks_dict[i]["Text"] for i in c["Relationships"][0]["Ids"] if blocks_dict[i]["BlockType"] == "WORD"]
194
- else:
195
- words =[""]
196
- # print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
197
- array[-1].append(" ".join(words))
198
-
199
- return array
200
-
201
- @terminal_print
202
- def get_tables(filename:str,bucket:str=default_s3_bucket):
203
- '''
204
- This function is used to get the tables from the textract output
205
-
206
- Parameters:
207
- filename: the name of the file in the bucket
208
- bucket: the name of the bucket
209
-
210
- Returns:
211
- md_tables: the list of tables in markdown format
212
- '''
213
- tables_temp = get_article_tables(file_name=filename,bucket=bucket)
214
-
215
- tables = textract_get_tables(tables_temp)
216
- table_blocks,block_dict = construct_tables(tables)
217
-
218
- md_tables = []
219
-
220
- # review table and exclude the reference table if any
221
-
222
- for table in table_blocks:
223
- table_array = textract_output_to_table(table,block_dict)
224
- md_tables.append(create_md_table(table_array))
225
-
226
- return md_tables
227
-
228
- def is_reference_table(table):
229
- return
230
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
features.py CHANGED
@@ -69,10 +69,7 @@ def process_study( # need revision
69
  ):
70
 
71
  if study_file_obj:
72
- if type(study_file_obj) is list:
73
- article = add_article(domain,study_file_obj[0])
74
- else:
75
- article = add_article(domain,study_file_obj)
76
  elif study_content:
77
  article = add_article(domain,study_content,file_object=False)
78
  else:
@@ -89,6 +86,7 @@ def process_study( # need revision
89
 
90
  # set the current article to the completed article object
91
  app_data["current_article"] = article
 
92
 
93
  # update the article to the cloud
94
  try:
@@ -103,6 +101,29 @@ def process_study( # need revision
103
 
104
  return overview, detail_views
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  @terminal_print
107
  def update_article_segment(article):
108
  # get the key content between article objective and discussion
@@ -502,8 +523,8 @@ def select_performance_prompts(article,performance_assessment):
502
  else:
503
  valid_prompts[p]["term"].update({t["term"]:t})
504
  if performance_assessment not in article["extraction"]:
505
- article["extraction"][performance_assessment] = []
506
- article["extraction"][performance_assessment].append(prompt["prompt_name"])
507
 
508
  return valid_prompts
509
 
@@ -633,7 +654,7 @@ def run_executor(article,prompt):
633
  case "f_summary_term":
634
  f_summary_term(article,prompt)
635
 
636
-
637
  @terminal_print
638
  def post_process(article):
639
  post_inputs = {}
@@ -657,7 +678,7 @@ def post_process(article):
657
  for assessment,post_input in post_inputs.items():
658
  instruction_agg = app_data["prompts_agg"][assessment]
659
  article[instruction_agg["name"]] = chain.invoke({"text":post_input,"instruction":instruction_agg["chain"][0]}).content
660
- article["extraction"][assessment].append(instruction_agg["name"])
661
 
662
 
663
  def add_inst(instructions,prompt):
 
69
  ):
70
 
71
  if study_file_obj:
72
+ article = add_article(domain,study_file_obj)
 
 
 
73
  elif study_content:
74
  article = add_article(domain,study_content,file_object=False)
75
  else:
 
86
 
87
  # set the current article to the completed article object
88
  app_data["current_article"] = article
89
+ app_data["articles"][article["name"]] = article
90
 
91
  # update the article to the cloud
92
  try:
 
101
 
102
  return overview, detail_views
103
 
104
+ @terminal_print
105
+ def process_studies(
106
+ domain,
107
+ file_objs):
108
+
109
+ for file_obj in file_objs:
110
+ process_study(domain,file_obj,None)
111
+ return gr.update(value=create_md_tables(app_data["articles"]))
112
+
113
+ @terminal_print
114
+ def create_md_tables(articles):
115
+ '''
116
+ create markdown tables for the articles.
117
+ '''
118
+ md_text = ""
119
+ md_text += "| Article Name | Authors | Domain | Upload Time |\n| --- | --- | --- | --- |\n"
120
+
121
+ for name, article in articles.items():
122
+ md_table = f"| {name} | {article['Authors']} |{article['domain']} | {article['upload_time']} | \n"
123
+ md_text += md_table
124
+
125
+ return md_text
126
+
127
  @terminal_print
128
  def update_article_segment(article):
129
  # get the key content between article objective and discussion
 
523
  else:
524
  valid_prompts[p]["term"].update({t["term"]:t})
525
  if performance_assessment not in article["extraction"]:
526
+ article["extraction"][performance_assessment] = set()
527
+ article["extraction"][performance_assessment].add(prompt["prompt_name"])
528
 
529
  return valid_prompts
530
 
 
654
  case "f_summary_term":
655
  f_summary_term(article,prompt)
656
 
657
+ @retry_decorator
658
  @terminal_print
659
  def post_process(article):
660
  post_inputs = {}
 
678
  for assessment,post_input in post_inputs.items():
679
  instruction_agg = app_data["prompts_agg"][assessment]
680
  article[instruction_agg["name"]] = chain.invoke({"text":post_input,"instruction":instruction_agg["chain"][0]}).content
681
+ article["extraction"][assessment].add(instruction_agg["name"])
682
 
683
 
684
  def add_inst(instructions,prompt):
ui_studies.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
 
3
  from application import *
4
- from features import init_app_data
5
  from utility import terminal_print
6
 
7
  def refresh():
@@ -11,29 +11,31 @@ def refresh():
11
  '''
12
  return create_md_tables(app_data["articles"])
13
 
14
-
15
- def create_md_tables(articles):
16
- '''
17
- create markdown tables for the articles.
18
- '''
19
- md_text = ""
20
- md_text += "| Domain | File Name | Upload Time | Device |\n| --- | --- | --- | --- |\n"
21
-
22
- for article in articles:
23
- md_table = f"| {article['domain']} | {article['name']} | {article['upload_time']} | {default_region} |\n"
24
- md_text += md_table
25
-
26
- return md_text
27
-
28
  @terminal_print
29
  def init_studies_page():
30
  with gr.Blocks() as studies_page:
31
- with gr.Row():
32
- gr.Markdown("## Article Lists")
33
- btn_refresh = gr.Button(value="Refresh",variant="primary")
34
- gr.HTML("<hr>")
 
 
 
 
 
 
 
35
 
36
- article_list = gr.Markdown("")
 
 
 
 
 
 
 
 
 
37
 
38
  btn_refresh.click(
39
  fn=refresh,
 
1
  import gradio as gr
2
 
3
  from application import *
4
+ from features import init_app_data,process_studies,create_md_tables
5
  from utility import terminal_print
6
 
7
  def refresh():
 
11
  '''
12
  return create_md_tables(app_data["articles"])
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  @terminal_print
15
  def init_studies_page():
16
  with gr.Blocks() as studies_page:
17
+ with gr.Row(equal_height=False):
18
+ with gr.Column():
19
+ gr.Markdown("## Clinical Studies")
20
+ domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
21
+ upload_studies = gr.File(label="Upload clinical study reports",type="file",file_count="multiple")
22
+ btn_upload_studies = gr.Button(value="Upload",variant="primary")
23
+ with gr.Column():
24
+ gr.Markdown("## Article Lists")
25
+ btn_refresh = gr.Button(value="Refresh",variant="primary")
26
+ gr.HTML("<hr>")
27
+ article_list = gr.Markdown("")
28
 
29
+ btn_upload_studies.click(
30
+ process_studies,
31
+ inputs=[
32
+ domain,
33
+ upload_studies,
34
+ ],
35
+ outputs=[
36
+ article_list,
37
+ ],
38
+ )
39
 
40
  btn_refresh.click(
41
  fn=refresh,
ui_study.py CHANGED
@@ -25,16 +25,15 @@ def init_study_page():
25
  with gr.Column():
26
  gr.Markdown("## Studies")
27
  gr.HTML("<hr>")
28
-
29
- upload_study = gr.File(label="Upload a clinical study report",type="file",file_count="multiple")
30
-
31
-
32
- with gr.Column():
33
  domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
34
- input_study = gr.TextArea(label="Or paste a clinical study report content",placeholder="Paste content here...",lines=5)
35
  with gr.Row():
36
  btn_reset = gr.Button(value="Reset",variant="stop")
37
  btn_add_study = gr.Button(value="Add",variant="primary")
 
 
 
 
38
 
39
  gr.HTML("<hr>")
40
  with gr.Row():
 
25
  with gr.Column():
26
  gr.Markdown("## Studies")
27
  gr.HTML("<hr>")
 
 
 
 
 
28
  domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
29
+ upload_study = gr.File(label="Upload a clinical study report",type="file",file_count="single")
30
  with gr.Row():
31
  btn_reset = gr.Button(value="Reset",variant="stop")
32
  btn_add_study = gr.Button(value="Add",variant="primary")
33
+
34
+ with gr.Column():
35
+ input_study = gr.TextArea(label="Or paste a clinical study report content",placeholder="Paste content here...",lines=5)
36
+
37
 
38
  gr.HTML("<hr>")
39
  with gr.Row():
utility.py CHANGED
@@ -50,25 +50,11 @@ def terminal_print(func):
50
  following functions are for file manipulation
51
  '''
52
 
53
- @terminal_print
54
  @terminal_print
55
  def read_pdf(file_path):
56
  '''
57
  this function read the pdf file and return the text
58
 
59
- Parameters
60
- ----------
61
- file_path : str
62
- path to the pdf file
63
-
64
- Returns
65
- -------
66
- text : str
67
- text extracted from the pdf file
68
- '''
69
- '''
70
- this function read the pdf file and return the text
71
-
72
  Parameters
73
  ----------
74
  file_path : str
@@ -83,11 +69,6 @@ def read_pdf(file_path):
83
  if type(file_path) is str:
84
  file_obj = open(file_path, 'rb')
85
  # elif type(file_path) is tempfile._TemporaryFileWrapper:
86
- else:
87
- file_obj = open(file_path.name, 'rb')
88
- if type(file_path) is str:
89
- file_obj = open(file_path, 'rb')
90
- # elif type(file_path) is tempfile._TemporaryFileWrapper:
91
  else:
92
  file_obj = open(file_path.name, 'rb')
93
 
@@ -98,22 +79,11 @@ def read_pdf(file_path):
98
  parser = PDFParser(file_obj)
99
  doc = PDFDocument(parser)
100
 
101
- meta = doc.info
102
- text = extract_text(file_obj)
103
- text = remove_symbols(text)
104
- text = remove_citation(text)
105
-
106
- parser = PDFParser(file_obj)
107
- doc = PDFDocument(parser)
108
-
109
  meta = doc.info
110
  # close the pdf file object
111
  file_obj.close()
112
 
113
  return text, meta
114
- file_obj.close()
115
-
116
- return text, meta
117
 
118
  '''
119
  following functions are for format standard response
@@ -136,22 +106,7 @@ def format_response(code,data):
136
  dict
137
  formatted response
138
  '''
139
- '''
140
- this function format the response to be returned to the client.
141
- this is used for lambda serverless framework to return the response.
142
 
143
- Parameters
144
- ----------
145
- code : int
146
- status code
147
- data : dict
148
- data to be returned to the client
149
-
150
- Returns
151
- -------
152
- dict
153
- formatted response
154
- '''
155
  return {
156
  "statusCode":code,
157
  "headers":{
@@ -171,19 +126,6 @@ def format_text(text,remove_char_ls = ["\\n--\\n","\\n\\n","\n"]):
171
  '''
172
  this function format the text output by removing excessive characters
173
 
174
- Parameters
175
- ----------
176
- text : str
177
- text to be processed
178
-
179
- Returns
180
- -------
181
- str
182
- processed text
183
- '''
184
- '''
185
- this function format the text output by removing excessive characters
186
-
187
  Parameters
188
  ----------
189
  text : str
@@ -199,7 +141,6 @@ def format_text(text,remove_char_ls = ["\\n--\\n","\\n\\n","\n"]):
199
 
200
  return text
201
 
202
- @terminal_print
203
  @terminal_print
204
  def remove_symbols(text):
205
  '''
@@ -215,20 +156,8 @@ def remove_symbols(text):
215
  str
216
  processed text
217
  '''
218
- '''
219
- this function remove symbols that are not in unicode
220
-
221
- Parameters
222
- ----------
223
- text : str
224
- text to be processed
225
-
226
- Returns
227
- -------
228
- str
229
- processed text
230
- '''
231
- text = text.encode("ascii", "ignore").decode()
232
  text = text.replace('-\n', '')
233
  return text
234
 
@@ -249,42 +178,11 @@ def remove_citation(text):
249
  '''
250
  return re.sub(r'\(cid:\d+\)','',text)
251
 
252
- @terminal_print
253
- @terminal_print
254
- def remove_citation(text):
255
- '''
256
- this function remove citation pattern in the text
257
-
258
- Parameters
259
- ----------
260
- text : str
261
- text to be processed
262
-
263
- Returns
264
- -------
265
- str
266
- processed text
267
- '''
268
- return re.sub(r'\(cid:\d+\)','',text)
269
-
270
  @terminal_print
271
  def str_to_tuple(s):
272
  '''
273
  this function convert string to tuple
274
 
275
- Parameters
276
- ----------
277
- s : str
278
- string to be converted
279
-
280
- Returns
281
- -------
282
- tuple
283
- converted tuple
284
- '''
285
- '''
286
- this function convert string to tuple
287
-
288
  Parameters
289
  ----------
290
  s : str
@@ -312,53 +210,28 @@ def replace_symbols(s):
312
  str
313
  replaced string
314
  '''
315
- s = s.replace(" ","_")
316
- s = s.replace(",","")
317
- s = s.replace(".","")
318
- s = s.replace("-","_")
319
- s = s.replace("(","")
320
- s = s.replace(")","")
321
- s = s.replace("/","_")
322
- s = s.replace(":","")
323
- s = s.replace(";","")
324
- s = s.replace("'","")
325
- s = s.replace('"',"")
326
- return s
327
-
328
- @terminal_print
329
- def replace_symbols(s):
330
- '''
331
- this function replace symbols in the string to comply with file names
332
-
333
- Parameters
334
- ----------
335
- s : str
336
- string to be replaced
337
-
338
- Returns
339
- -------
340
- str
341
- replaced string
342
- '''
343
- s = s.replace(" ","_")
344
- s = s.replace(",","")
345
- s = s.replace(".","")
346
- s = s.replace("-","_")
347
- s = s.replace("(","")
348
- s = s.replace(")","")
349
- s = s.replace("/","_")
350
- s = s.replace(":","")
351
- s = s.replace(";","")
352
- s = s.replace("'","")
353
- s = s.replace('"',"")
354
  return s
355
 
356
  '''
357
  following functions are for dynamodb data manipulation
358
  '''
359
 
360
- # @terminal_print
361
-
362
  # @terminal_print
363
  def db_map_to_py_dict(db_map):
364
  '''
@@ -374,19 +247,7 @@ def db_map_to_py_dict(db_map):
374
  dict
375
  python dictionary
376
  '''
377
- '''
378
- this function convert dynamodb map data structure to python dictionary
379
-
380
- Parameters
381
- ----------
382
- db_map : dict
383
- dynamodb map
384
-
385
- Returns
386
- -------
387
- dict
388
- python dictionary
389
- '''
390
  py_dict = {}
391
  for k,i in db_map.items():
392
  for l,v in i.items():
@@ -404,36 +265,16 @@ def db_map_to_py_dict(db_map):
404
  py_dict[k] = v
405
  elif l =="NULL":
406
  py_dict[k] = None
407
- elif l == "BS":
408
- py_dict[k] = v
409
- elif l == "BOOL":
410
- py_dict[k] = v
411
- elif l =="NULL":
412
- py_dict[k] = None
413
  else:
414
  py_dict[k] = v
415
 
416
  return py_dict
417
 
418
- # @terminal_print
419
  # @terminal_print
420
  def py_dict_to_db_map(py_dict):
421
  '''
422
  this function convert python dictionary to dynamodb map data structure
423
 
424
- Parameters
425
- ----------
426
- py_dict : dict
427
- python dictionary
428
-
429
- Returns
430
- -------
431
- dict
432
- dynamodb map
433
- '''
434
- '''
435
- this function convert python dictionary to dynamodb map data structure
436
-
437
  Parameters
438
  ----------
439
  py_dict : dict
@@ -464,35 +305,13 @@ def py_dict_to_db_map(py_dict):
464
  db_map[key] = {"NULL":True}
465
  elif type(value) is set:
466
  db_map[key] = {"L":py_list_to_db_list(value)}
467
- elif type(value) is bytes:
468
- db_map[key] = {"B":value}
469
- elif type(value) is bool:
470
- db_map[key] = {"BOOL":value}
471
- elif value is None:
472
- db_map[key] = {"NULL":True}
473
- elif type(value) is set:
474
- db_map[key] = {"L":py_list_to_db_list(value)}
475
  return db_map
476
 
477
- # @terminal_print
478
  # @terminal_print
479
  def db_list_to_py_list(db_list):
480
  '''
481
  this function convert dynamodb list data structure to python list
482
 
483
- Parameters
484
- ----------
485
- db_list : list
486
- dynamodb list
487
-
488
- Returns
489
- -------
490
- list
491
- python list
492
- '''
493
- '''
494
- this function convert dynamodb list data structure to python list
495
-
496
  Parameters
497
  ----------
498
  db_list : list
@@ -517,43 +336,20 @@ def db_list_to_py_list(db_list):
517
  py_list.append(int(v))
518
  elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
519
  py_list.append(v)
520
- elif t =="N":
521
- if "." in v:
522
- py_list.append(float(v))
523
- else:
524
- py_list.append(int(v))
525
- elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
526
- py_list.append(v)
527
  elif t =="B" or t =="BS":
528
  py_list.append(bytes(v,"utf-8"))
529
  elif t =="NULL":
530
  py_list.append(None)
531
- elif t =="BOOL":
532
- py_list.append(bool(v))
533
  else:
534
  py_list.append(db_map_to_py_dict(v))
535
 
536
  return py_list
537
 
538
- # @terminal_print
539
  # @terminal_print
540
  def py_list_to_db_list(py_list):
541
  '''
542
  this function convert python list to dynamodb list data structure
543
 
544
- Parameters
545
- ----------
546
- py_list : list
547
- python list
548
-
549
- Returns
550
- -------
551
- list
552
- dynamodb list
553
- '''
554
- '''
555
- this function convert python list to dynamodb list data structure
556
-
557
  Parameters
558
  ----------
559
  py_list : list
@@ -570,12 +366,8 @@ def py_list_to_db_list(py_list):
570
  item = {"S":value}
571
  elif type(value) is int or type(value) is float:
572
  item = {"N":str(value)}
573
- elif type(value) is int or type(value) is float:
574
- item = {"N":str(value)}
575
  elif type(value) is dict:
576
  item = {"M":py_dict_to_db_map(value)}
577
- # item = py_dict_to_db_map(value)
578
- # item = py_dict_to_db_map(value)
579
  elif type(value) is list:
580
  item = {"L":py_list_to_db_list(value)}
581
  elif type(value) is tuple:
@@ -588,66 +380,17 @@ def py_list_to_db_list(py_list):
588
  item = {"NULL":True}
589
  elif type(value) is set:
590
  item = {"L":py_list_to_db_list(value)}
591
- elif type(value) is tuple:
592
- item = {"L":py_list_to_db_list(value)}
593
- elif type(value) is bytes:
594
- item = {"B":value}
595
- elif type(value) is bool:
596
- item = {"BOOL":value}
597
- elif value is None:
598
- item = {"NULL":True}
599
 
600
  db_list.append(item)
601
 
602
  return db_list
603
 
604
  def list_dict_to_dict(ls,key):
605
- result_dict = {}
606
- for d in ls:
607
- if key in d:
608
- result_dict[d[key]] = d
609
- return result_dict
610
-
611
- '''
612
- following functions are for markdown table creation
613
- '''
614
-
615
- @terminal_print
616
- def create_md_table(array):
617
- '''
618
- create markdown tables for an array.
619
-
620
- Parameters
621
- ----------
622
- array: list
623
- a table in the form of a list of lists
624
-
625
- Returns
626
- -------
627
- md_table: str
628
- '''
629
- md_table = ""
630
-
631
- for i,row in enumerate(array):
632
- md_row = ""
633
- for item in row:
634
- md_item = f"| {item} "
635
- md_row += md_item
636
-
637
- md_row += "|\n"
638
- md_table += md_row
639
-
640
- if i == 0:
641
- md_table += f"| {' | '.join(['---' for _ in range(len(row))])} |\n"
642
-
643
- return md_table
644
-
645
- def list_dict_to_dict(ls,key):
646
- result_dict = {}
647
- for d in ls:
648
- if key in d:
649
- result_dict[d[key]] = d
650
- return result_dict
651
 
652
  '''
653
  following functions are for markdown table creation
 
50
  following functions are for file manipulation
51
  '''
52
 
 
53
  @terminal_print
54
  def read_pdf(file_path):
55
  '''
56
  this function read the pdf file and return the text
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  Parameters
59
  ----------
60
  file_path : str
 
69
  if type(file_path) is str:
70
  file_obj = open(file_path, 'rb')
71
  # elif type(file_path) is tempfile._TemporaryFileWrapper:
 
 
 
 
 
72
  else:
73
  file_obj = open(file_path.name, 'rb')
74
 
 
79
  parser = PDFParser(file_obj)
80
  doc = PDFDocument(parser)
81
 
 
 
 
 
 
 
 
 
82
  meta = doc.info
83
  # close the pdf file object
84
  file_obj.close()
85
 
86
  return text, meta
 
 
 
87
 
88
  '''
89
  following functions are for format standard response
 
106
  dict
107
  formatted response
108
  '''
 
 
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  return {
111
  "statusCode":code,
112
  "headers":{
 
126
  '''
127
  this function format the text output by removing excessive characters
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  Parameters
130
  ----------
131
  text : str
 
141
 
142
  return text
143
 
 
144
  @terminal_print
145
  def remove_symbols(text):
146
  '''
 
156
  str
157
  processed text
158
  '''
159
+ import re
160
+ text = re.sub(r"[^a-zA-Z0-9\n\r]+", ' ', text)
 
 
 
 
 
 
 
 
 
 
 
 
161
  text = text.replace('-\n', '')
162
  return text
163
 
 
178
  '''
179
  return re.sub(r'\(cid:\d+\)','',text)
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  @terminal_print
182
  def str_to_tuple(s):
183
  '''
184
  this function convert string to tuple
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  Parameters
187
  ----------
188
  s : str
 
210
  str
211
  replaced string
212
  '''
213
+ symbols_map = {
214
+ " ":"_",
215
+ ",":"",
216
+ ".":"",
217
+ "-":"_",
218
+ "(":"",
219
+ ")":"",
220
+ "/":"_",
221
+ ":":"",
222
+ ";":"",
223
+ "'":"",
224
+ '"':""
225
+ }
226
+
227
+ for symbol in symbols_map:
228
+ s = s.replace(symbol,symbols_map[symbol])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  return s
230
 
231
  '''
232
  following functions are for dynamodb data manipulation
233
  '''
234
 
 
 
235
  # @terminal_print
236
  def db_map_to_py_dict(db_map):
237
  '''
 
247
  dict
248
  python dictionary
249
  '''
250
+
 
 
 
 
 
 
 
 
 
 
 
 
251
  py_dict = {}
252
  for k,i in db_map.items():
253
  for l,v in i.items():
 
265
  py_dict[k] = v
266
  elif l =="NULL":
267
  py_dict[k] = None
 
 
 
 
 
 
268
  else:
269
  py_dict[k] = v
270
 
271
  return py_dict
272
 
 
273
  # @terminal_print
274
  def py_dict_to_db_map(py_dict):
275
  '''
276
  this function convert python dictionary to dynamodb map data structure
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  Parameters
279
  ----------
280
  py_dict : dict
 
305
  db_map[key] = {"NULL":True}
306
  elif type(value) is set:
307
  db_map[key] = {"L":py_list_to_db_list(value)}
 
 
 
 
 
 
 
 
308
  return db_map
309
 
 
310
  # @terminal_print
311
  def db_list_to_py_list(db_list):
312
  '''
313
  this function convert dynamodb list data structure to python list
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  Parameters
316
  ----------
317
  db_list : list
 
336
  py_list.append(int(v))
337
  elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
338
  py_list.append(v)
 
 
 
 
 
 
 
339
  elif t =="B" or t =="BS":
340
  py_list.append(bytes(v,"utf-8"))
341
  elif t =="NULL":
342
  py_list.append(None)
 
 
343
  else:
344
  py_list.append(db_map_to_py_dict(v))
345
 
346
  return py_list
347
 
 
348
  # @terminal_print
349
  def py_list_to_db_list(py_list):
350
  '''
351
  this function convert python list to dynamodb list data structure
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  Parameters
354
  ----------
355
  py_list : list
 
366
  item = {"S":value}
367
  elif type(value) is int or type(value) is float:
368
  item = {"N":str(value)}
 
 
369
  elif type(value) is dict:
370
  item = {"M":py_dict_to_db_map(value)}
 
 
371
  elif type(value) is list:
372
  item = {"L":py_list_to_db_list(value)}
373
  elif type(value) is tuple:
 
380
  item = {"NULL":True}
381
  elif type(value) is set:
382
  item = {"L":py_list_to_db_list(value)}
 
 
 
 
 
 
 
 
383
 
384
  db_list.append(item)
385
 
386
  return db_list
387
 
388
  def list_dict_to_dict(ls,key):
389
+ if all([key in d for d in ls]):
390
+ return {d[key]:d for d in ls}
391
+ else:
392
+ print("key not found in all dictionaries")
393
+ return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  '''
396
  following functions are for markdown table creation