Roland Ding committed on
Commit
362c1df
·
1 Parent(s): a5f3cfb

8.8.21.57 updated cloud_textract.py to use default_s3_bucket in place of the hard-coded bucket string. Also added default_region and the new app_data structure.

Browse files
Files changed (2) hide show
  1. application.py +19 -7
  2. cloud_textract.py +3 -3
application.py CHANGED
@@ -1,14 +1,17 @@
1
  import os
2
 
 
 
3
  '''
4
  shared environment variables
5
  '''
6
- default_device = "cervical-cage"
7
 
8
  aws_access_key_id = os.environ.get('AMRA_AWS_ACCESS_KEY_ID')
9
  aws_secret_access_key = os.environ.get('AMRA_AWS_SECRET_ACCESS_KEY')
10
  openai_api_key = os.environ.get('AMRA_OPENAI_API_KEY')
11
 
 
 
12
  device_options={
13
  "secondary extraction":False,
14
  "secondary extraction count":0
@@ -106,18 +109,27 @@ authors_inst=[
106
  f"return the results on the same line separated by commas.",
107
  ]
108
 
109
- accepted_date_inst=[
110
- f"extract the acceptance date of the article from the system text.",
111
- f"return the results on a single line as 'Accepted Date: <month>, <year>.",
112
  ]
113
 
 
 
 
 
 
 
 
 
 
114
  '''
115
  application default data
116
  '''
117
  app_data = {
118
- "current_article":{},
119
  "articles":[],
 
120
  "terms":[],
121
- "prompts":[],
122
- # "outputs":[]
123
  }
 
1
  import os
2
 
3
+ from collections import defaultdict
4
+
5
  '''
6
  shared environment variables
7
  '''
 
8
 
9
  aws_access_key_id = os.environ.get('AMRA_AWS_ACCESS_KEY_ID')
10
  aws_secret_access_key = os.environ.get('AMRA_AWS_SECRET_ACCESS_KEY')
11
  openai_api_key = os.environ.get('AMRA_OPENAI_API_KEY')
12
 
13
+ default_region = "Spine"
14
+
15
  device_options={
16
  "secondary extraction":False,
17
  "secondary extraction count":0
 
109
  f"return the results on the same line separated by commas.",
110
  ]
111
 
112
+ accepted_year_inst=[
113
+ f"extract the acceptance year of the article from the system text.",
114
+ f"return the results on a single line as 'Accepted Year: <year>.",
115
  ]
116
 
117
+ accepted_month_inst=[
118
+ f"extract the acceptance month of the article from the system text.",
119
+ f"return the results on a single line as 'Accepted Month: <month>.",
120
+ ]
121
+
122
+ abstract_inst=[
123
+ f"Extract the abstract of the article from the system text, and return its original text. Normally, the abstract is before the introduction and might a paragraph, or in sections of study design, objective, summary of background, methods, results, and conclusion with keywords section in the end.",
124
+ ]
125
+
126
  '''
127
  application default data
128
  '''
129
  app_data = {
130
+ "current article":{},
131
  "articles":[],
132
+ "prompts":{},
133
  "terms":[],
134
+ "paths":{}
 
135
  }
cloud_textract.py CHANGED
@@ -1,7 +1,7 @@
1
  import boto3
2
 
3
  from utility import terminal_print, create_md_table
4
- from application import aws_access_key_id, aws_secret_access_key
5
 
6
  textract = boto3.client(
7
  'textract',
@@ -190,7 +190,7 @@ def textract_output_to_table(table,blocks_dict):
190
  array.append([])
191
  cur_row = r_id
192
  if "Relationships" in c:
193
- words = [blocks_dict[i]["Text"] for i in c["Relationships"][0]["Ids"]]
194
  else:
195
  words =[""]
196
  # print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
@@ -199,7 +199,7 @@ def textract_output_to_table(table,blocks_dict):
199
  return array
200
 
201
  @terminal_print
202
- def get_tables(filename:str,bucket:str="amra-studies"):
203
  '''
204
  This function is used to get the tables from the textract output
205
 
 
1
  import boto3
2
 
3
  from utility import terminal_print, create_md_table
4
+ from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket
5
 
6
  textract = boto3.client(
7
  'textract',
 
190
  array.append([])
191
  cur_row = r_id
192
  if "Relationships" in c:
193
+ words = [blocks_dict[i]["Text"] for i in c["Relationships"][0]["Ids"] if blocks_dict[i]["BlockType"] == "WORD"]
194
  else:
195
  words =[""]
196
  # print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
 
199
  return array
200
 
201
  @terminal_print
202
+ def get_tables(filename:str,bucket:str=default_s3_bucket):
203
  '''
204
  This function is used to get the tables from the textract output
205