Roland Ding committed on
Commit ·
362c1df
1
Parent(s): a5f3cfb
8.8.21.57 updated cloud_textract.py to use default_s3_bucket instead of the manually written string. Also added default_region as well as a new app_data structure.
Browse files- application.py +19 -7
- cloud_textract.py +3 -3
application.py
CHANGED
|
@@ -1,14 +1,17 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
| 3 |
'''
|
| 4 |
shared environment variables
|
| 5 |
'''
|
| 6 |
-
default_device = "cervical-cage"
|
| 7 |
|
| 8 |
aws_access_key_id = os.environ.get('AMRA_AWS_ACCESS_KEY_ID')
|
| 9 |
aws_secret_access_key = os.environ.get('AMRA_AWS_SECRET_ACCESS_KEY')
|
| 10 |
openai_api_key = os.environ.get('AMRA_OPENAI_API_KEY')
|
| 11 |
|
|
|
|
|
|
|
| 12 |
device_options={
|
| 13 |
"secondary extraction":False,
|
| 14 |
"secondary extraction count":0
|
|
@@ -106,18 +109,27 @@ authors_inst=[
|
|
| 106 |
f"return the results on the same line separated by commas.",
|
| 107 |
]
|
| 108 |
|
| 109 |
-
|
| 110 |
-
f"extract the acceptance
|
| 111 |
-
f"return the results on a single line as 'Accepted
|
| 112 |
]
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
'''
|
| 115 |
application default data
|
| 116 |
'''
|
| 117 |
app_data = {
|
| 118 |
-
"
|
| 119 |
"articles":[],
|
|
|
|
| 120 |
"terms":[],
|
| 121 |
-
"
|
| 122 |
-
# "outputs":[]
|
| 123 |
}
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
|
| 5 |
'''
|
| 6 |
shared environment variables
|
| 7 |
'''
|
|
|
|
| 8 |
|
| 9 |
aws_access_key_id = os.environ.get('AMRA_AWS_ACCESS_KEY_ID')
|
| 10 |
aws_secret_access_key = os.environ.get('AMRA_AWS_SECRET_ACCESS_KEY')
|
| 11 |
openai_api_key = os.environ.get('AMRA_OPENAI_API_KEY')
|
| 12 |
|
| 13 |
+
default_region = "Spine"
|
| 14 |
+
|
| 15 |
device_options={
|
| 16 |
"secondary extraction":False,
|
| 17 |
"secondary extraction count":0
|
|
|
|
| 109 |
f"return the results on the same line separated by commas.",
|
| 110 |
]
|
| 111 |
|
| 112 |
+
accepted_year_inst=[
|
| 113 |
+
f"extract the acceptance year of the article from the system text.",
|
| 114 |
+
f"return the results on a single line as 'Accepted Year: <year>.",
|
| 115 |
]
|
| 116 |
|
| 117 |
+
accepted_month_inst=[
|
| 118 |
+
f"extract the acceptance month of the article from the system text.",
|
| 119 |
+
f"return the results on a single line as 'Accepted Month: <month>.",
|
| 120 |
+
]
|
| 121 |
+
|
| 122 |
+
abstract_inst=[
|
| 123 |
+
f"Extract the abstract of the article from the system text, and return its original text. Normally, the abstract is before the introduction and might a paragraph, or in sections of study design, objective, summary of background, methods, results, and conclusion with keywords section in the end.",
|
| 124 |
+
]
|
| 125 |
+
|
| 126 |
'''
|
| 127 |
application default data
|
| 128 |
'''
|
| 129 |
app_data = {
|
| 130 |
+
"current article":{},
|
| 131 |
"articles":[],
|
| 132 |
+
"prompts":{},
|
| 133 |
"terms":[],
|
| 134 |
+
"paths":{}
|
|
|
|
| 135 |
}
|
cloud_textract.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import boto3
|
| 2 |
|
| 3 |
from utility import terminal_print, create_md_table
|
| 4 |
-
from application import aws_access_key_id, aws_secret_access_key
|
| 5 |
|
| 6 |
textract = boto3.client(
|
| 7 |
'textract',
|
|
@@ -190,7 +190,7 @@ def textract_output_to_table(table,blocks_dict):
|
|
| 190 |
array.append([])
|
| 191 |
cur_row = r_id
|
| 192 |
if "Relationships" in c:
|
| 193 |
-
words = [blocks_dict[i]["Text"] for i in c["Relationships"][0]["Ids"]]
|
| 194 |
else:
|
| 195 |
words =[""]
|
| 196 |
# print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
|
|
@@ -199,7 +199,7 @@ def textract_output_to_table(table,blocks_dict):
|
|
| 199 |
return array
|
| 200 |
|
| 201 |
@terminal_print
|
| 202 |
-
def get_tables(filename:str,bucket:str=
|
| 203 |
'''
|
| 204 |
This function is used to get the tables from the textract output
|
| 205 |
|
|
|
|
| 1 |
import boto3
|
| 2 |
|
| 3 |
from utility import terminal_print, create_md_table
|
| 4 |
+
from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket
|
| 5 |
|
| 6 |
textract = boto3.client(
|
| 7 |
'textract',
|
|
|
|
| 190 |
array.append([])
|
| 191 |
cur_row = r_id
|
| 192 |
if "Relationships" in c:
|
| 193 |
+
words = [blocks_dict[i]["Text"] for i in c["Relationships"][0]["Ids"] if blocks_dict[i]["BlockType"] == "WORD"]
|
| 194 |
else:
|
| 195 |
words =[""]
|
| 196 |
# print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
|
|
|
|
| 199 |
return array
|
| 200 |
|
| 201 |
@terminal_print
|
| 202 |
+
def get_tables(filename:str,bucket:str=default_s3_bucket):
|
| 203 |
'''
|
| 204 |
This function is used to get the tables from the textract output
|
| 205 |
|