Spaces:

amra-ai
/

studies

Runtime error

Roland Ding committed on Nov 20, 2023

Commit

fdccd1a

1 Parent(s): 352e336

Major backend_update

UI update:
+ Revised ui_studies to:
- show all studies articles
- added a control for uploading multiple files
+ Revised ui_study to:
- update the article list in real time
- realigned the layout to make it more compact

Features:
+ added backend methods to upload multiple files
+ added post_process function to execute Futable commands
+ added retry_decorator for retrying failed openai inquiries
+ touchup for including futable results in extraction list for display
+ changed extraction collections to set
+ moved the llm instance to module-level (global) scope in chains.py
+ added root message for agent declaration before the first inquiry

Cloud:
+ updated get_table to handle pagination.

Others:
+ touched up remove_symbols in utility.py to better filter article content and characters.

cleanup:
+ removed cloud_textract.py

On branch main
Changes to be committed:
modified: chains.py
modified: cloud_db.py
deleted: cloud_textract.py
modified: features.py
modified: ui_studies.py
modified: ui_study.py
modified: utility.py

Files changed (7) hide show

chains.py +22 -19
cloud_db.py +6 -2
cloud_textract.py +0 -230
features.py +29 -8
ui_studies.py +22 -20
ui_study.py +5 -6
utility.py +24 -281

chains.py CHANGED Viewed

@@ -2,16 +2,23 @@ import asyncio
 import openai
 from langchain.chat_models import ChatOpenAI
 from langchain.prompts.chat import ChatPromptTemplate
 from langchain.schema import BaseOutputParser
 from application import *
 from utility import read_pdf,aterminal_print
-class Replacement(BaseOutputParser):
-    """Parse the output of an LLM call to a comma-separated list."""
     def parse(self, text: str, **kwargs):
         """Parse the output of an LLM call."""
         if kwargs:
@@ -21,24 +28,23 @@ class Replacement(BaseOutputParser):
 @aterminal_print # need to review this.
 async def async_generate(article,name,chain,replacement_term=None):
     if replacement_term:
-        resp = await chain.ainvoke({"term":replacement_term})
     else:
-        resp = await chain.ainvoke({"term":""})
-    article[name] = resp.content
 @aterminal_print # need to review this.
 async def execute_concurrent(article,prompts):
-    llm = ChatOpenAI(
-        temperature=0.0,
-        model_name="gpt-3.5-turbo-16k",
-        openai_api_key=openai.api_key)
     tasks = []
     prompt_type = article["logic"]
     prompt_list = list(prompts.keys())
-    print(prompt_list)
-    # for name,p in prompts.items():
     while prompt_list:
         name = prompt_list.pop(0)
         p = prompts[name]
@@ -49,8 +55,11 @@ async def execute_concurrent(article,prompts):
         print("executing",p["assessment_step"],name)
         input_text = "".join([article[s] for s in p["input_list"]])
         chat_prompt = ChatPromptTemplate.from_messages([
             ("human",input_text),
             ("system",p[prompt_type]),
         ])
@@ -85,16 +94,10 @@ if __name__ == "__main__":
     sample_content,_ = read_pdf(sample_artice)
     llm = ChatOpenAI(temperature=0.0,model_name="gpt-3.5-turbo-16k")
-    # with open(".prompts/other/Need for ICU.txt") as f:
-    #     prompt = f.read()
-    #     name = "Need for ICU"
     with open(".prompts/other/Operation Time.txt") as f:
         prompt = f.read()
         name = "Operation Time"
-    # with open(".prompts/other/Blood Loss.txt") as f:
-    #     prompt = f.read()
-    #     name = "Blood Loss"
     post_prompt_maping = {}
     post_replace_term = lambda res,map=post_prompt_maping:replace_term(res,map=map)

 import openai
 from langchain.chat_models import ChatOpenAI
+from langchain.chat_models.openai import _create_retry_decorator
 from langchain.prompts.chat import ChatPromptTemplate
 from langchain.schema import BaseOutputParser
 from application import *
 from utility import read_pdf,aterminal_print
+llm = ChatOpenAI(
+    temperature=0.0,
+    model_name="gpt-3.5-turbo-16k",
+    openai_api_key=openai.api_key)
+retry_decorator = _create_retry_decorator(llm)
+class Replacement(BaseOutputParser):
+    """Parse the output of an LLM call to a comma-separated list."""
     def parse(self, text: str, **kwargs):
         """Parse the output of an LLM call."""
         if kwargs:
 @aterminal_print # need to review this.
 async def async_generate(article,name,chain,replacement_term=None):
     if replacement_term:
+        res = await chain.ainvoke({"term":replacement_term})
     else:
+        res = await chain.ainvoke({"term":""})
+    print("completed",name)
+    article[name] = res.content
 @aterminal_print # need to review this.
+@retry_decorator
 async def execute_concurrent(article,prompts):
     tasks = []
     prompt_type = article["logic"]
     prompt_list = list(prompts.keys())
+    i = 0
     while prompt_list:
         name = prompt_list.pop(0)
         p = prompts[name]
         print("executing",p["assessment_step"],name)
         input_text = "".join([article[s] for s in p["input_list"]])
+        # with open(f".outputs/{i}_{name}.txt","w+") as f:
+        #     f.write(input_text)
+        #     f.write(p[prompt_type])
         chat_prompt = ChatPromptTemplate.from_messages([
+            ("system","You are a helpful AI that can answer questions about clinical trail and operation studies."),
             ("human",input_text),
             ("system",p[prompt_type]),
         ])
     sample_content,_ = read_pdf(sample_artice)
     llm = ChatOpenAI(temperature=0.0,model_name="gpt-3.5-turbo-16k")
     with open(".prompts/other/Operation Time.txt") as f:
         prompt = f.read()
         name = "Operation Time"
     post_prompt_maping = {}
     post_replace_term = lambda res,map=post_prompt_maping:replace_term(res,map=map)

cloud_db.py CHANGED Viewed

@@ -17,8 +17,12 @@ dynamodb data operations
 # get the list of articles from articles table in dynamodb
 @terminal_print
 def get_table(table_name:str):
-    result = db_client.scan(TableName = table_name)#,AttributesToGet = data_structure[table_name]["fields"])
-    return [db_map_to_py_dict(r) for r in result["Items"]]
 # add a new article to table articles in dynamodb, return error if failed
 def post_item(table_name:str,item:dict):

 # get the list of articles from articles table in dynamodb
 @terminal_print
 def get_table(table_name:str):
+    result = db_client.scan(TableName = table_name)
+    items = result["Items"]
+    while "LastEvaluatedKey" in result:
+        result = db_client.scan(TableName = table_name,ExclusiveStartKey = result["LastEvaluatedKey"])
+        items.extend(result["Items"])
+    return [db_map_to_py_dict(r) for r in items]
 # add a new article to table articles in dynamodb, return error if failed
 def post_item(table_name:str,item:dict):

cloud_textract.py DELETED Viewed

@@ -1,230 +0,0 @@
-import boto3
-from utility import terminal_print, create_md_table
-from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket
-textract = boto3.client(
-    'textract',
-    aws_access_key_id=aws_access_key_id,
-    aws_secret_access_key=aws_secret_access_key,
-    region_name='us-east-1')
-@terminal_print
-def textract_get_tables(res_tables,textract=textract):
-    '''
-    This function is used to get the tables from the textract output
-    Parameters:
-    res_tables: the output from the textract.get_document_analysis function
-    textract: the boto3 client for textract
-    Returns:
-    result: the cascaded output with blocks from the textract.get_document_analysis function
-    '''
-    job_id = res_tables["JobId"]
-    temp = result = res_tables.copy()
-    while "NextToken" in temp:
-        temp = textract.get_document_analysis(JobId=job_id,NextToken=temp["NextToken"])
-        result["Blocks"].extend(temp["Blocks"])
-    return result
-@terminal_print
-def textract_get_text(res_text,textract=textract):
-    '''
-    This function is used to get the text from the textract output
-    Parameters:
-    res_text: the output from the textract.get_document_text_detection function
-    textract: the boto3 client for textract
-    Returns:
-    result: the cascaded output with blocks from the textract.get_document_text_detection function
-    '''
-    job_id = res_text["JobId"]
-    temp = result = res_text.copy()
-    while "NextToken" in temp:
-        temp = textract.get_document_text_detection(JobId=job_id,NextToken=temp["NextToken"])
-        result["Blocks"].extend(temp["Blocks"])
-    return result
-@terminal_print
-def get_article_tables(file_name:str,bucket:str,delay:int=5):
-    '''
-    This function is used to get the tables from the textract output
-    Parameters:
-    file_name: the name of the file in the bucket
-    bucket: the name of the bucket
-    delay: the delay time for the textract.get_document_analysis function
-    Returns:
-    res_tables: the output from the textract.get_document_analysis function with initial blocks
-    '''
-    import time
-    # need to use async method to process the files
-    job_tables = textract.start_document_analysis(
-        DocumentLocation={
-            "S3Object":{
-                "Bucket":bucket,
-                "Name": file_name
-                }
-            },
-        FeatureTypes=["TABLES"]
-        )
-    table_job_id = job_tables["JobId"]
-    res_tables = {"JobStatus":"IN_PROGRESS"}
-    while res_tables["JobStatus"] == "IN_PROGRESS":
-        time.sleep(delay)
-        res_tables = textract.get_document_analysis(JobId=table_job_id)
-    res_tables["JobId"] = table_job_id
-    return res_tables
-@terminal_print
-def get_article_text(file_name:str,bucket:str,delay:int=5):
-    '''
-    This function is used to get the text from the textract output
-    Parameters:
-    file_name: the name of the file in the bucket
-    bucket: the name of the bucket
-    delay: the delay time for the textract.get_document_text_detection function
-    Returns:
-    res_text: the output from the textract.get_document_text_detection function with initial blocks
-    '''
-    import time
-    job_text = textract.start_document_text_detection(
-        DocumentLocation={
-            "S3Object":{
-                "Bucket":bucket,
-                "Name": file_name
-                }
-            }
-        )
-    text_job_id = job_text["JobId"]
-    res_text = {"JobStatus":"IN_PROGRESS"}
-    while res_text["JobStatus"] == "IN_PROGRESS":
-        time.sleep(delay)
-        if res_text["JobStatus"] == "IN_PROGRESS":
-            res_text = textract.get_document_text_detection(JobId=text_job_id)
-    res_text["JobId"] = text_job_id
-    return res_text
-@terminal_print
-def construct_tables(tables):
-    '''
-    This function is used to construct the tables from the textract output
-    Parameters:
-    tables: the output from the textract.get_document_analysis function
-    Returns:
-    table_blocks: the list of tables with the blocks
-    blocks_dict: the dictionary of blocks with the block id as the key
-    '''
-    blocks = tables["Blocks"]
-    blocks_dict = {}
-    table_blocks = []
-    for b in blocks:
-        blocks_dict[b["Id"]] = b
-        if b["BlockType"] == "TABLE":
-            temp = {
-                "id":b["Id"],
-                "relationship":b["Relationships"],
-                "confidence":b["Confidence"],
-                "page":b["Page"],
-                "map":{}
-            }
-            table_blocks.append(temp)
-    for t in table_blocks:
-        for e in t["relationship"]:
-            t["map"].update({id:{"Type":e["Type"]} for id in e["Ids"]})
-        for id in t["map"]:
-            component = blocks_dict[id]
-            if component["BlockType"] not in t:
-                t[component["BlockType"]] = []
-            t[component["BlockType"]].append(component)
-        # table_blocks.append(t)
-    return table_blocks, blocks_dict
-# Transfer the table blocks from aws textract into a table
-@terminal_print
-def textract_output_to_table(table,blocks_dict):
-    '''
-    This function is used to transfer the table blocks from aws textract into a table
-    Parameters:
-    table: the table block from the textract output
-    blocks_dict: the dictionary of blocks with the block id as the key
-    Returns:
-    array: the table array with the text from the table blocks
-    '''
-    array = [[]]
-    cur_row = 1
-    for c in table["CELL"]:
-        r_id = c["RowIndex"]
-        if r_id > cur_row:
-            array.append([])
-            cur_row = r_id
-        if "Relationships" in c:
-            words = [blocks_dict[i]["Text"] for i in  c["Relationships"][0]["Ids"] if blocks_dict[i]["BlockType"] == "WORD"]
-        else:
-            words =[""]
-        # print(c["RowIndex"],c["ColumnIndex"]," ".join(words))
-        array[-1].append(" ".join(words))
-    return array
-@terminal_print
-def get_tables(filename:str,bucket:str=default_s3_bucket):
-    '''
-    This function is used to get the tables from the textract output
-    Parameters:
-    filename: the name of the file in the bucket
-    bucket: the name of the bucket
-    Returns:
-    md_tables: the list of tables in markdown format
-    '''
-    tables_temp = get_article_tables(file_name=filename,bucket=bucket)
-    tables = textract_get_tables(tables_temp)
-    table_blocks,block_dict = construct_tables(tables)
-    md_tables = []
-    # review table and exclude the reference table if any
-    for table in table_blocks:
-        table_array = textract_output_to_table(table,block_dict)
-        md_tables.append(create_md_table(table_array))
-    return md_tables
-def is_reference_table(table):
-    return

features.py CHANGED Viewed

@@ -69,10 +69,7 @@ def process_study( # need revision
         ):
     if study_file_obj:
-        if type(study_file_obj) is list:
-            article = add_article(domain,study_file_obj[0])
-        else:
-            article = add_article(domain,study_file_obj)
     elif study_content:
         article = add_article(domain,study_content,file_object=False)
     else:
@@ -89,6 +86,7 @@ def process_study( # need revision
     # set the current article to the completed article object
     app_data["current_article"] = article
     # update the article to the cloud
     try:
@@ -103,6 +101,29 @@ def process_study( # need revision
     return overview, detail_views
 @terminal_print
 def update_article_segment(article):
     # get the key content between article objective and discussion
@@ -502,8 +523,8 @@ def select_performance_prompts(article,performance_assessment):
                 else:
                     valid_prompts[p]["term"].update({t["term"]:t})
                 if performance_assessment not in article["extraction"]:
-                    article["extraction"][performance_assessment] = []
-                article["extraction"][performance_assessment].append(prompt["prompt_name"])
     return valid_prompts
@@ -633,7 +654,7 @@ def run_executor(article,prompt):
         case "f_summary_term":
             f_summary_term(article,prompt)
 @terminal_print
 def post_process(article):
     post_inputs = {}
@@ -657,7 +678,7 @@ def post_process(article):
     for assessment,post_input in post_inputs.items():
         instruction_agg = app_data["prompts_agg"][assessment]
         article[instruction_agg["name"]] = chain.invoke({"text":post_input,"instruction":instruction_agg["chain"][0]}).content
-        article["extraction"][assessment].append(instruction_agg["name"])
 def add_inst(instructions,prompt):

         ):
     if study_file_obj:
+        article = add_article(domain,study_file_obj)
     elif study_content:
         article = add_article(domain,study_content,file_object=False)
     else:
     # set the current article to the completed article object
     app_data["current_article"] = article
+    app_data["articles"][article["name"]] = article
     # update the article to the cloud
     try:
     return overview, detail_views
+@terminal_print
+def process_studies(
+    domain,
+    file_objs):
+    for file_obj in file_objs:
+        process_study(domain,file_obj,None)
+    return gr.update(value=create_md_tables(app_data["articles"]))
+@terminal_print
+def create_md_tables(articles):
+    '''
+    create markdown tables for the articles.
+    '''
+    md_text = ""
+    md_text += "| Article Name | Authors | Domain | Upload Time |\n| --- | --- | --- | --- |\n"
+    for name, article in articles.items():
+        md_table = f"| {name} | {article['Authors']} |{article['domain']} | {article['upload_time']} | \n"
+        md_text += md_table
+    return md_text
 @terminal_print
 def update_article_segment(article):
     # get the key content between article objective and discussion
                 else:
                     valid_prompts[p]["term"].update({t["term"]:t})
                 if performance_assessment not in article["extraction"]:
+                    article["extraction"][performance_assessment] = set()
+                article["extraction"][performance_assessment].add(prompt["prompt_name"])
     return valid_prompts
         case "f_summary_term":
             f_summary_term(article,prompt)
+@retry_decorator
 @terminal_print
 def post_process(article):
     post_inputs = {}
     for assessment,post_input in post_inputs.items():
         instruction_agg = app_data["prompts_agg"][assessment]
         article[instruction_agg["name"]] = chain.invoke({"text":post_input,"instruction":instruction_agg["chain"][0]}).content
+        article["extraction"][assessment].add(instruction_agg["name"])
 def add_inst(instructions,prompt):

ui_studies.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 from application import *
-from features import init_app_data
 from utility import terminal_print
 def refresh():
@@ -11,29 +11,31 @@ def refresh():
     '''
     return create_md_tables(app_data["articles"])
-def create_md_tables(articles):
-    '''
-    create markdown tables for the articles.
-    '''
-    md_text = ""
-    md_text += "| Domain | File Name | Upload Time | Device |\n| --- | --- | --- | --- |\n"
-    for article in articles:
-        md_table = f"| {article['domain']} | {article['name']} | {article['upload_time']} | {default_region} |\n"
-        md_text += md_table
-    return md_text
 @terminal_print
 def init_studies_page():
     with gr.Blocks() as studies_page:
-        with gr.Row():
-            gr.Markdown("## Article Lists")
-            btn_refresh = gr.Button(value="Refresh",variant="primary")
-        gr.HTML("<hr>")
-        article_list = gr.Markdown("")
         btn_refresh.click(
             fn=refresh,

 import gradio as gr
 from application import *
+from features import init_app_data,process_studies,create_md_tables
 from utility import terminal_print
 def refresh():
     '''
     return create_md_tables(app_data["articles"])
 @terminal_print
 def init_studies_page():
     with gr.Blocks() as studies_page:
+        with gr.Row(equal_height=False):
+            with gr.Column():
+                gr.Markdown("## Clinical Studies")
+                domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
+                upload_studies = gr.File(label="Upload clinical study reports",type="file",file_count="multiple")
+                btn_upload_studies = gr.Button(value="Upload",variant="primary")
+            with gr.Column():
+                gr.Markdown("## Article Lists")
+                btn_refresh = gr.Button(value="Refresh",variant="primary")
+                gr.HTML("<hr>")
+                article_list = gr.Markdown("")
+        btn_upload_studies.click(
+            process_studies,
+            inputs=[
+                domain,
+                upload_studies,
+            ],
+            outputs=[
+                article_list,
+            ],
+        )
         btn_refresh.click(
             fn=refresh,

ui_study.py CHANGED Viewed

@@ -25,16 +25,15 @@ def init_study_page():
             with gr.Column():
                 gr.Markdown("## Studies")
                 gr.HTML("<hr>")
-                upload_study = gr.File(label="Upload a clinical study report",type="file",file_count="multiple")
-            with gr.Column():
                 domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
-                input_study = gr.TextArea(label="Or paste a clinical study report content",placeholder="Paste content here...",lines=5)
                 with gr.Row():
                     btn_reset = gr.Button(value="Reset",variant="stop")
                     btn_add_study = gr.Button(value="Add",variant="primary")
         gr.HTML("<hr>")
         with gr.Row():

             with gr.Column():
                 gr.Markdown("## Studies")
                 gr.HTML("<hr>")
                 domain = gr.Radio(label="Anatomical Region",choices=anatomic_domains,value=default_region)
+                upload_study = gr.File(label="Upload a clinical study report",type="file",file_count="single")
                 with gr.Row():
                     btn_reset = gr.Button(value="Reset",variant="stop")
                     btn_add_study = gr.Button(value="Add",variant="primary")
+            with gr.Column():
+                input_study = gr.TextArea(label="Or paste a clinical study report content",placeholder="Paste content here...",lines=5)
         gr.HTML("<hr>")
         with gr.Row():

utility.py CHANGED Viewed

@@ -50,25 +50,11 @@ def terminal_print(func):
 following functions are for file manipulation
 '''
-@terminal_print
 @terminal_print
 def read_pdf(file_path):
     '''
     this function read the pdf file and return the text
-    Parameters
-    ----------
-    file_path : str
-        path to the pdf file
-    Returns
-    -------
-    text : str
-        text extracted from the pdf file
-    '''
-    '''
-    this function read the pdf file and return the text
     Parameters
     ----------
     file_path : str
@@ -83,11 +69,6 @@ def read_pdf(file_path):
     if type(file_path) is str:
         file_obj = open(file_path, 'rb')
     # elif type(file_path) is tempfile._TemporaryFileWrapper:
-    else:
-        file_obj = open(file_path.name, 'rb')
-    if type(file_path) is str:
-        file_obj = open(file_path, 'rb')
-    # elif type(file_path) is tempfile._TemporaryFileWrapper:
     else:
         file_obj = open(file_path.name, 'rb')
@@ -98,22 +79,11 @@ def read_pdf(file_path):
     parser = PDFParser(file_obj)
     doc = PDFDocument(parser)
-    meta = doc.info
-    text = extract_text(file_obj)
-    text = remove_symbols(text)
-    text = remove_citation(text)
-    parser = PDFParser(file_obj)
-    doc = PDFDocument(parser)
     meta = doc.info
     # close the pdf file object
     file_obj.close()
     return text, meta
-    file_obj.close()
-    return text, meta
 '''
 following functions are for format standard response
@@ -136,22 +106,7 @@ def format_response(code,data):
     dict
         formatted response
     '''
-    '''
-    this function format the response to be returned to the client.
-    this is used for lambda serverless framework to return the response.
-    Parameters
-    ----------
-    code : int
-        status code
-    data : dict
-        data to be returned to the client
-    Returns
-    -------
-    dict
-        formatted response
-    '''
     return {
         "statusCode":code,
         "headers":{
@@ -171,19 +126,6 @@ def format_text(text,remove_char_ls = ["\\n--\\n","\\n\\n","\n"]):
     '''
     this function format the text output by removing excessive characters
-    Parameters
-    ----------
-    text : str
-        text to be processed
-    Returns
-    -------
-    str
-        processed text
-    '''
-    '''
-    this function format the text output by removing excessive characters
     Parameters
     ----------
     text : str
@@ -199,7 +141,6 @@ def format_text(text,remove_char_ls = ["\\n--\\n","\\n\\n","\n"]):
     return text
-@terminal_print
 @terminal_print
 def remove_symbols(text):
     '''
@@ -215,20 +156,8 @@ def remove_symbols(text):
     str
         processed text
     '''
-    '''
-    this function remove symbols that are not in unicode
-    Parameters
-    ----------
-    text : str
-        text to be processed
-    Returns
-    -------
-    str
-        processed text
-    '''
-    text = text.encode("ascii", "ignore").decode()
     text = text.replace('-\n', '')
     return text
@@ -249,42 +178,11 @@ def remove_citation(text):
     '''
     return re.sub(r'\(cid:\d+\)','',text)
-@terminal_print
-@terminal_print
-def remove_citation(text):
-    '''
-    this function remove citation pattern in the text
-    Parameters
-    ----------
-    text : str
-        text to be processed
-    Returns
-    -------
-    str
-        processed text
-    '''
-    return re.sub(r'\(cid:\d+\)','',text)
 @terminal_print
 def str_to_tuple(s):
     '''
     this function convert string to tuple
-    Parameters
-    ----------
-    s : str
-        string to be converted
-    Returns
-    -------
-    tuple
-        converted tuple
-    '''
-    '''
-    this function convert string to tuple
     Parameters
     ----------
     s : str
@@ -312,53 +210,28 @@ def replace_symbols(s):
     str
         replaced string
     '''
-    s = s.replace(" ","_")
-    s = s.replace(",","")
-    s = s.replace(".","")
-    s = s.replace("-","_")
-    s = s.replace("(","")
-    s = s.replace(")","")
-    s = s.replace("/","_")
-    s = s.replace(":","")
-    s = s.replace(";","")
-    s = s.replace("'","")
-    s = s.replace('"',"")
-    return s
-@terminal_print
-def replace_symbols(s):
-    '''
-    this function replace symbols in the string to comply with file names
-    Parameters
-    ----------
-    s : str
-        string to be replaced
-    Returns
-    -------
-    str
-        replaced string
-    '''
-    s = s.replace(" ","_")
-    s = s.replace(",","")
-    s = s.replace(".","")
-    s = s.replace("-","_")
-    s = s.replace("(","")
-    s = s.replace(")","")
-    s = s.replace("/","_")
-    s = s.replace(":","")
-    s = s.replace(";","")
-    s = s.replace("'","")
-    s = s.replace('"',"")
     return s
 '''
 following functions are for dynamodb data manipulation
 '''
-# @terminal_print
 # @terminal_print
 def db_map_to_py_dict(db_map):
     '''
@@ -374,19 +247,7 @@ def db_map_to_py_dict(db_map):
     dict
         python dictionary
     '''
-    '''
-    this function convert dynamodb map data structure to python dictionary
-    Parameters
-    ----------
-    db_map : dict
-        dynamodb map
-    Returns
-    -------
-    dict
-        python dictionary
-    '''
     py_dict = {}
     for k,i in db_map.items():
         for l,v in i.items():
@@ -404,36 +265,16 @@ def db_map_to_py_dict(db_map):
                 py_dict[k] = v
             elif l =="NULL":
                 py_dict[k] = None
-            elif l == "BS":
-                py_dict[k] = v
-            elif l == "BOOL":
-                py_dict[k] = v
-            elif l =="NULL":
-                py_dict[k] = None
             else:
                 py_dict[k] = v
     return py_dict
-# @terminal_print
 # @terminal_print
 def py_dict_to_db_map(py_dict):
     '''
     this function convert python dictionary to dynamodb map data structure
-    Parameters
-    ----------
-    py_dict : dict
-        python dictionary
-    Returns
-    -------
-    dict
-        dynamodb map
-    '''
-    '''
-    this function convert python dictionary to dynamodb map data structure
     Parameters
     ----------
     py_dict : dict
@@ -464,35 +305,13 @@ def py_dict_to_db_map(py_dict):
             db_map[key] = {"NULL":True}
         elif type(value) is set:
             db_map[key] = {"L":py_list_to_db_list(value)}
-        elif type(value) is bytes:
-            db_map[key] = {"B":value}
-        elif type(value) is bool:
-            db_map[key] = {"BOOL":value}
-        elif value is None:
-            db_map[key] = {"NULL":True}
-        elif type(value) is set:
-            db_map[key] = {"L":py_list_to_db_list(value)}
     return db_map
-# @terminal_print
 # @terminal_print
 def db_list_to_py_list(db_list):
     '''
     this function convert dynamodb list data structure to python list
-    Parameters
-    ----------
-    db_list : list
-        dynamodb list
-    Returns
-    -------
-    list
-        python list
-    '''
-    '''
-    this function convert dynamodb list data structure to python list
     Parameters
     ----------
     db_list : list
@@ -517,43 +336,20 @@ def db_list_to_py_list(db_list):
                     py_list.append(int(v))
             elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
                 py_list.append(v)
-            elif t =="N":
-                if "." in v:
-                    py_list.append(float(v))
-                else:
-                    py_list.append(int(v))
-            elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
-                py_list.append(v)
             elif t =="B" or t =="BS":
                 py_list.append(bytes(v,"utf-8"))
             elif t =="NULL":
                 py_list.append(None)
-            elif t =="BOOL":
-                py_list.append(bool(v))
             else:
                 py_list.append(db_map_to_py_dict(v))
     return py_list
-# @terminal_print
 # @terminal_print
 def py_list_to_db_list(py_list):
     '''
     this function convert python list to dynamodb list data structure
-    Parameters
-    ----------
-    py_list : list
-        python list
-    Returns
-    -------
-    list
-        dynamodb list
-    '''
-    '''
-    this function convert python list to dynamodb list data structure
     Parameters
     ----------
     py_list : list
@@ -570,12 +366,8 @@ def py_list_to_db_list(py_list):
             item = {"S":value}
         elif type(value) is int or type(value) is float:
             item = {"N":str(value)}
-        elif type(value) is int or type(value) is float:
-            item = {"N":str(value)}
         elif type(value) is dict:
             item = {"M":py_dict_to_db_map(value)}
-            # item = py_dict_to_db_map(value)
-            # item = py_dict_to_db_map(value)
         elif type(value) is list:
             item = {"L":py_list_to_db_list(value)}
         elif type(value) is tuple:
@@ -588,66 +380,17 @@ def py_list_to_db_list(py_list):
             item = {"NULL":True}
         elif type(value) is set:
             item = {"L":py_list_to_db_list(value)}
-        elif type(value) is tuple:
-            item = {"L":py_list_to_db_list(value)}
-        elif type(value) is bytes:
-            item = {"B":value}
-        elif type(value) is bool:
-            item = {"BOOL":value}
-        elif value is None:
-            item = {"NULL":True}
         db_list.append(item)
     return db_list
 def list_dict_to_dict(ls,key):
-    result_dict = {}
-    for d in ls:
-        if key in d:
-            result_dict[d[key]] = d
-    return result_dict
-'''
-following functions are for markdown table creation
-'''
-@terminal_print
-def create_md_table(array):
-    '''
-    create markdown tables for an array.
-    Parameters
-    ----------
-    array: list
-        a table in the form of a list of lists
-    Returns
-    -------
-        md_table: str
-    '''
-    md_table = ""
-    for i,row in enumerate(array):
-        md_row = ""
-        for item in row:
-            md_item = f"| {item} "
-            md_row += md_item
-        md_row += "|\n"
-        md_table += md_row
-        if i == 0:
-            md_table += f"| {' | '.join(['---' for _ in range(len(row))])} |\n"
-    return md_table
-def list_dict_to_dict(ls,key):
-    result_dict = {}
-    for d in ls:
-        if key in d:
-            result_dict[d[key]] = d
-    return result_dict
 '''
 following functions are for markdown table creation

 following functions are for file manipulation
 '''
 @terminal_print
 def read_pdf(file_path):
     '''
     this function read the pdf file and return the text
     Parameters
     ----------
     file_path : str
     if type(file_path) is str:
         file_obj = open(file_path, 'rb')
     # elif type(file_path) is tempfile._TemporaryFileWrapper:
     else:
         file_obj = open(file_path.name, 'rb')
     parser = PDFParser(file_obj)
     doc = PDFDocument(parser)
     meta = doc.info
     # close the pdf file object
     file_obj.close()
     return text, meta
 '''
 following functions are for format standard response
     dict
         formatted response
     '''
     return {
         "statusCode":code,
         "headers":{
     '''
     this function format the text output by removing excessive characters
     Parameters
     ----------
     text : str
     return text
 @terminal_print
 def remove_symbols(text):
     '''
     str
         processed text
     '''
+    import re
+    text = re.sub(r"[^a-zA-Z0-9\n\r]+", ' ', text)
     text = text.replace('-\n', '')
     return text
     '''
     return re.sub(r'\(cid:\d+\)','',text)
 @terminal_print
 def str_to_tuple(s):
     '''
     this function convert string to tuple
     Parameters
     ----------
     s : str
     str
         replaced string
     '''
+    symbols_map = {
+        " ":"_",
+        ",":"",
+        ".":"",
+        "-":"_",
+        "(":"",
+        ")":"",
+        "/":"_",
+        ":":"",
+        ";":"",
+        "'":"",
+        '"':""
+    }
+    for symbol in symbols_map:
+        s = s.replace(symbol,symbols_map[symbol])
     return s
 '''
 following functions are for dynamodb data manipulation
 '''
 # @terminal_print
 def db_map_to_py_dict(db_map):
     '''
     dict
         python dictionary
     '''
     py_dict = {}
     for k,i in db_map.items():
         for l,v in i.items():
                 py_dict[k] = v
             elif l =="NULL":
                 py_dict[k] = None
             else:
                 py_dict[k] = v
     return py_dict
 # @terminal_print
 def py_dict_to_db_map(py_dict):
     '''
     this function convert python dictionary to dynamodb map data structure
     Parameters
     ----------
     py_dict : dict
             db_map[key] = {"NULL":True}
         elif type(value) is set:
             db_map[key] = {"L":py_list_to_db_list(value)}
     return db_map
 # @terminal_print
 def db_list_to_py_list(db_list):
     '''
     this function convert dynamodb list data structure to python list
     Parameters
     ----------
     db_list : list
                     py_list.append(int(v))
             elif t =="S" or t =="BOOL" or t =="SS" or t =="NS":
                 py_list.append(v)
             elif t =="B" or t =="BS":
                 py_list.append(bytes(v,"utf-8"))
             elif t =="NULL":
                 py_list.append(None)
             else:
                 py_list.append(db_map_to_py_dict(v))
     return py_list
 # @terminal_print
 def py_list_to_db_list(py_list):
     '''
     this function convert python list to dynamodb list data structure
     Parameters
     ----------
     py_list : list
             item = {"S":value}
         elif type(value) is int or type(value) is float:
             item = {"N":str(value)}
         elif type(value) is dict:
             item = {"M":py_dict_to_db_map(value)}
         elif type(value) is list:
             item = {"L":py_list_to_db_list(value)}
         elif type(value) is tuple:
             item = {"NULL":True}
         elif type(value) is set:
             item = {"L":py_list_to_db_list(value)}
         db_list.append(item)
     return db_list
 def list_dict_to_dict(ls,key):
+    if all([key in d for d in ls]):
+        return {d[key]:d for d in ls}
+    else:
+        print("key not found in all dictionaries")
+        return {}
 '''
 following functions are for markdown table creation