Spaces:

amra-ai
/

devices

Sleeping

File size: 6,476 Bytes

import json

from application import *

import tiktoken
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# module-level tiktoken encoder for the "cl100k_base" encoding, created once at import
# time and used by count_tokens below to turn text into tokens
token_encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    '''
    Count how many tokens the module-level encoder produces for a text.

    Parameters
    ----------
    text: str
        text to be tokenized and counted

    Returns
    -------
    n_tokens: int
        length of the token sequence returned by ``token_encoder.encode``
    '''
    token_ids = token_encoder.encode(text)
    n_tokens = len(token_ids)
    return n_tokens

'''
following functions are for file manipulation
'''

def keyword_search(kw, text):
    '''Return True when `kw` occurs in `text`, comparing both in lower case.'''
    needle = kw.lower()
    haystack = text.lower()
    return needle in haystack


def list_or(l):
    '''Return True when the sum of the items of `l` is greater than zero.'''
    total = sum(l)
    return total > 0


def list_and(l):
    '''Return True when the sum of the items of `l` equals the length of `l`.'''
    total = sum(l)
    return total == len(l)

# read pdf file and return text
def read_pdf(file_path):
    '''
    Read a PDF file and return its cleaned text together with its document info.

    Parameters
    ----------
    file_path: str or object with a ``name`` attribute
        path of the PDF file, or an object that carries the path in its
        ``name`` attribute (for example a temporary-file wrapper)

    Returns
    -------
    text: str
        text extracted by pdfminer's ``extract_text`` and then passed
        through ``remove_symbols``
    meta: list
        the ``info`` attribute of the parsed ``PDFDocument``
        (presumably the document information dictionaries -- confirm
        against the pdfminer documentation)
    '''
    # anything that is not a path string is expected to expose the path via `.name`
    path = file_path if isinstance(file_path, str) else file_path.name

    # the context manager guarantees the handle is closed even when pdfminer raises
    with open(path, 'rb') as pdf_file:
        text = extract_text(pdf_file)
        text = remove_symbols(text)

        # the same open handle is reused to parse the document structure
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        meta = doc.info

    return text, meta

'''
the following functions format standard responses
'''

# format standard response for status code and data
def format_response(code, data):
    '''
    Wrap a status code and a payload into the standard response dictionary.

    Parameters
    ----------
    code: int
        value stored under "statusCode"
    data: object
        JSON-serializable payload; stored under "body" as a JSON string

    Returns
    -------
    response: dict
        dictionary with "statusCode", "headers", "body" and "isBase64Encoded"
    '''
    response_headers = {
        "Access-Control-Allow-Origin": "*",
        "Content-Type": "application/json",
    }
    response = dict(
        statusCode=code,
        headers=response_headers,
        body=json.dumps(data),
        isBase64Encoded=False,
    )
    return response

'''
following functions are for string manipulation
'''

# format text output by removing excessive characters
def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
    '''
    Remove every occurrence of the given substrings from a text.

    Parameters
    ----------
    text: str
        text to be cleaned
    remove_char_ls: iterable of str
        substrings to delete, applied in order. The default removes two
        escaped-newline patterns (written with a literal backslash followed
        by "n") and then real newline characters. The default is an
        immutable tuple so it can never be altered between calls.

    Returns
    -------
    text: str
        text with all listed substrings removed
    '''
    for unwanted in remove_char_ls:
        text = text.replace(unwanted, "")

    return text

# function to remove non-ASCII characters and re-join words hyphenated across line breaks
def remove_symbols(text):
    '''
    Drop non-ASCII characters and undo hyphenated line breaks.

    Parameters
    ----------
    text: str
        text to be cleaned

    Returns
    -------
    text: str
        text containing only ASCII characters, in which every hyphen that is
        immediately followed by a newline has been removed together with
        that newline
    '''
    # characters that cannot be encoded as ASCII are silently discarded
    ascii_bytes = text.encode("ascii", "ignore")
    ascii_text = ascii_bytes.decode()
    # glue back words that were split with a hyphen at the end of a line
    return ascii_text.replace('-\n', '')

def str_to_tuple(s):
    '''
    Turn a string such as "(a,b)" into a tuple of its comma-separated parts.

    All parentheses are removed and the remainder is split on commas; the
    parts are returned as strings exactly as split (whitespace is kept).
    '''
    without_parens = s
    for bracket in "()":
        without_parens = without_parens.replace(bracket, "")
    parts = without_parens.split(",")
    return tuple(parts)

'''
following functions are for dynamodb data manipulation
'''
# convert dynamodb map to python dictionary
def _db_number_to_py(v):
    '''Convert a DynamoDB "N" payload to int when integer-valued, otherwise to float.'''
    as_float = float(v)
    if as_float % 1 != 0:
        return as_float
    try:
        # converting the payload itself keeps full precision for large integers
        return int(v)
    except ValueError:
        # integer-valued text that int() cannot parse directly, e.g. "1.0" or "1e3"
        return int(as_float)


def _py_value_to_db_attr(value):
    '''Return the DynamoDB attribute dict for a python value, or None when the type is unsupported.'''
    # NOTE(review): numbers are stored as-is under "N" and bytes under "BS", exactly as
    # before. The DynamoDB low-level API documents "N" payloads as strings and a single
    # binary value under "B" -- confirm with callers before changing these shapes.
    if isinstance(value, str):
        return {"S": value}
    # bool must be tested before int because bool is a subclass of int
    if isinstance(value, bool):
        return {"BOOL": value}
    if isinstance(value, (int, float)):
        return {"N": value}
    if isinstance(value, dict):
        return {"M": py_dict_to_db_map(value)}
    if isinstance(value, (list, tuple)):
        return {"L": py_list_to_db_list(value)}
    if isinstance(value, bytes):
        return {"BS": value}
    if value is None:
        return {"NULL": True}
    return None


def db_map_to_py_dict(db_map):
    '''
    Convert a DynamoDB map into a python dictionary.

    Parameters
    ----------
    db_map: dict
        mapping of attribute name to a ``{type_tag: payload}`` dictionary

    Returns
    -------
    py_dict: dict
        "M" and "L" payloads are converted recursively, "N" payloads become
        int (when integer-valued) or float, "NULL" becomes None, and every
        other tag keeps its payload unchanged
    '''
    py_dict = {}
    for key, attr in db_map.items():
        for type_tag, payload in attr.items():
            if type_tag == "M":
                py_dict[key] = db_map_to_py_dict(payload)
            elif type_tag == "N":
                py_dict[key] = _db_number_to_py(payload)
            elif type_tag == "L":
                py_dict[key] = db_list_to_py_list(payload)
            elif type_tag == "NULL":
                py_dict[key] = None
            else:
                # "S", "BS", "BOOL" and any other tag: payload is used as-is
                py_dict[key] = payload

    return py_dict


def py_dict_to_db_map(py_dict):
    '''
    Convert a python dictionary into a DynamoDB map.

    Parameters
    ----------
    py_dict: dict
        dictionary to convert; keys are converted with ``str``

    Returns
    -------
    db_map: dict
        mapping of key to ``{type_tag: payload}``. Values of an unsupported
        type are left out of the result.
    '''
    db_map = {}
    for key, value in py_dict.items():
        attr = _py_value_to_db_attr(value)
        if attr is not None:
            db_map[str(key)] = attr
    return db_map


def db_list_to_py_list(db_list):
    '''
    Convert a DynamoDB list into a python list.

    Parameters
    ----------
    db_list: list
        list of ``{type_tag: payload}`` dictionaries

    Returns
    -------
    py_list: list
        converted values, using the same rules as ``db_map_to_py_dict`` for
        "M", "L", "N" and "NULL"; scalar and set tags keep their payload
    '''
    py_list = []
    for attr in db_list:
        for type_tag, payload in attr.items():
            if type_tag == "M":
                py_list.append(db_map_to_py_dict(payload))
            elif type_tag == "L":
                py_list.append(db_list_to_py_list(payload))
            elif type_tag == "N":
                py_list.append(_db_number_to_py(payload))
            elif type_tag == "NULL":
                py_list.append(None)
            elif type_tag in ("S", "B", "BOOL", "SS", "NS", "BS"):
                py_list.append(payload)
            else:
                # unknown tag: the payload is treated as a nested DynamoDB map
                py_list.append(db_map_to_py_dict(payload))

    return py_list


def py_list_to_db_list(py_list):
    '''
    Convert a python list (or tuple) into a DynamoDB list.

    Parameters
    ----------
    py_list: list or tuple
        values to convert

    Returns
    -------
    db_list: list
        one ``{type_tag: payload}`` dictionary per input value

    Raises
    ------
    TypeError
        if a value has an unsupported type; skipping it would silently shift
        the positions of the remaining elements
    '''
    db_list = []
    for value in py_list:
        attr = _py_value_to_db_attr(value)
        if attr is None:
            raise TypeError(
                f"unsupported type for DynamoDB list element: {type(value).__name__}"
            )
        db_list.append(attr)

    return db_list

'''
following functions are used for business logic. (to be moved to business logic layer)
'''

# function to calculate the estimated cost of the translation
def est_cost(n_tokens, rate: float = 0.004):
    '''
    Estimate the cost of processing a number of tokens.

    Parameters
    ----------
    n_tokens: int
        number of tokens
    rate: float
        price per 1000 tokens

    Returns
    -------
    cost: float
        ``rate * n_tokens / 1000`` rounded to 4 decimal places
    '''
    cost = rate * n_tokens / 1000
    return round(cost, 4)

def create_md_tables(devices):
    '''
    Build a markdown table listing the given devices.

    Parameters
    ----------
    devices: list
        list of devices; each one is indexed by 'device_name',
        'device_type' and 'intended_use'

    Returns
    -------
    md_text: str
        the table header followed by one row per device
    '''
    header = "| Device name | Device Type | Intended Use | \n| --- | --- | --- | \n"
    rows = [
        f"| {device['device_name']} | {device['device_type']} | {device['intended_use']} | \n"
        for device in devices
    ]
    return header + "".join(rows)