import json
from application import *
import tiktoken
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Encoder shared by every count_tokens() call; built once at import time.
token_encoder = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Count the number of tokens in ``text``.

    Parameters
    ----------
    text: str
        text to be counted

    Returns
    -------
    n_tokens: int
        number of tokens produced by the module-level ``token_encoder``
    """
    return len(token_encoder.encode(text))


# ---------------------------------------------------------------------------
# following functions are for file manipulation
# ---------------------------------------------------------------------------

def keyword_search(kw, text):
    """Return True when ``kw`` occurs in ``text``, ignoring case."""
    return kw.lower() in text.lower()


def list_or(l):
    """Return True when the items of ``l`` sum to more than zero.

    Meant for lists of booleans, where this means "at least one is True".
    """
    return sum(l) > 0


def list_and(l):
    """Return True when the items of ``l`` sum to ``len(l)``.

    Meant for lists of booleans, where this means "all are True"
    (an empty list yields True).
    """
    return sum(l) == len(l)


# read pdf file and return text
def read_pdf(file_path):
    """Read a PDF file and return its text and document info.

    Parameters
    ----------
    file_path: str, os.PathLike or file wrapper
        Either a path to the PDF, or an object exposing the path through a
        ``name`` attribute (e.g. a temporary-file wrapper).

    Returns
    -------
    text: str
        Text extracted by pdfminer, passed through ``remove_symbols``
        (defined elsewhere in this file).
    meta: object
        The ``info`` attribute of the parsed ``PDFDocument``.
    """
    import os

    # Paths are opened as given; anything else is expected to carry the
    # on-disk path in ``.name``. Path objects must be caught here because
    # their ``.name`` is only the final path component.
    if isinstance(file_path, (str, os.PathLike)):
        path = file_path
    else:
        path = file_path.name

    # The context manager guarantees the handle is released even when
    # pdfminer raises on a malformed document.
    with open(path, 'rb') as pdf_file:
        text = extract_text(pdf_file)
        text = remove_symbols(text)

        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        meta = doc.info

    return text, meta


# ---------------------------------------------------------------------------
# following functions are for format standard response
# ---------------------------------------------------------------------------

# format standard response for status code and data
def format_response(code, data):
    """Build the standard response dictionary.

    Parameters
    ----------
    code: int
        value stored under ``statusCode``
    data: object
        JSON-serialisable payload; stored under ``body`` as a JSON string

    Returns
    -------
    response: dict
        dictionary with ``statusCode``, permissive CORS / JSON ``headers``,
        the serialised ``body`` and ``isBase64Encoded`` set to False
    """
    return {
        "statusCode": code,
        "headers": {
            "Access-Control-Allow-Origin": "*",
            "Content-Type": "application/json",
        },
        "body": json.dumps(data),
        "isBase64Encoded": False,
    }


# ---------------------------------------------------------------------------
# following functions are for string manipulation
# ---------------------------------------------------------------------------

# format text output by removing excessive
# characters
def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
    """Remove every occurrence of the given substrings from ``text``.

    Parameters
    ----------
    text: str
        text to clean
    remove_char_ls: iterable of str
        substrings to delete, applied in order. The defaults are the literal
        backslash-n sequences ``\\n--\\n`` and ``\\n\\n`` followed by the
        real newline character.

    Returns
    -------
    text: str
        the cleaned text
    """
    for c in remove_char_ls:
        text = text.replace(c, "")
    return text


# function to remove symbols that are not in unicode
def remove_symbols(text):
    """Drop non-ASCII characters and re-join words hyphenated across lines.

    Parameters
    ----------
    text: str
        text to clean

    Returns
    -------
    text: str
        ASCII-only text with every ``-`` + newline pair removed
    """
    # characters that cannot be encoded as ASCII are silently discarded
    ascii_text = text.encode("ascii", "ignore").decode()
    # remove the break-word hyphen together with its newline
    return ascii_text.replace('-\n', '')


def str_to_tuple(s):
    """Turn a string such as ``"(a,b)"`` into ``("a", "b")``.

    All parentheses are removed and the remainder is split on commas;
    the pieces stay strings and keep any surrounding whitespace.
    """
    return tuple(s.replace("(", "").replace(")", "").split(","))


# ---------------------------------------------------------------------------
# following functions are for dynamodb data manipulation
# ---------------------------------------------------------------------------

def _db_number_to_py(value):
    """Convert a DynamoDB ``N`` value to ``int`` when integral, else ``float``."""
    if isinstance(value, int):
        return int(value)
    if isinstance(value, str):
        try:
            # Fast, lossless path for plain integer strings of any size.
            return int(value)
        except ValueError:
            pass  # e.g. "1.0", "2.5", "1e3" -- handled through float below
    number = float(value)
    return int(number) if number.is_integer() else number


def _db_value_to_py(type_tag, value):
    """Decode one DynamoDB attribute value given its type tag."""
    if type_tag == "M":
        return db_map_to_py_dict(value)
    if type_tag == "L":
        return db_list_to_py_list(value)
    if type_tag == "N":
        return _db_number_to_py(value)
    if type_tag == "NULL":
        return None
    # S, B, BOOL, SS, NS, BS and unknown tags are passed through unchanged.
    return value


# convert dynamodb map to python dictionary
def db_map_to_py_dict(db_map):
    """Convert a DynamoDB-typed map into a plain Python dictionary.

    Parameters
    ----------
    db_map: dict
        mapping of attribute name to ``{type_tag: value}``

    Returns
    -------
    py_dict: dict
        Decoded mapping. ``M`` and ``L`` are decoded recursively, ``N``
        becomes ``int`` or ``float``, ``NULL`` becomes ``None``, and every
        other tag keeps its value as is.
    """
    py_dict = {}
    for key, typed_value in db_map.items():
        for type_tag, value in typed_value.items():
            py_dict[key] = _db_value_to_py(type_tag, value)
    return py_dict


# convert python dictionary to dynamodb map
def py_dict_to_db_map(py_dict):
    """Convert a Python dictionary into a DynamoDB-typed map.

    Parameters
    ----------
    py_dict: dict
        dictionary to encode; keys are converted with ``str``

    Returns
    -------
    db_map: dict
        Mapping of ``str(key)`` to ``{type_tag: value}``. Numbers are
        emitted as strings under ``N`` and a single ``bytes`` value under
        ``B``, as the DynamoDB attribute-value format requires. Values of
        unsupported types are skipped.
    """
    db_map = {}
    for key, value in py_dict.items():
        key = str(key)
        # bool must be tested before int because bool is a subclass of int
        if isinstance(value, bool):
            db_map[key] = {"BOOL": value}
        elif isinstance(value, str):
            db_map[key] = {"S": value}
        elif isinstance(value, (int, float)):
            db_map[key] = {"N": str(value)}
        elif isinstance(value, dict):
            db_map[key] = {"M": py_dict_to_db_map(value)}
        elif isinstance(value, (list, tuple)):
            db_map[key] = {"L": py_list_to_db_list(value)}
        elif isinstance(value, bytes):
            db_map[key] = {"B": value}
        elif value is None:
            db_map[key] = {"NULL": True}
    return db_map


# convert dynamodb list to python list
def db_list_to_py_list(db_list):
    """Convert a DynamoDB-typed list into a plain Python list.

    Parameters
    ----------
    db_list: list of dict
        list of ``{type_tag: value}`` elements

    Returns
    -------
    py_list: list
        decoded values, using the same rules as ``db_map_to_py_dict``
    """
    return [
        _db_value_to_py(type_tag, value)
        for element in db_list
        for type_tag, value in element.items()
    ]


# convert python
# list to dynamodb list
def py_list_to_db_list(py_list):
    """Convert a Python list (or tuple) into a DynamoDB-typed list.

    Parameters
    ----------
    py_list: list or tuple
        values to encode

    Returns
    -------
    db_list: list of dict
        One ``{type_tag: value}`` element per input value. Numbers are
        emitted as strings under ``N`` and a single ``bytes`` value under
        ``B``, as the DynamoDB attribute-value format requires.

    Raises
    ------
    TypeError
        If a value has no DynamoDB representation here. Previously such a
        value either raised ``UnboundLocalError`` or silently duplicated
        the preceding element.
    """
    db_list = []
    for value in py_list:
        # bool must be tested before int because bool is a subclass of int
        if isinstance(value, bool):
            item = {"BOOL": value}
        elif isinstance(value, str):
            item = {"S": value}
        elif isinstance(value, (int, float)):
            item = {"N": str(value)}
        elif isinstance(value, dict):
            item = {"M": py_dict_to_db_map(value)}
        elif isinstance(value, (list, tuple)):
            item = {"L": py_list_to_db_list(value)}
        elif isinstance(value, bytes):
            item = {"B": value}
        elif value is None:
            item = {"NULL": True}
        else:
            raise TypeError(
                f"cannot convert value of type {type(value).__name__} "
                "to a DynamoDB attribute"
            )
        db_list.append(item)
    return db_list


# ---------------------------------------------------------------------------
# following functions are used for business logic.
# (to be moved to business logic layer)
# ---------------------------------------------------------------------------

# function to calculate the estimated cost of the translation
def est_cost(n_tokens, rate: float = 0.004):
    """Estimate the cost for a number of tokens.

    Parameters
    ----------
    n_tokens: int
        number of tokens
    rate: float
        price per 1000 tokens

    Returns
    -------
    cost: float
        ``rate * n_tokens / 1000`` rounded to 4 decimal places
    """
    return round(rate * n_tokens / 1000, 4)


def create_md_tables(devices):
    """Create a markdown table listing the given devices.

    Parameters
    ----------
    devices: list
        list of dictionaries, each providing the keys ``device_name``,
        ``device_type`` and ``intended_use``

    Returns
    -------
    md_text: str
        header row, separator row and one row per device
    """
    header = "| Device name | Device Type | Intended Use | \n| --- | --- | --- | \n"
    rows = [
        f"| {device['device_name']} | {device['device_type']} | {device['intended_use']} | \n"
        for device in devices
    ]
    return header + "".join(rows)