"""Helpers for PDF text extraction, response formatting, string cleanup and
DynamoDB attribute-value conversion."""

import json
import os

import PyPDF2

from application import *


# ---------------------------------------------------------------------------
# File manipulation
# ---------------------------------------------------------------------------

def read_pdf(file_path):
    """Extract the text of a PDF and save it next to the PDF as a ``.txt`` file.

    Args:
        file_path: Either a path accepted by ``open`` or an object exposing the
            path through a ``.name`` attribute (used when ``open`` rejects the
            object itself with ``TypeError``).

    Returns:
        A ``(text, metadata)`` tuple: the cleaned text of all pages (see
        ``remove_symbols``) and the reader's ``metadata`` attribute.
    """
    try:
        filename = file_path
        pdf_file = open(file_path, 'rb')
    except TypeError:
        # Not a path-like object; fall back to its ``name`` attribute.
        filename = file_path.name
        pdf_file = open(filename, 'rb')

    # The context manager closes the handle even if parsing fails.
    with pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        text = ''.join(page.extract_text() for page in reader.pages)
        # Read the metadata while the stream is still open; the reader may
        # need to resolve objects from the underlying file.
        metadata = reader.metadata

    text = remove_symbols(text)

    # Strip only the final extension. Splitting on the first "." broke paths
    # such as "./doc.pdf" (-> ".txt") or "report.v2.pdf" (-> "report.txt").
    txt_path = os.path.splitext(os.fspath(filename))[0] + ".txt"
    with open(txt_path, "w") as f:
        f.write(text)

    return text, metadata


# ---------------------------------------------------------------------------
# Standard response formatting
# ---------------------------------------------------------------------------

def format_response(code, data):
    """Build the standard response dict for a status code and a payload.

    Args:
        code: Value stored under ``"statusCode"``.
        data: JSON-serialisable payload; stored under ``"body"`` as a JSON
            string.

    Returns:
        A dict with ``statusCode``, ``headers``, ``body`` and
        ``isBase64Encoded`` keys.
    """
    return {
        "statusCode": code,
        "headers": {
            "Access-Control-Allow-Origin": "*",
            "Content-Type": "application/json",
        },
        "body": json.dumps(data),
        "isBase64Encoded": False,
    }


# ---------------------------------------------------------------------------
# String manipulation
# ---------------------------------------------------------------------------

def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
    """Return ``text`` with every substring in ``remove_char_ls`` removed.

    The substrings are removed in order. The defaults cover the literal
    (escaped) sequences ``\\n--\\n`` and ``\\n\\n`` as well as real newlines.
    The default is a tuple so that it cannot be mutated between calls.
    """
    for chars in remove_char_ls:
        text = text.replace(chars, "")
    return text


def remove_symbols(text):
    """Drop non-ASCII characters and re-join words hyphenated across lines."""
    # Characters that cannot be encoded as ASCII are silently discarded.
    text = text.encode("ascii", "ignore").decode()
    # A hyphen directly followed by a newline marks a word broken across lines.
    text = text.replace('-\n', '')
    return text


def str_to_tuple(s):
    """Turn a string such as ``"(a,b)"`` into a tuple of its comma-separated parts.

    All parentheses are removed first; the parts are returned as strings and
    are not stripped of whitespace.
    """
    return tuple(s.replace("(", "").replace(")", "").split(","))


# ---------------------------------------------------------------------------
# DynamoDB data manipulation
# ---------------------------------------------------------------------------

def _db_number_to_py(value):
    """Convert a DynamoDB ``N`` value to ``int`` when integral, else ``float``."""
    number = float(value)
    if not number.is_integer():
        return number
    try:
        # Converting the original value keeps full precision for big integers.
        return int(value)
    except ValueError:
        # Integral but not in plain integer notation, e.g. "1.0" or "1e3".
        return int(number)


def _db_attr_to_py(type_tag, value):
    """Convert a single DynamoDB ``{type_tag: value}`` entry to a Python value."""
    if type_tag == "M":
        return db_map_to_py_dict(value)
    if type_tag == "L":
        return db_list_to_py_list(value)
    if type_tag == "N":
        return _db_number_to_py(value)
    # "S" and every other type tag are passed through unchanged.
    return value


def _py_value_to_db_attr(value):
    """Wrap a Python value in a DynamoDB type descriptor.

    Returns ``None`` for unsupported types. Exact type checks are used on
    purpose, so ``bool`` (a subclass of ``int``) counts as unsupported.
    """
    value_type = type(value)
    if value_type is str:
        return {"S": value}
    if value_type is int or value_type is float:
        # NOTE(review): the number is stored as-is, not as a string - confirm
        # this matches what the consumer of these maps expects.
        return {"N": value}
    if value_type is dict:
        return {"M": py_dict_to_db_map(value)}
    if value_type is list:
        return {"L": py_list_to_db_list(value)}
    return None


def db_map_to_py_dict(db_map):
    """Convert a DynamoDB map (``{key: {type_tag: value}}``) to a plain dict.

    ``M`` and ``L`` values are converted recursively, ``N`` values become
    ``int`` or ``float``; anything else is copied unchanged.
    """
    py_dict = {}
    for key, attr in db_map.items():
        for type_tag, value in attr.items():
            py_dict[key] = _db_attr_to_py(type_tag, value)
    return py_dict


def py_dict_to_db_map(py_dict):
    """Convert a plain dict to a DynamoDB map.

    Keys are converted to ``str``. Values that are not exactly ``str``,
    ``int``, ``float``, ``dict`` or ``list`` are skipped.
    """
    db_map = {}
    for key, value in py_dict.items():
        attr = _py_value_to_db_attr(value)
        if attr is not None:
            db_map[str(key)] = attr
    return db_map


def db_list_to_py_list(db_list):
    """Convert a DynamoDB list (``[{type_tag: value}, ...]``) to a plain list.

    Elements are converted with the same rules as ``db_map_to_py_dict``.
    """
    py_list = []
    for attr in db_list:
        for type_tag, value in attr.items():
            py_list.append(_db_attr_to_py(type_tag, value))
    return py_list


def py_list_to_db_list(py_list):
    """Convert a plain list to a DynamoDB list.

    Elements that are not exactly ``str``, ``int``, ``float``, ``dict`` or
    ``list`` are skipped, mirroring ``py_dict_to_db_map``.
    """
    # The previous check ``type(value) is int or float`` was always true, so
    # nested dicts and lists were emitted as {"N": ...} instead of "M" / "L".
    db_list = []
    for value in py_list:
        attr = _py_value_to_db_attr(value)
        if attr is not None:
            db_list.append(attr)
    return db_list


# ---------------------------------------------------------------------------
# Business logic (to be moved to the business logic layer)
# ---------------------------------------------------------------------------

def est_cost(n_tokens, rate):
    """Return the estimated cost of ``n_tokens`` at ``rate`` per 1000 tokens.

    The result is rounded to four decimal places.
    """
    return round(rate * n_tokens / 1000, 4)