| | import json |
| | import os |
| | |
| | import pandas as pd |
| |
|
| | from application import * |
| |
|
| | ''' |
| | following functions are for file manipulation |
| | ''' |
| |
|
| | |
def read_pdf(file_path):
    """Extract the text of a PDF, save it beside the PDF as ``.txt``, return it.

    Args:
        file_path: A value ``open()`` accepts as a path, or an object that
            ``open()`` rejects with ``TypeError`` and that exposes the real
            path through its ``name`` attribute.

    Returns:
        A ``(text, metadata)`` tuple: the concatenated text of every page
        after it has been passed through ``remove_symbols``, and the value of
        the reader's ``metadata`` attribute.
    """
    try:
        filename = file_path
        pdf_file = open(file_path, 'rb')
    except TypeError:
        # open() could not use the argument as a path: fall back to the path
        # carried by the object itself.
        filename = file_path.name
        pdf_file = open(filename, 'rb')

    # The context manager closes the handle even when parsing or text
    # extraction raises; a trailing close() call would be skipped in that case.
    with pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ''.join(page.extract_text() for page in pdf_reader.pages)
        # Fetch the metadata while the stream is still open, in case the
        # reader needs to go back to the file to resolve it.
        metadata = pdf_reader.metadata

    text = remove_symbols(text)

    # splitext drops only the final extension, so dots in directory names or
    # in the file stem ("./docs/v1.2.pdf") no longer truncate the output path.
    with open(f"{os.path.splitext(filename)[0]}.txt", "w") as f:
        f.write(text)

    return text, metadata
| |
|
| | ''' |
| | following functions are for formatting standard responses |
| | ''' |
| |
|
| | |
def format_response(code, data):
    """Wrap ``data`` in the standard response dictionary.

    Args:
        code: Value stored under ``"statusCode"``.
        data: JSON-serialisable payload; it is dumped to a string and stored
            under ``"body"``.

    Returns:
        A dict with ``statusCode``, fixed ``headers`` (wildcard
        ``Access-Control-Allow-Origin`` and a JSON content type), the
        serialised ``body`` and ``isBase64Encoded`` set to ``False``.
    """
    headers = {
        "Access-Control-Allow-Origin": "*",
        "Content-Type": "application/json",
    }
    response = dict(statusCode=code, headers=headers)
    response["body"] = json.dumps(data)
    response["isBase64Encoded"] = False
    return response
| |
|
| | ''' |
| | following functions are for string manipulation |
| | ''' |
| |
|
| | |
def format_text(text, remove_char_ls=("\\n--\\n", "\\n\\n", "\n")):
    """Delete every occurrence of each listed substring from ``text``.

    Args:
        text: The string to clean.
        remove_char_ls: Substrings to delete, applied in the given order.
            The default removes two escaped sequences written with a literal
            backslash followed by ``n``, and then real newline characters.
            The default is a tuple so it cannot be mutated between calls.

    Returns:
        ``text`` with all of the substrings removed.
    """
    for fragment in remove_char_ls:
        text = text.replace(fragment, "")

    return text
| |
|
| | |
def remove_symbols(text):
    """Drop non-ASCII characters and re-join words hyphenated across lines.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that cannot be encoded as ASCII
        discarded, and every hyphen that is immediately followed by a newline
        removed together with that newline.
    """
    ascii_bytes = text.encode("ascii", errors="ignore")
    ascii_text = ascii_bytes.decode("ascii")
    return ascii_text.replace('-\n', '')
| |
|
def str_to_tuple(s):
    """Split a parenthesised, comma-separated string into a tuple of strings.

    All ``(`` and ``)`` characters are removed, then the remainder is split
    on commas. Elements are not stripped or converted, so surrounding
    whitespace is preserved and an empty input yields ``("",)``.
    """
    without_parens = s.translate(str.maketrans("", "", "()"))
    return tuple(without_parens.split(","))
| |
|
| | ''' |
| | following functions are for DynamoDB data manipulation |
| | ''' |
| | |
def _db_number_to_py(raw):
    """Convert an ``N`` payload to ``int`` when it is integral, else ``float``."""
    number = float(raw)
    if not number.is_integer():
        return number
    try:
        # Converting the raw value keeps full precision for large integers.
        return int(raw)
    except ValueError:
        # Integral values spelled like "1.0" or "3e2" are rejected by int()
        # when given as strings, so convert the parsed float instead.
        return int(number)


def db_map_to_py_dict(db_map):
    """Convert a type-tagged DynamoDB map into a plain Python dict.

    Args:
        db_map: Mapping of attribute name to a ``{type_tag: value}`` dict.

    Returns:
        A dict keyed by attribute name. ``"M"`` values are converted
        recursively, ``"L"`` values go through ``db_list_to_py_list``, ``"N"``
        values become ``int`` when integral and ``float`` otherwise, and
        ``"S"`` and every other tag keep their value unchanged.

    Raises:
        ValueError: If an ``"N"`` value cannot be parsed as a number.
    """
    py_dict = {}
    for key, typed_value in db_map.items():
        for type_tag, value in typed_value.items():
            if type_tag == "M":
                py_dict[key] = db_map_to_py_dict(value)
            elif type_tag == "N":
                py_dict[key] = _db_number_to_py(value)
            elif type_tag == "L":
                py_dict[key] = db_list_to_py_list(value)
            else:
                # "S" and any tag without special handling pass through.
                py_dict[key] = value

    return py_dict
| |
|
def db_map_to_pd_dataframe(db_map):
    """Convert a type-tagged DynamoDB map into a pandas ``DataFrame``.

    The map is first turned into a plain dict with ``db_map_to_py_dict`` and
    that dict is handed directly to the ``pd.DataFrame`` constructor.
    """
    return pd.DataFrame(db_map_to_py_dict(db_map))
| |
|
| | |
def py_dict_to_db_map(py_dict):
    """Convert a plain Python dict into a type-tagged DynamoDB map.

    Args:
        py_dict: Dict to convert. Keys are stringified with ``str()``.

    Returns:
        A dict mapping each stringified key to ``{"S": ...}`` for ``str``,
        ``{"N": ...}`` for ``int``/``float``, ``{"M": ...}`` for ``dict``
        (converted recursively) or ``{"L": ...}`` for ``list`` (converted by
        ``py_list_to_db_list``). Types are matched exactly, so values of any
        other type, including ``bool`` and ``None``, are omitted.
    """
    # NOTE(review): numbers are stored as-is under "N", not as strings;
    # confirm that the consumer of this map accepts that.
    db_map = {}
    for raw_key, value in py_dict.items():
        key = str(raw_key)
        value_type = type(value)
        if value_type is str:
            tagged = {"S": value}
        elif value_type is int or value_type is float:
            tagged = {"N": value}
        elif value_type is dict:
            tagged = {"M": py_dict_to_db_map(value)}
        elif value_type is list:
            tagged = {"L": py_list_to_db_list(value)}
        else:
            # Unsupported types are left out of the result.
            continue
        db_map[key] = tagged

    return db_map
| |
|
| | |
def db_list_to_py_list(db_list):
    """Convert a type-tagged DynamoDB list into a plain Python list.

    Args:
        db_list: Iterable of ``{type_tag: value}`` dicts.

    Returns:
        A list with one element per tagged value. ``"M"`` values go through
        ``db_map_to_py_dict``, ``"L"`` values are converted recursively,
        ``"N"`` values become ``int`` when integral and ``float`` otherwise
        (the same rule ``db_map_to_py_dict`` applies to numbers), and every
        other tag keeps its value unchanged.

    Raises:
        ValueError: If an ``"N"`` value cannot be parsed as a number.
    """
    py_list = []
    for tagged in db_list:
        for type_tag, value in tagged.items():
            if type_tag == "M":
                py_list.append(db_map_to_py_dict(value))
            elif type_tag == "L":
                py_list.append(db_list_to_py_list(value))
            elif type_tag == "N":
                number = float(value)
                if number.is_integer():
                    try:
                        # Converting the raw value keeps large integers exact.
                        number = int(value)
                    except ValueError:
                        # Strings such as "1.0" are integral but not valid
                        # int() literals.
                        number = int(number)
                py_list.append(number)
            else:
                py_list.append(value)

    return py_list
| |
|
| | |
def py_list_to_db_list(py_list):
    """Convert a plain Python list into a type-tagged DynamoDB list.

    Args:
        py_list: Iterable of values to convert.

    Returns:
        A list of ``{type_tag: value}`` dicts: ``"S"`` for ``str``, ``"N"``
        for ``int``/``float``, ``"M"`` for ``dict`` (converted by
        ``py_dict_to_db_map``) and ``"L"`` for ``list`` (converted
        recursively). Types are matched exactly; elements of any other type,
        including ``bool`` and ``None``, are skipped, matching how
        ``py_dict_to_db_map`` drops unsupported values.
    """
    db_list = []
    for value in py_list:
        value_type = type(value)
        if value_type is str:
            item = {"S": value}
        # Both comparisons must be spelled out: ``type(value) is int or float``
        # is always truthy and would tag dicts and lists as numbers.
        elif value_type is int or value_type is float:
            item = {"N": value}
        elif value_type is dict:
            item = {"M": py_dict_to_db_map(value)}
        elif value_type is list:
            item = {"L": py_list_to_db_list(value)}
        else:
            # No tag for this type: skip it rather than append a stale or
            # unbound ``item``.
            continue

        db_list.append(item)

    return db_list
| |
|
| | ''' |
| | following functions are used for business logic. (to be moved to business logic layer) |
| | ''' |
| |
|
| | |
def est_cost(n_tokens, rate):
    """Return ``rate * n_tokens / 1000`` rounded to four decimal places.

    Args:
        n_tokens: Number of tokens.
        rate: Multiplier applied per 1000 tokens (presumably a price per
            1000 tokens; confirm with callers).
    """
    cost = rate * n_tokens / 1000
    return round(cost, 4)
| |
|