| | import json |
| |
|
| | from application import * |
| |
|
| | import tiktoken |
| | from pdfminer.high_level import extract_text |
| | from pdfminer.pdfparser import PDFParser |
| | from pdfminer.pdfdocument import PDFDocument |
| |
|
# Shared tiktoken encoder for the "cl100k_base" encoding; built once at import
# time and used by count_tokens below.
token_encoder = tiktoken.get_encoding("cl100k_base")
| |
|
def count_tokens(text):
    '''
    Count the tokens that the module-level ``token_encoder`` produces for a text.

    Parameters
    ----------
    text: str
        text to be encoded and counted

    Returns
    -------
    n_tokens: int
        length of the sequence returned by ``token_encoder.encode(text)``
    '''
    encoded = token_encoder.encode(text)
    n_tokens = len(encoded)
    return n_tokens
| |
|
| | ''' |
| | following functions are for file manipulation |
| | ''' |
| |
|
def keyword_search(kw, text):
    '''
    Tell whether ``kw`` occurs in ``text``, comparing both in lower case.
    '''
    needle = kw.lower()
    haystack = text.lower()
    return needle in haystack


def list_or(l):
    '''
    Return True when the sum of the items of ``l`` is greater than zero.
    '''
    total = sum(l)
    return total > 0


def list_and(l):
    '''
    Return True when the sum of the items of ``l`` equals the length of ``l``.
    '''
    total = sum(l)
    return total == len(l)
| |
|
| | |
def read_pdf(file_path):
    '''
    Read a PDF file and return its text together with its document info.

    Parameters
    ----------
    file_path: str, os.PathLike, or object with a ``name`` attribute
        location of the PDF. Strings and path-like objects are opened as
        given; for any other object the path is taken from ``file_path.name``
        (presumably an upload / temporary-file wrapper - confirm with callers).

    Returns
    -------
    text: str
        text extracted by pdfminer and then cleaned by ``remove_symbols``
    meta:
        the ``info`` attribute of the parsed ``PDFDocument``
    '''
    import os

    # Path-like objects also expose ``.name``, but that is only the final
    # path component, so they must be resolved with os.fspath instead.
    if isinstance(file_path, (str, os.PathLike)):
        pdf_path = os.fspath(file_path)
    else:
        pdf_path = file_path.name

    # The context manager closes the handle even when pdfminer raises.
    with open(pdf_path, 'rb') as pdf_file:
        text = remove_symbols(extract_text(pdf_file))

        # The same handle is reused to parse the document structure.
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        meta = doc.info

    return text, meta
| |
|
'''
The following functions format the standard response.
'''
| |
|
| | |
def format_response(code,data):
    '''
    Build the standard response dictionary.

    Parameters
    ----------
    code:
        value stored under "statusCode"
    data:
        JSON-serializable payload; stored under "body" as a JSON string

    Returns
    -------
    response: dict
        dictionary with the keys "statusCode", "headers", "body" and
        "isBase64Encoded" (always False)
    '''
    response_headers = {}
    response_headers["Access-Control-Allow-Origin"] = "*"
    response_headers["Content-Type"] = "application/json"

    serialized_body = json.dumps(data)

    response = {"statusCode": code}
    response["headers"] = response_headers
    response["body"] = serialized_body
    response["isBase64Encoded"] = False
    return response
| |
|
| | ''' |
| | following functions are for string manipulation |
| | ''' |
| |
|
| | |
def format_text(text,remove_char_ls = ("\\n--\\n","\\n\\n","\n")):
    '''
    Delete every occurrence of the given substrings from a text.

    Parameters
    ----------
    text: str
        text to clean
    remove_char_ls: iterable of str
        substrings to delete, applied one after another in the given order.
        The default removes two escaped sequences written with a literal
        backslash followed by "n" (not a real line break), and then every
        real newline character. The default is an immutable tuple so it
        cannot be altered between calls.

    Returns
    -------
    text: str
        the text with all listed substrings removed
    '''
    for fragment in remove_char_ls:
        text = text.replace(fragment, "")

    return text
| |
|
| | |
def remove_symbols(text):
    '''
    Strip non-ASCII characters and hyphen + newline pairs from a text.

    Parameters
    ----------
    text: str
        text to clean

    Returns
    -------
    text: str
        the text without characters that cannot be encoded as ASCII and
        without any "-" that is immediately followed by a newline (the
        newline is removed as well)
    '''
    ascii_only = text.encode("ascii", "ignore").decode()
    # splitting on the pair and gluing the pieces back removes every occurrence
    pieces = ascii_only.split('-\n')
    return ''.join(pieces)
| |
|
def str_to_tuple(s):
    '''
    Split a string on commas into a tuple after deleting all parentheses.

    The pieces are returned as strings exactly as split; whitespace is kept.
    '''
    drop_parentheses = {ord("("): None, ord(")"): None}
    bare = s.translate(drop_parentheses)
    parts = bare.split(",")
    return tuple(parts)
| |
|
'''
The following functions are for DynamoDB data manipulation.
'''
| | |
def _db_number_to_py(raw):
    '''
    Convert the payload of an "N" descriptor to a Python number.

    Whole numbers become ``int`` and everything else becomes ``float``.
    '''
    number = float(raw)
    if not number.is_integer():
        return number
    try:
        # exact for ints and for plain integer strings of any size
        return int(raw)
    except ValueError:
        # int() rejects whole numbers spelled like "1.0" or "1e3"
        return int(number)


def _db_value_to_py(tag, value):
    '''
    Decode one typed value given its type descriptor ``tag``.
    '''
    if tag == "M":
        return db_map_to_py_dict(value)
    if tag == "L":
        return db_list_to_py_list(value)
    if tag == "N":
        return _db_number_to_py(value)
    if tag == "NULL":
        return None
    # "S", "B", "BOOL", "SS", "NS", "BS" and any unrecognized descriptor
    # are handed back untouched.
    return value


def _py_value_to_db(value):
    '''
    Wrap one Python value in its type descriptor.

    Returns None when the value has no supported representation.
    '''
    # NOTE(review): numbers are emitted as Python numbers and bytes under the
    # "BS" descriptor, exactly as before. DynamoDB's low-level AttributeValue
    # format documents "N" as a string and a single binary value as "B" -
    # confirm what the consumers of these maps expect before changing either.

    # bool must be tested before int because bool is a subclass of int
    if isinstance(value, bool):
        return {"BOOL": value}
    if isinstance(value, str):
        return {"S": value}
    if isinstance(value, (int, float)):
        return {"N": value}
    if isinstance(value, dict):
        return {"M": py_dict_to_db_map(value)}
    if isinstance(value, (list, tuple)):
        return {"L": py_list_to_db_list(value)}
    if isinstance(value, bytes):
        return {"BS": value}
    if value is None:
        return {"NULL": True}
    return None


def db_map_to_py_dict(db_map):
    '''
    Convert a type-annotated map into a plain Python dictionary.

    Parameters
    ----------
    db_map: dict
        mapping of attribute name to a ``{descriptor: value}`` dictionary

    Returns
    -------
    py_dict: dict
        "M" and "L" values are converted recursively, "N" becomes ``int`` or
        ``float``, "NULL" becomes ``None``; every other value is kept as is
    '''
    py_dict = {}
    for key, typed_value in db_map.items():
        for tag, value in typed_value.items():
            py_dict[key] = _db_value_to_py(tag, value)

    return py_dict


def py_dict_to_db_map(py_dict):
    '''
    Convert a plain Python dictionary into a type-annotated map.

    Parameters
    ----------
    py_dict: dict
        dictionary to convert; keys are converted with ``str``

    Returns
    -------
    db_map: dict
        mapping of key to ``{descriptor: value}``. Supported values are str,
        bool, int, float, dict, list, tuple, bytes and None; entries with any
        other value type are left out, as before.
    '''
    db_map = {}
    for key, value in py_dict.items():
        typed_value = _py_value_to_db(value)
        if typed_value is not None:
            db_map[str(key)] = typed_value
    return db_map


def db_list_to_py_list(db_list):
    '''
    Convert a list of type-annotated values into a plain Python list.

    Parameters
    ----------
    db_list: list
        list of ``{descriptor: value}`` dictionaries

    Returns
    -------
    py_list: list
        decoded values, using the same rules as ``db_map_to_py_dict``
    '''
    py_list = []
    for typed_value in db_list:
        for tag, value in typed_value.items():
            py_list.append(_db_value_to_py(tag, value))

    return py_list


def py_list_to_db_list(py_list):
    '''
    Convert a Python list (or tuple) into a list of type-annotated values.

    Parameters
    ----------
    py_list: list or tuple
        values to convert

    Returns
    -------
    db_list: list
        one ``{descriptor: value}`` dictionary per input value

    Raises
    ------
    TypeError
        if a value has no supported representation. Such a value used to
        repeat the previous item, or fail on an unbound local when it came
        first.
    '''
    db_list = []
    for value in py_list:
        typed_value = _py_value_to_db(value)
        if typed_value is None:
            raise TypeError(
                f"unsupported value type for list conversion: {type(value).__name__}"
            )
        db_list.append(typed_value)

    return db_list
| |
|
| | ''' |
| | following functions are used for business logic. (to be moved to business logic layer) |
| | ''' |
| |
|
| | |
def est_cost(n_tokens,rate:float=0.004):
    '''
    Estimate the cost of a number of tokens.

    Parameters
    ----------
    n_tokens: int
        number of tokens
    rate: float
        price per 1000 tokens

    Returns
    -------
    cost: float
        ``rate * n_tokens / 1000`` rounded to 4 decimal places
    '''
    cost = rate * n_tokens / 1000
    return round(cost, 4)
| |
|
def create_md_tables(devices):
    '''
    Render a list of devices as one markdown table.

    Parameters
    ----------
    devices: list
        list of mappings, each providing the keys 'device_name',
        'device_type' and 'intended_use'

    Returns
    -------
    md_text: str
        header row, separator row, then one table row per device
    '''
    header = "| Device name | Device Type | Intended Use | \n| --- | --- | --- | \n"
    rows = [
        f"| {device['device_name']} | {device['device_type']} | {device['intended_use']} | \n"
        for device in devices
    ]
    return header + "".join(rows)