Spaces:

GT4SD
/

PatentToolkit

Runtime error

App Files Files Community

thepolymerguy committed on Apr 8, 2023

Commit

8c44dfd

1 Parent(s): e6827a9

Create tridentmodel.py

Browse files

Files changed (1) hide show

tridentmodel.py +241 -0

tridentmodel.py ADDED Viewed

	@@ -0,0 +1,241 @@

+# -*- coding: utf-8 -*-
+"""TridentModel.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/1u07dSU0DoKnNzGzySXMTisXnaloqpUEO
+TRIDENT MODEL IMPLEMENTATION
+Date: 14 January 2023
+Authors:  Egheosa Ogbomo & Amran Mohammed (The Polymer Guys)
+Description: This script combines three ML-based models to identify whether an input text is related to green plastics or not.
+"""
+# NOTE: `pip install transformers` is a shell/notebook command, not Python. As a bare
+# statement it is a SyntaxError that prevents this module from being imported at all.
+# Install the dependency before running instead (e.g. list `transformers` in
+# requirements.txt, or run `pip install transformers` in a terminal).
+########## IMPORTING REQUIRED PYTHON PACKAGES ##########
+import pandas as pd
+import tensorflow as tf
+import numpy as np
+import matplotlib.pyplot as plt
+from transformers import AutoTokenizer, AutoModel
+import torch
+import math
+import time
+import csv
+import pandas as pd
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+nltk.download('punkt')
+import string
+########## DEFINING FUNCTIONS FOR MODEL IMPLEMENTATIONS ##########
+### Input data cleaner
+# Additional tokens to drop from the dataset / input abstracts; add more here as needed.
+extra_stopwords = ['ii', 'iii']
+# Full stopword list used by clean_data: NLTK's English stopwords followed by the extras above.
+all_stopwords = [*stopwords.words('english'), *extra_stopwords]
+# Characters deleted from the text after stopword removal (one C-level pass via str.translate).
+_REMOVE_CHARS_TABLE = str.maketrans('', '', '[]{};(),.:/-#?@£$')
+# Location the cleaned dataframe was historically written to; kept as the default for compatibility.
+_DEFAULT_CLEANED_CSV = 'E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv'
+def _clean_text(text):
+    """Return text with stopwords, listed punctuation, repeated words and number-bearing words removed."""
+    stopword_set = set(all_stopwords)
+    # Stopwords are matched case-sensitively and *before* lower-casing, exactly as the
+    # original pipeline did; changing the order would change the cleaned output.
+    kept_tokens = [token for token in word_tokenize(text) if token not in stopword_set]
+    stripped = ' '.join(kept_tokens).translate(_REMOVE_CHARS_TABLE)
+    # dict.fromkeys drops repeated words while preserving first-seen order.
+    unique_words = dict.fromkeys(word.lower() for word in stripped.split(' '))
+    # Any word containing at least one numeric character is discarded entirely.
+    return ' '.join(word for word in unique_words if not any(ch.isnumeric() for ch in word))
+def clean_data(input, type='Dataframe', output_path=_DEFAULT_CLEANED_CSV):
+    """
+    Prepares classification data or a single text for the text similarity model by removing:
+    •   Missing ('nan') cells and rows that contain nothing else
+    •   Entries whose cleaned description duplicates an earlier one (dataframe mode)
+    •   Stop words (NLTK English list plus extra_stopwords)
+    •   The punctuation characters [ ] { } ; ( ) , . : / - # ? @ £ $
+    •   Repeated words (first occurrence is kept) and words containing numeric characters
+    The remaining words are lower-cased.
+    :param input: Either a dataframe or an individual string
+    :param type: 'Dataframe' or 'String'; tells the function how to interpret input
+    :param output_path: (dataframe mode only) CSV file the cleaned dataframe is written to;
+        pass None to skip writing. Defaults to the historical hard-coded location.
+    :return: (if dataframe), a dataframe with columns 'Class' and 'Description', where 'Class' is the last
+        non-missing cell of each row and 'Description' is the cleaned text of the preceding cells
+    :return: (if string), a 'cleaned' version of the input string
+    :raises ValueError: if type is neither 'Dataframe' nor 'String'
+    """
+    if type == 'String':
+        return _clean_text(input)
+    if type != 'Dataframe':
+        # Previously an unknown type fell through and silently returned None.
+        raise ValueError("type must be 'Dataframe' or 'String', got %r" % (type,))
+    records = []
+    for position in range(len(input)):
+        # iloc is positional, so frames without a default 0..n-1 index are handled too.
+        cells = input.iloc[position, :].values.flatten().tolist()
+        present = [cell for cell in cells if str(cell) != 'nan']
+        if not present:
+            continue
+        # The last non-missing cell is the class; everything before it is description text.
+        *description_parts, class_code = present
+        description = ' '.join(str(part).strip() for part in description_parts)
+        records.append([class_code, _clean_text(description)])
+    cleaneddf = pd.DataFrame(records, columns=['Class', 'Description'])
+    cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
+    if output_path is not None:
+        cleaneddf.to_csv(output_path, index=False)
+    return cleaneddf
+### Mean Pooler
+def mean_pooling(model_output, attention_mask):
+    """
+    Performs a mean pooling over the token dimension to reduce the dimension of the embedding.
+    Uses torch operations throughout: the inputs are handled with the torch tensor API
+    (unsqueeze / expand / size), so the reduction no longer goes through TensorFlow.
+    :param model_output: model output whose first element contains all token embeddings
+    :param attention_mask: tokenizer attention mask; it is expanded to the token-embedding shape so
+        that positions with mask value 0 contribute nothing to the average
+    :return: torch tensor holding the masked sum over dimension 1 divided by the mask count
+    """
+    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
+    # Clamp the divisor so an all-zero mask cannot cause a division by zero.
+    mask_count = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return masked_sum / mask_count
+### Sentence Embedder
+_EMBEDDER_CACHE = {}  # model_path -> (tokenizer, model); each model directory is loaded once per process
+def _load_embedder(model_path):
+    """Return the (tokenizer, model) pair for model_path, loading it from disk on first use only."""
+    if model_path not in _EMBEDDER_CACHE:
+        tokenizer = AutoTokenizer.from_pretrained(model_path)  # instantiating the tokenizer using HuggingFace library
+        model = AutoModel.from_pretrained(model_path, from_tf=True)  # making a model instance
+        _EMBEDDER_CACHE[model_path] = (tokenizer, model)
+    return _EMBEDDER_CACHE[model_path]
+def sentence_embedder(sentences, model_path):
+    """
+    Calling the sentence similarity model to generate embeddings on input text.
+    The tokenizer and model are cached per model_path, so repeated calls (e.g. one per class
+    description) do not reload the model every time.
+    :param sentences: takes input text in the form of a string
+    :param model_path: path to the text similarity model
+    :return: mean-pooled embedding of the input text (the original notes describe it as (1, 384)
+        for a single string; the actual width depends on the model at model_path)
+    """
+    tokenizer, model = _load_embedder(model_path)
+    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    # Compute token embeddings without tracking gradients (inference only)
+    with torch.no_grad():
+        model_output = model(**encoded_input)
+    return mean_pooling(model_output, encoded_input['attention_mask'])
+### Sentence Embedding Preparation Function
+def convert_saved_embeddings(embedding_string):
+    """
+    Turns a pre-computed embedding that was saved as text back into a tensor, so it can be
+    compared (cosine similarity) with freshly computed abstract embeddings.
+    :param embedding_string: string form of a saved tensor, i.e. comma-separated numbers possibly
+        wrapped in 'tensor', parentheses, square brackets and spaces
+    :return: torch tensor of shape (1, number_of_values) built from a numpy array of Python floats
+    """
+    # Strip the wrapper tokens in a fixed order, leaving only comma-separated numbers.
+    numbers_only = embedding_string
+    for wrapper in ('(', ')', '[', ']', 'tensor', ' '):
+        numbers_only = numbers_only.replace(wrapper, '')
+    values = np.array([float(piece) for piece in numbers_only.split(',')])
+    # Add a leading axis so the result is a single-row matrix.
+    return torch.from_numpy(values[np.newaxis, :])
+### Generating Class Embeddings
+Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
+def class_embbedding_generator(classes, output_path=None):
+    """
+    Generates (and optionally saves) class embeddings.
+    Takes 'cleaned' classes, as produced by the clean_data function, computes a vector representation
+    (the embedding) of every class description and returns the result. Previously the dataframe was
+    built but neither returned nor saved, so the computed embeddings were lost.
+    :param classes: dataframe whose first column is the class name and whose second column is the description
+    :param output_path: optional CSV path; when given, the resulting dataframe is also written there
+    :return: dataframe with columns 'Class', 'Description' and 'Embedding' (one torch tensor per row)
+    """
+    class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
+    for class_name, class_description in zip(classes.iloc[:, 0], classes.iloc[:, 1]):
+        print(class_name)
+        embedding = sentence_embedder(class_description, Model_Path)
+        # Round-trip through numpy so the stored value is a torch tensor whatever
+        # tensor type the embedder returned.
+        embedding = torch.from_numpy(embedding.numpy())
+        class_embeddings.loc[len(class_embeddings)] = [class_name, class_description, embedding]
+    if output_path is not None:
+        class_embeddings.to_csv(output_path, index=False)
+    return class_embeddings
+### Broad Scope Classifier
+Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
+# Minimum cosine similarity a "green" class must reach for each sensitivity setting.
+_SENSITIVITY_THRESHOLDS = {'High': 0.5, 'Medium': 0.40, 'Low': 0.35}
+def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium', green_class_count=52):
+    """
+    Scores a single abstract embedding against every pre-computed class embedding using cosine similarity.
+    :param class_embeddings: dataframe of class embeddings; column 0 is the class name and column 2 the
+        embedding saved in string form (parsed with convert_saved_embeddings)
+    :param abstract_embedding: a single abstract embedding (any tensor exposing .numpy())
+    :param N: N highest matching classes to return, from highest to lowest, default is 5
+    :param Sensitivity: 'High', 'Medium' or 'Low'; selects the similarity threshold (0.5 / 0.40 / 0.35)
+    :param green_class_count: number of rows at the END of class_embeddings checked against the threshold
+        (default 52, the value previously hard-coded; presumably the green-plastics classes - confirm
+        against the embeddings file)
+    :return: (predictions, HighestSimilarity, GreenLikelihood) where predictions is a dataframe with the
+        score of every class, HighestSimilarity the N best-scoring rows, and GreenLikelihood the string
+        'True' if any of the last green_class_count scores reaches the threshold, else 'False'
+    :raises ValueError: if Sensitivity is not one of the supported settings
+    """
+    if Sensitivity not in _SENSITIVITY_THRESHOLDS:
+        # Previously an unknown value only failed later, with an unbound 'Threshold' variable.
+        raise ValueError("Sensitivity must be one of %s, got %r" % (sorted(_SENSITIVITY_THRESHOLDS), Sensitivity))
+    threshold = _SENSITIVITY_THRESHOLDS[Sensitivity]
+    # Loop-invariant work hoisted out of the loop: convert the abstract embedding to a torch tensor once.
+    # It is cast to float64 because convert_saved_embeddings builds its tensors from Python floats.
+    abstract_embedding = torch.from_numpy(abstract_embedding.numpy()).double()
+    cos = torch.nn.CosineSimilarity(dim=1)
+    rows = []
+    for i in range(len(class_embeddings)):
+        class_name = class_embeddings.iloc[i, 0]
+        embedding = convert_saved_embeddings(class_embeddings.iloc[i, 2])
+        score = cos(abstract_embedding, embedding)[0].item()
+        rows.append([class_name, score])
+    # Building the frame in one go avoids row-by-row growth and gives 'Score' a numeric dtype for nlargest.
+    predictions = pd.DataFrame(rows, columns=['Class Name', 'Score'])
+    greenpredictions = predictions.tail(green_class_count)
+    is_green = any(float(score) >= threshold for score in greenpredictions['Score'])
+    GreenLikelihood = 'True' if is_green else 'False'
+    HighestSimilarity = predictions.nlargest(N, ['Score'])
+    print(HighestSimilarity)
+    print(GreenLikelihood)
+    return predictions, HighestSimilarity, GreenLikelihood
+########## LOADING PRE-COMPUTED EMBEDDINGS ##########
+class_embeddings = pd.read_csv('ClassEmbedd/MainClassEmbeddings.csv')
+# Example abstract used to exercise the pipeline end to end.
+abstract = """
+Described herein are strength characteristics and biodegradation of articles produced using one or more “green” sustainable polymers and one or more carbohydrate-based polymers. A compatibilizer can optionally be included in the article. In some cases, the article can include a film, a bag, a bottle, a cap or lid therefore, a sheet, a box or other container, a plate, a cup, utensils, or the like.
+"""
+# Clean the raw text, embed it, then score it against every class embedding.
+abstract= clean_data(abstract, type='String')
+abstract_embedding = sentence_embedder(abstract, Model_Path)
+Number = 10
+broad_scope_predictions = broad_scope_class_predictor(class_embeddings, abstract_embedding, Number, Sensitivity='High')
+# Print the computed results (the original printed the function object itself).
+print(broad_scope_predictions)