Commit b437a5e
Parent(s): 79be5ba
commit files to HF hub

- agri_custom_pipeline.py +207 -0
- config.json +10 -1
- pytorch_model.bin +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +5 -6
agri_custom_pipeline.py
ADDED
@@ -0,0 +1,207 @@
+from transformers import AutoTokenizer, BertForSequenceClassification, Pipeline
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from nltk.corpus import stopwords
+from nltk.corpus import wordnet
+import numpy as np
+import warnings
+import string
+import torch
+import nltk
+import re
+
+# Download necessary NLTK packages
+nltk.download('averaged_perceptron_tagger')
+nltk.download("stopwords")
+nltk.download('wordnet')
+nltk.download('punkt')
+
+# Suppress warnings
+warnings.filterwarnings('ignore')
+
+# pre-processing modules
+class RemovePunctuation:
+    """
+    Class that removes the given punctuation marks from the default punctuation list.
+    """
+
+    def __init__(self):
+        """
+        Takes no parameters; starts from string.punctuation.
+        """
+        self.punctuation = string.punctuation
+
+    def __call__(self, punctuations):
+        """
+        Apply the transformation above.
+        :param punctuations: punctuation mark(s) to drop from the removal list (e.g. '?')
+        :return: the punctuation list with those marks excluded
+        """
+        if type(punctuations) == str:
+            punctuations = list(punctuations)
+        for punctuation in punctuations:
+            self.punctuation = self.punctuation.translate(str.maketrans('', '', punctuation))
+        return self.punctuation
+
+
+# Instantiate the remove_punctuation helper
+remove_punctuation = RemovePunctuation()
+
+
+def get_wordnet_pos(tag):
+    if tag.startswith('J'):
+        return wordnet.ADJ
+    elif tag.startswith('V'):
+        return wordnet.VERB
+    elif tag.startswith('N'):
+        return wordnet.NOUN
+    elif tag.startswith('R'):
+        return wordnet.ADV
+    else:
+        return wordnet.NOUN  # Default to noun if the part of speech is not recognized
+
+
+class ProcessText(object):
+
+    @staticmethod
+    def remove_punctuation_text(text):
+        """Custom function to remove the punctuation."""
+        res = (re.findall(r'\w+|[^\s\w]+', text))
+        name = []
+        for word in res:
+            clean_word = word.translate(str.maketrans('', '', remove_punctuation("")))
+            if clean_word != "":
+                name.append(clean_word)
+
+        return " ".join(name)
+
+    @staticmethod
+    def remove_stopwords(text):
+        stop_words = set(stopwords.words('english'))
+        words = word_tokenize(text)
+        filtered_words = [word for word in words if word.lower() not in stop_words]
+        return ' '.join(filtered_words)
+
+    @staticmethod
+    def lower_casing(text):
+        text_lower = text.lower()
+
+        return text_lower
+
+
+    @staticmethod
+    def lemmatize_text(text):
+        lemmatizer = WordNetLemmatizer()
+        words = word_tokenize(text)
+        tagged_words = nltk.pos_tag(words)
+        lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_words]
+        return ' '.join(lemmatized_words)
+
+    @staticmethod
+    def remove_duplicates_and_sort(text):
+        # Split the text into individual words
+        words = text.split()
+
+        # Create a set to store unique words (which automatically removes duplicates)
+        unique_words = set(words)
+
+        # Sort the unique words based on their original order in the text
+        sorted_unique_words = sorted(unique_words, key=lambda x: words.index(x))
+
+        # Join the sorted unique words back into a string with space as separator
+        sorted_text = ' '.join(sorted_unique_words)
+
+        return sorted_text
+
+    @staticmethod
+    def remove_numbers(text):
+        # Use regex to replace all numbers with an empty string
+        cleaned_text = re.sub(r'\d+', '', text)
+        return cleaned_text
+
+    @staticmethod
+    def include_words_with_len_greater_than_2(text):
+        # Split the text into words
+        words = text.split()
+
+        # Keep only words with length greater than 2
+        filtered_words = [word for word in words if len(word) > 2]
+
+        # Join the filtered words back into a text
+        cleaned_text = ' '.join(filtered_words)
+
+        return cleaned_text
+
+    def __call__(self, text):
+        # Remove any punctuation
+        text = self.remove_punctuation_text(text)
+
+        # Convert text to lower case
+        text = self.lower_casing(text)
+
+        # Stopwords such as "is", "the", etc. that convey no meaning are removed
+        text = self.remove_stopwords(text)
+
+        # Lemmatization converts words to their base or root form, considering their context and part of speech.
+        text = self.lemmatize_text(text)
+
+        # Since words are independent of one another in our problem scenario, we can sort the text by word and remove any duplicates
+        text = self.remove_duplicates_and_sort(text)
+
+        cleaned_text = self.include_words_with_len_greater_than_2(self.remove_numbers(text))
+
+        return cleaned_text
+
+
+def write_csv(file_path, rows):
+    with open(file_path, "w", newline="", encoding="utf-8") as data_file:
+        # create the csv writer object
+        csv_writer = csv.writer(data_file, lineterminator="\n")
+
+        # write to the same file
+        csv_writer.writerows(rows)
+
+
+# custom inference pipeline
+class AgriClfPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "text" in kwargs:
+            preprocess_kwargs["text"] = kwargs["text"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, text, **kwargs):
+        textPre_processing = ProcessText()
+        processed_description = textPre_processing(text)
+        try:
+            if type(processed_description) == str:
+                tokenizer = AutoTokenizer.from_pretrained("divyanshu94/agriBERT_clfModel")
+                processed_description = str(processed_description)
+                predToken = tokenizer.encode(processed_description, add_special_tokens=True)
+
+                max_len = 155
+                padded_predToken = np.array([predToken + [0]*(max_len-len(predToken))])
+                predAttention_mask = np.where(padded_predToken != 0, 1, 0)
+
+                input_idsPred = torch.tensor(padded_predToken)
+                attention_maskPred = torch.tensor(predAttention_mask)
+
+                return {"input_idsPred": input_idsPred, "attention_maskPred": attention_maskPred}
+        except Exception as error:
+            print("{}".format(str(error)))
+            return -1
+
+    def _forward(self, model_inputs):
+        input_idsPred = model_inputs["input_idsPred"]
+        attention_maskPred = model_inputs["attention_maskPred"]
+        self.model = self.model.to("cuda")  # Move the model to CUDA (a GPU is assumed)
+
+        with torch.no_grad():
+            output = self.model(input_idsPred.to("cuda"), token_type_ids=None, attention_mask=attention_maskPred.to("cuda"))
+            prediction = 1 if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 0
+
+        return {"logits": "agri" if prediction == 1 else "non-agri"}
+
+    def postprocess(self, model_outputs, **kwargs):
+        return model_outputs["logits"]
+
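For orientation, here is a minimal sketch of driving the committed class directly, outside the Hub's pipeline registry. It assumes the file above is importable as agri_custom_pipeline and that a CUDA device is present, since _forward() moves the model and tensors to "cuda" unconditionally; the input sentence is made up.

```python
from transformers import AutoTokenizer, BertForSequenceClassification

from agri_custom_pipeline import AgriClfPipeline

repo_id = "divyanshu94/agriBERT_clfModel"  # repo id as used in the file above
model = BertForSequenceClassification.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Pipeline subclasses accept the model and tokenizer directly.
clf = AgriClfPipeline(model=model, tokenizer=tokenizer)

# preprocess() cleans the text (punctuation, stopwords, lemmatization, de-duplication),
# pads to 155 tokens, and _forward() maps the argmax logit to "agri"/"non-agri".
print(clf("Wheat yields dropped after the late monsoon this season."))  # hypothetical input
```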
config.json
CHANGED
@@ -1,10 +1,19 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "divyanshu94/agriBERT_clfModel",
   "architectures": [
     "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
+  "custom_pipelines": {
+    "agri-classification": {
+      "impl": "agri_custom_pipeline.AgriClfPipeline",
+      "pt": [
+        "BertForSequenceClassification"
+      ],
+      "tf": []
+    }
+  },
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6a8d8c2c691a6ff22dc0c8e73f309e492cdeeaf4e0c956731b9ee9123e3f5846
+size 438000689
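Only the Git LFS pointer changes here: the new SHA-256 and size (roughly 438 MB) identify the updated weights stored in LFS. As a sketch, a locally downloaded copy can be checked against the pointer like this:

```python
import hashlib

# Expected values copied from the LFS pointer above.
EXPECTED_OID = "6a8d8c2c691a6ff22dc0c8e73f309e492cdeeaf4e0c956731b9ee9123e3f5846"
EXPECTED_SIZE = 438000689

def verify_lfs_object(path: str) -> bool:
    """Return True if the file's size and SHA-256 match the LFS pointer."""
    sha = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
            size += len(chunk)
    return size == EXPECTED_SIZE and sha.hexdigest() == EXPECTED_OID

# Hypothetical local path to the downloaded weights:
# print(verify_lfs_object("pytorch_model.bin"))
```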
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -6,7 +6,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special":
+      "special": true
     },
     "100": {
       "content": "[UNK]",
@@ -14,7 +14,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special":
+      "special": true
     },
     "101": {
       "content": "[CLS]",
@@ -22,7 +22,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special":
+      "special": true
     },
     "102": {
       "content": "[SEP]",
@@ -30,7 +30,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special":
+      "special": true
     },
     "103": {
       "content": "[MASK]",
@@ -38,7 +38,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special":
+      "special": true
     }
   },
   "additional_special_tokens": [],
@@ -54,6 +54,5 @@
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "DistilBertTokenizer",
-  "tokenizer_file": "/root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/tokenizer.json",
   "unk_token": "[UNK]"
 }
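These edits mark the reserved tokens as "special": true and drop the machine-specific tokenizer_file path, so the tokenizer resolves entirely from the files in this repo. A quick sanity check after reloading, as a sketch:

```python
from transformers import AutoTokenizer

# Repo id assumed from the config.json change in this commit.
tok = AutoTokenizer.from_pretrained("divyanshu94/agriBERT_clfModel")

# With "special": true these tokens are treated as special tokens, e.g. they are
# dropped when decoding with skip_special_tokens=True.
print(tok.all_special_tokens)  # expected to include [UNK], [CLS], [SEP], [MASK]

ids = tok.encode("soil moisture sensors", add_special_tokens=True)
print(tok.decode(ids, skip_special_tokens=True))
```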