rajaatif786 committed on
Commit
d785525
·
verified ·
1 Parent(s): 656a71b

Upload 5 files

Browse files
Files changed (5) hide show
  1. EntityBertNet +24 -0
  2. config.json +23 -0
  3. labels.pkl +3 -0
  4. labels_map.pkl +3 -0
  5. pipeline.py +195 -0
EntityBertNet ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class EntityBertNet(nn.Module):
    """BERT encoder with max-pooling over entity token positions and a
    linear classification head.

    Relies on module-level names defined elsewhere in this file's imports:
    TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES, NUM_CLASSES, nn, torch,
    BertConfig, BertModel.
    """

    def __init__(self):
        super(EntityBertNet, self).__init__()
        # Load pretrained BERT weights and matching config.
        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
        # Linear head mapping the pooled hidden state to class logits.
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, NUM_CLASSES)

    def forward(self, input_ids, attn_mask, entity_indices):
        # BERT
        # return_dict=False yields a (sequence_output, pooled_output) tuple;
        # only the per-token sequence output is kept.
        bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask,return_dict=False)
        # max pooling at entity locations
        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)

        # fc layer (softmax activation done in loss function)
        x = self.fc(entity_pooled_output)
        return x

    @staticmethod
    def pooled_output(bert_output, indices):
        #print(bert_output)
        # Gather the token vectors at the entity positions, then take the
        # element-wise max over those positions.
        # assumes indices is (batch, span_len, hidden) with each position
        # repeated across the hidden axis -- TODO confirm against the
        # companion indices_for_entity_ranges helper.
        outputs = torch.gather(input=bert_output, dim=1, index=indices)
        pooled_output, _ = torch.max(outputs, dim=1)
        return pooled_output
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "gradient_checkpointing": false,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "transformers_version": "4.6.0.dev0",
20
+ "type_vocab_size": 2,
21
+ "use_cache": true,
22
+ "vocab_size": 30522
23
+ }
labels.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a894c21df4b8ff39856d53e2d78a203954b6071e82f5541fcc11bb31e0242ef
3
+ size 489655
labels_map.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73e1ea242ff1fad7b45455f772cd033a740e1508d47160a83d1c23a680b16c8e
3
+ size 592260
pipeline.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Module prologue: imports, environment setup, and shared constants.
#
# Fixes relative to the original: `nltk`, `torch`, `pickle` and `typing.Dict`
# were all used later in this file but never imported (NameError at runtime);
# `time`, `numpy`, `nltk.download('punkt')`, `os.chdir` and `device` were each
# repeated; `from time import time` was later shadowed by `import time`, so
# only the module import is kept (matching the original's final state).

import itertools
import os
import os.path
import pickle
import string
import time
from ast import literal_eval
from typing import Dict

import nltk
import numpy as np
import pandas as pd
import parmap
import torch
import torch.nn as nn
from anytree import Node, RenderTree, PreOrderIter
from gensim.models.phrases import Phrases, Phraser
from pathos.multiprocessing import ProcessingPool as Pool
from sklearn import metrics
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, Dataset
from transformers import *
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup

#from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset, generate_batch, generate_production_batch
#from agent.target_extraction.BERT.relation_extractor.pairbertnet import NUM_CLASSES, PairBertNet

# NLTK resources needed for tokenisation / lemmatisation.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Colab working directory (side effect preserved from the original script).
os.chdir('/content/')

# NOTE(review): assumes a CUDA device is available -- confirm for the
# deployment environment.
device = torch.device('cuda')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_SEQ_LEN = 256
MASK_TOKEN = '[MASK]'
BATCH_SIZE = 32
57
def generate_production_batch(batch):
    """Collate a list of instances into model-ready tensors.

    Parameters:
        batch: iterable of instances exposing ``.tokens`` (list[str]) and
            ``.entity_range`` ((start, end) token indices).

    Returns:
        (input_ids, attn_mask, entity_indices, batch) -- the first three are
        tensors; the original batch is passed through unchanged.
    """
    # Re-join each instance's tokens into one string for the tokenizer.
    # (The original built this via a generator wrapped in a list plus two
    # chain.from_iterable passes -- equivalent, but needlessly indirect.)
    texts = [' '.join(instance.tokens) for instance in batch]

    # padding='max_length' + truncation=True replace the deprecated
    # pad_to_max_length=True (which both padded and truncated to max_length).
    encoded = tokenizer(texts, add_special_tokens=True,
                        max_length=MAX_SEQ_LEN, padding='max_length',
                        truncation=True,
                        return_tensors='pt')
    input_ids = encoded['input_ids']
    attn_mask = encoded['attention_mask']

    # Index tensor selecting each instance's entity token positions.
    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])

    return input_ids, attn_mask, entity_indices, batch
71
+
72
+
73
def indices_for_entity_ranges(ranges, hidden_size=None):
    """Build a gather-index tensor for entity token spans.

    Parameters:
        ranges: list of (start, end) inclusive token positions, one per
            instance.
        hidden_size: feature width each index is repeated over; defaults to
            the module-level HIDDEN_OUTPUT_FEATURES (backward compatible --
            the original hard-coded that global).

    Returns:
        LongTensor of shape (batch, max_span_len + 1, hidden_size); slots
        past an instance's own span are clamped to its `end` index, so a
        subsequent gather + max-pool over the span is unaffected by padding.
    """
    if hidden_size is None:
        hidden_size = HIDDEN_OUTPUT_FEATURES
    max_e_len = max(end - start for start, end in ranges)
    # min(t, end) clamps shorter spans: padded slots repeat the final token.
    indices = torch.tensor([[[min(t, end)] * hidden_size
                             for t in range(start, start + max_e_len + 1)]
                            for start, end in ranges])
    return indices
79
+
80
+
81
# Load the label set and label->id map once. The original repeated this
# entire section verbatim, re-reading both pickles a second time for no
# effect, and used a manual open/close pair for labels.pkl.
# NOTE(review): `project_dir` is not defined anywhere in this file --
# confirm it is set upstream before this module is imported.
# NOTE(review): pickle.load on a repository file is trusted input here;
# never point these paths at untrusted data.
with open(project_dir + "/labels.pkl", "rb") as f:
    LABELS = pickle.load(f)
with open(project_dir + '/labels_map.pkl', 'rb') as f:
    LABEL_MAP = pickle.load(f)
92
+
93
+
94
class EntityDataset(Dataset):
    """Pandas-backed dataset yielding one PairRelInstance per row.

    Each row must provide 'entityMentions' (a list of dicts with a 'text'
    key, possibly serialised as a repr string) and 'sentText'.
    """

    def __init__(self, df, size=None):
        # filter inapplicable rows; instance_from_row is evaluated again in
        # __getitem__, so it must stay cheap and deterministic
        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
        print(len(self.df))

        # sample data if a size is specified
        if size is not None and size < len(self):
            self.df = self.df.sample(size, replace=False)

    @staticmethod
    def from_df(df, size=None):
        """Alternate constructor that logs the resulting dataset size."""
        dataset = EntityDataset(df, size=size)
        print('Obtained dataset of size', len(dataset))
        return dataset

    @staticmethod
    def instance_from_row(row):
        """Build an instance from a dataframe row, or None if inapplicable."""
        # 'entityMentions' may arrive as a repr string; use isinstance rather
        # than the original `type(...) is str` comparison.
        raw = row['entityMentions']
        unpacked_arr = literal_eval(raw) if isinstance(raw, str) else raw
        # Only the first mention is used; the label filtering above it was
        # commented out in the original and is preserved as-is.
        #rms = [rm for rm in unpacked_arr if 'label' not in rm or rm['label'] in LABELS]
        entity = unpacked_arr[0]['text']

        text = row['sentText']
        return EntityDataset.get_instance(text, entity)

    @staticmethod
    def get_instance(text, entity, label=None):
        """Tokenize `text` and wrap it in a PairRelInstance.

        The original declared dead locals (`i = 0`, an always-true
        `found_entity` guard); they are removed here without changing the
        outcome -- every call returned an instance.
        """
        tokens = tokenizer.tokenize(text)
        # NOTE(review): entity localisation is stubbed out -- this range is a
        # hard-coded placeholder, not the entity's true token span; confirm
        # before relying on entity pooling.
        entity_range = (0, 100)
        return PairRelInstance(tokens, entity, entity_range, None, text)

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        return EntityDataset.instance_from_row(self.df.iloc[idx])
145
+
146
+
147
+
148
class PairRelInstance:
    """Plain record for one entity-mention example.

    Fields:
        tokens: wordpiece token list for the sentence.
        entity: the entity's surface string.
        entity_range: (start, end) token span of the entity.
        label: class id, or None at inference time.
        text: the raw sentence.
    """

    def __init__(self, tokens, entity, entity_range, label, text):
        # Independent assignments; order is irrelevant.
        self.text = text
        self.label = label
        self.entity_range = entity_range
        self.entity = entity
        self.tokens = tokens
156
+
157
+ #device = torch.device('cpu')
158
+ #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
159
+
160
class PreTrainedPipeline():
    """Custom Hugging Face inference-pipeline entry point.

    The Inference API instantiates this class with the local repository
    path and calls the instance with raw inputs.
    """

    def __init__(self, path):
        # Load the model from the supplied repository path. The original
        # referenced TRAINED_WEIGHTS, which is never defined in this file
        # and raised NameError at construction time; `path` was ignored.
        config = BertConfig.from_pretrained(path)
        self.model = BertModel.from_pretrained(path, config=config)

    def __call__(self, inputs) -> Dict[str, str]:
        # NOTE(review): placeholder response -- inference over `inputs` is
        # not implemented yet; the model loaded above is unused here.
        return {
            "text": "hello"
        }
170
+
171
class EntityBertNet(nn.Module):
    """BERT encoder with max-pooling over entity token positions and a
    linear head producing class logits (softmax is applied by the loss)."""

    def __init__(self):
        super(EntityBertNet, self).__init__()
        # Pretrained BERT backbone plus its matching configuration.
        cfg = BertConfig.from_pretrained(TRAINED_WEIGHTS)
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=cfg)
        # Classification head over the pooled entity representation.
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, NUM_CLASSES)

    def forward(self, input_ids, attn_mask, entity_indices):
        # return_dict=False gives a (sequence_output, pooled_output) tuple;
        # only the per-token sequence output is needed.
        sequence_output, _ = self.bert_base(
            input_ids=input_ids, attention_mask=attn_mask, return_dict=False)
        # Max-pool the hidden states at the entity positions, then project
        # to class logits.
        entity_repr = EntityBertNet.pooled_output(sequence_output, entity_indices)
        return self.fc(entity_repr)

    @staticmethod
    def pooled_output(bert_output, indices):
        # Select the token vectors at the entity positions and take the
        # element-wise maximum over the span dimension.
        gathered = torch.gather(input=bert_output, dim=1, index=indices)
        span_max, _ = torch.max(gathered, dim=1)
        return span_max