ennioferreirab committed on
Commit ac03f85 · 1 Parent(s): d67226c

add model
.gitattributes CHANGED
@@ -25,3 +25,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ training_assets/cross_silver_scores_v3.pkl filter=lfs diff=lfs merge=lfs -text
+ training_assets/silver_cross_samples.pkl filter=lfs diff=lfs merge=lfs -text
+ training_assets/silver_data.pkl filter=lfs diff=lfs merge=lfs -text
+ training_assets/gold_eval_dataloader.pkl filter=lfs diff=lfs merge=lfs -text
+ training_assets/gold_train_dataloader.pkl filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
README.md ADDED
@@ -0,0 +1,126 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+
+ ---
+
+ # {MODEL_NAME}
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ <!--- Describe your model here -->
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is straightforward once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+
+ ## Usage (HuggingFace Transformers)
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ # Mean pooling - take the attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
+ model = AutoModel.from_pretrained('{MODEL_NAME}')
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling. In this case, mean pooling.
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+
+ ## Evaluation Results
+
+ <!--- Describe how your model was evaluated -->
+
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+
+ ## Training
+ The model was trained with the parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 80805 with parameters:
+ ```
+ {'batch_size': 8, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.CosineSimilarityLoss.CosineSimilarityLoss`
+
+ Parameters of the fit()-method:
+ ```
+ {
+     "epochs": 3,
+     "evaluation_steps": 10000,
+     "evaluator": "__main__.EmbeddingSimilarityEvaluator",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+     "optimizer_params": {
+         "lr": 2e-05
+     },
+     "scheduler": "WarmupLinear",
+     "steps_per_epoch": null,
+     "warmup_steps": 24242,
+     "weight_decay": 0.01
+ }
+ ```
+
+
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+ )
+ ```
+
+ ## Citing & Authors
+
+ <!--- Describe where people can find more information -->
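The mean-pooling recipe in the README above can be sanity-checked in isolation, without downloading the model. This is a minimal sketch that feeds random tensors (standing in for real BERT outputs, with the model's 768-dimensional hidden size) through the same `mean_pooling` function; the dummy shapes and values are invented for illustration:

```python
import torch

def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Dummy batch: 2 sequences of 4 tokens, hidden size 768; the second sequence has one padding token
token_embeddings = torch.randn(2, 4, 768)
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])

embeddings = mean_pooling((token_embeddings,), attention_mask)
print(embeddings.shape)  # torch.Size([2, 768])
```

Because the attention mask zeroes out the padding position before averaging, the second embedding is the mean of its first three tokens only, not all four.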
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "/content/drive/MyDrive/Colab_Notebooks/Anatel-gdrive/anatel_train_sts_cross_bm25/augmented-bert-portuguese-anatel-last-train/",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.21.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 29794
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.2.2",
+     "transformers": "4.21.0",
+     "pytorch": "1.12.0+cu113"
+   }
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4c080e8cd37d8c8cb7b923fd7e780bfb4ad3ea20ab40edc4083f8124c8e29bd
+ size 435761969
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 512,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "name_or_path": "/content/drive/MyDrive/Colab_Notebooks/Anatel-gdrive/anatel_train_sts_cross_bm25/augmented-bert-portuguese-anatel-last-train/",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": "/root/.cache/huggingface/transformers/eecc45187d085a1169eed91017d358cc0e9cbdd5dc236bcd710059dbf0a2f816.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
training_assets/2_train_sts_cross_bm25.py ADDED
@@ -0,0 +1,43 @@
+ #%%
+ import joblib
+ import numpy as np
+ from tqdm import tqdm
+ from request_solr import SilverDataset
+ from sentence_transformers.cross_encoder import CrossEncoder
+ from solr_query_params import params
+
+ ############################################################################
+ #
+ # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/data_augmentation/train_sts_indomain_bm25.py
+ # Step 2: Label the BM25-sampled STSb (silver dataset) using the cross-encoder model
+ #
+ ############################################################################
+
+
+ cross_encoder_path = 'ennioferreirab/cross-encoder-pt-anatel-metadados-assunto'
+ gold_sample_index = set()
+ with open('gold_sample_index.txt', 'r') as f:
+     for line in f:
+         gold_sample_index.add(line.strip())
+
+ try:
+     silver_data = joblib.load('silver_data_v2.pkl')
+ except FileNotFoundError:
+     print('Creating silver data...')
+     silver_data = SilverDataset(query_params=params, duplicated=gold_sample_index).run()
+     joblib.dump(silver_data, 'silver_data_v2.pkl')
+     print('Done!')
+
+ sentences = [(sent_1, sent_2) for sent_1, sent_2, _ in silver_data]
+
+ cross_encoder = CrossEncoder(cross_encoder_path, max_length=512)
+ cross_silver_scores = []
+ for pair in tqdm(sentences):
+     cross_silver_scores.append(cross_encoder.predict(pair))
+
+ # list() in case silver_data is a set of pairs (run() returns a set)
+ cross_silver_data = np.c_[np.array(list(silver_data)), np.array(cross_silver_scores)]
+
+ # All model predictions should be between [0, 1]
+ assert all(0.0 <= score <= 1.0 for score in cross_silver_scores)
+
+ joblib.dump(cross_silver_data, 'cross_silver_scores_2.pkl')
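The script above stacks the `(sentence_1, sentence_2, bm25_score)` triples with the cross-encoder predictions via `np.c_`. A minimal sketch of that step with invented pairs and scores (no model download; the sentence strings and score values are hypothetical):

```python
import numpy as np

# Hypothetical silver pairs: (sentence_1, sentence_2, normalized BM25 score)
silver_data = [
    ("renewal of radio license", "radio license renewal request", 0.91),
    ("billing complaint", "request for spectrum allocation", 0.12),
]
# Hypothetical cross-encoder predictions, one per pair
cross_silver_scores = [0.88, 0.05]

# np.c_ appends the scores as a new column, giving rows of
# (sentence_1, sentence_2, bm25_score, cross_encoder_score)
cross_silver_data = np.c_[np.array(silver_data, dtype=object),
                          np.array(cross_silver_scores, dtype=object)]
print(cross_silver_data.shape)  # (2, 4)
```

Using `dtype=object` keeps strings and floats side by side; `np.c_` promotes the 1-D score array to a column before concatenating along axis 1.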
training_assets/cross_silver_scores_v3.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd9d6a0296f0a1e9589ac8550d6095d9f53985ecd3fc3a8f1e4398426acb84d0
+ size 239383791
training_assets/gold_eval_dataloader.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8901155d353af2a1fad078daafb7eabab5bc6779d69ddb3768a359ac2b50bdad
+ size 127396
training_assets/gold_sample_index.txt ADDED
@@ -0,0 +1,825 @@
1
+ 4460666
2
+ 4646076
3
+ 4740485
4
+ 4841542
5
+ 5137102
6
+ 5433651
7
+ 5619579
8
+ 6730044
9
+ 6960718
10
+ 6983639
11
+ 7331138
12
+ 7460176
13
+ 8137405
14
+ 5133708
15
+ 5591405
16
+ 6098115
17
+ 6200408
18
+ 6816332
19
+ 7073711
20
+ 8252830
21
+ 5333260
22
+ 5545291
23
+ 5903862
24
+ 6070901
25
+ 6269728
26
+ 6391200
27
+ 6710641
28
+ 6763601
29
+ 5808966
30
+ 6164703
31
+ 5809430
32
+ 6379512
33
+ 8182456
34
+ 5177401
35
+ 5510278
36
+ 6083218
37
+ 6270034
38
+ 6543759
39
+ 5369628
40
+ 6099099
41
+ 6174942
42
+ 7902136
43
+ 5340149
44
+ 5401404
45
+ 5981567
46
+ 6465290
47
+ 6643010
48
+ 7835158
49
+ 7889240
50
+ 8084176
51
+ 5433923
52
+ 5498005
53
+ 5516542
54
+ 5809052
55
+ 5855590
56
+ 5985223
57
+ 6110110
58
+ 6151666
59
+ 6153056
60
+ 6439888
61
+ 6525429
62
+ 6556589
63
+ 6635224
64
+ 6986996
65
+ 7080918
66
+ 7114149
67
+ 7128483
68
+ 7886766
69
+ 8131931
70
+ 4682622
71
+ 5164923
72
+ 5503680
73
+ 5920921
74
+ 6209634
75
+ 6422257
76
+ 6872536
77
+ 7427806
78
+ 7986978
79
+ 7994954
80
+ 5774177
81
+ 5989286
82
+ 7508398
83
+ 5591454
84
+ 6102860
85
+ 5637632
86
+ 5859904
87
+ 4370203
88
+ 4949468
89
+ 5426346
90
+ 5859209
91
+ 6266408
92
+ 5387595
93
+ 5465052
94
+ 5521946
95
+ 5991666
96
+ 6209282
97
+ 6491862
98
+ 6548583
99
+ 7494666
100
+ 7595318
101
+ 8133086
102
+ 5634836
103
+ 5850384
104
+ 5863288
105
+ 6398949
106
+ 6635184
107
+ 6904804
108
+ 6975680
109
+ 7969196
110
+ 8163454
111
+ 6887423
112
+ 4479704
113
+ 5804564
114
+ 4646057
115
+ 4649560
116
+ 4895445
117
+ 5182219
118
+ 5205994
119
+ 5504036
120
+ 5864758
121
+ 6960731
122
+ 7337160
123
+ 8158609
124
+ 8165279
125
+ 8192206
126
+ 5766461
127
+ 6112431
128
+ 8179460
129
+ 3674750
130
+ 5230043
131
+ 6781048
132
+ 6862078
133
+ 8133237
134
+ 5608073
135
+ 5640398
136
+ 5987322
137
+ 6103319
138
+ 7348722
139
+ 7599312
140
+ 7889440
141
+ 7954697
142
+ 8151933
143
+ 6081741
144
+ 5261625
145
+ 5856321
146
+ 6464888
147
+ 6520857
148
+ 6779057
149
+ 6913683
150
+ 7090008
151
+ 7348871
152
+ 7410868
153
+ 8091065
154
+ 8131423
155
+ 8231168
156
+ 5550518
157
+ 8163535
158
+ 7783020
159
+ 8150022
160
+ 5992730
161
+ 6866793
162
+ 8139859
163
+ 7016421
164
+ 8213148
165
+ 4740150
166
+ 5805453
167
+ 4646084
168
+ 5174687
169
+ 5550543
170
+ 6010237
171
+ 6875009
172
+ 7030819
173
+ 7421517
174
+ 7492466
175
+ 7493941
176
+ 7522041
177
+ 7602469
178
+ 7650684
179
+ 7659691
180
+ 7991553
181
+ 8090644
182
+ 4486736
183
+ 5482615
184
+ 5551413
185
+ 5208352
186
+ 5546726
187
+ 5683883
188
+ 5689258
189
+ 5871416
190
+ 5920031
191
+ 4574013
192
+ 4888346
193
+ 5183619
194
+ 5504640
195
+ 5984725
196
+ 6026816
197
+ 6415398
198
+ 8141389
199
+ 7114643
200
+ 4620965
201
+ 5164684
202
+ 5406485
203
+ 5586960
204
+ 5855047
205
+ 6209691
206
+ 6392602
207
+ 8183810
208
+ 5276276
209
+ 5795308
210
+ 5327818
211
+ 4596269
212
+ 5272298
213
+ 5430104
214
+ 5613420
215
+ 5779154
216
+ 6294837
217
+ 6860757
218
+ 8154715
219
+ 8163261
220
+ 8258448
221
+ 4007242
222
+ 5326368
223
+ 6035394
224
+ 6108765
225
+ 6317765
226
+ 6791523
227
+ 6974773
228
+ 8152457
229
+ 8190516
230
+ 6712858
231
+ 6465138
232
+ 6664825
233
+ 8163904
234
+ 7892776
235
+ 5756069
236
+ 8205392
237
+ 7203259
238
+ 5599430
239
+ 5350003
240
+ 5855756
241
+ 6270070
242
+ 7145585
243
+ 5091384
244
+ 5876381
245
+ 5957940
246
+ 6108724
247
+ 6421665
248
+ 6620134
249
+ 7209796
250
+ 8151360
251
+ 4087034
252
+ 4531381
253
+ 4596045
254
+ 4855347
255
+ 5162839
256
+ 5235040
257
+ 5253056
258
+ 5327778
259
+ 5329186
260
+ 5401428
261
+ 5617203
262
+ 5625555
263
+ 5675553
264
+ 5773205
265
+ 5843647
266
+ 5872380
267
+ 5989087
268
+ 6215135
269
+ 6439752
270
+ 6482537
271
+ 6537688
272
+ 6604972
273
+ 6729495
274
+ 8133210
275
+ 5272072
276
+ 5516186
277
+ 7629938
278
+ 7423420
279
+ 5159992
280
+ 6980613
281
+ 6627386
282
+ 3913260
283
+ 5515029
284
+ 7630311
285
+ 5685194
286
+ 6033758
287
+ 6409793
288
+ 4595837
289
+ 3862776
290
+ 5540750
291
+ 6032434
292
+ 5930452
293
+ 5971699
294
+ 6548457
295
+ 5511835
296
+ 5563594
297
+ 6405254
298
+ 6958561
299
+ 5408482
300
+ 5502073
301
+ 5659569
302
+ 6072736
303
+ 6418523
304
+ 7126761
305
+ 4418008
306
+ 4460858
307
+ 5345610
308
+ 1825442
309
+ 2051705
310
+ 7650052
311
+ 5314776
312
+ 5571539
313
+ 4165786
314
+ 4551806
315
+ 3973194
316
+ 4304284
317
+ 4663854
318
+ 5211035
319
+ 5935558
320
+ 5852416
321
+ 3866461
322
+ 1984797
323
+ 4348571
324
+ 2667288
325
+ 2875287
326
+ 2963171
327
+ 3961144
328
+ 4096818
329
+ 4209438
330
+ 4409058
331
+ 4602656
332
+ 5238188
333
+ 5240637
334
+ 5481769
335
+ 5486463
336
+ 5497117
337
+ 5502194
338
+ 5502389
339
+ 5502390
340
+ 5627659
341
+ 5641429
342
+ 5673797
343
+ 5674040
344
+ 5677426
345
+ 5722966
346
+ 5858953
347
+ 6030890
348
+ 6057740
349
+ 6081452
350
+ 6092841
351
+ 6397111
352
+ 6758807
353
+ 7760525
354
+ 7889990
355
+ 8187830
356
+ 6733763
357
+ 5256783
358
+ 5454894
359
+ 6254439
360
+ 5430395
361
+ 5564934
362
+ 5790594
363
+ 7407966
364
+ 8120458
365
+ 3809160
366
+ 3947410
367
+ 4280956
368
+ 4950329
369
+ 5159954
370
+ 7245076
371
+ 7994916
372
+ 5476366
373
+ 5392611
374
+ 7296149
375
+ 4675474
376
+ 7771746
377
+ 4162522
378
+ 4540151
379
+ 5516359
380
+ 5586556
381
+ 5512844
382
+ 5557217
383
+ 4712460
384
+ 6593592
385
+ 8050832
386
+ 8235082
387
+ 8283799
388
+ 7236674
389
+ 5887216
390
+ 5990578
391
+ 7523737
392
+ 7880724
393
+ 8199567
394
+ 5637600
395
+ 4201008
396
+ 4406618
397
+ 8249957
398
+ 5591666
399
+ 6022264
400
+ 1827408
401
+ 2605566
402
+ 3091273
403
+ 3691667
404
+ 5807405
405
+ 5034084
406
+ 6428700
407
+ 5568958
408
+ 6021279
409
+ 6275467
410
+ 7075117
411
+ 6088741
412
+ 6446079
413
+ 3678407
414
+ 3892052
415
+ 3802952
416
+ 4660263
417
+ 5523325
418
+ 5476893
419
+ 5956550
420
+ 6074852
421
+ 6391212
422
+ 0888264
423
+ 1226636
424
+ 1325045
425
+ 1796335
426
+ 3863377
427
+ 4190087
428
+ 4872263
429
+ 5639415
430
+ 4598502
431
+ 4938680
432
+ 5394225
433
+ 5941839
434
+ 5986252
435
+ 6831056
436
+ 7389491
437
+ 5518440
438
+ 5328013
439
+ 5795782
440
+ 6405189
441
+ 6910804
442
+ 7428552
443
+ 3912783
444
+ 4334946
445
+ 4750544
446
+ 5428246
447
+ 5534921
448
+ 8232931
449
+ 5627612
450
+ 5882930
451
+ 5924285
452
+ 5564068
453
+ 6115268
454
+ 6603458
455
+ 5503797
456
+ 5591747
457
+ 6009845
458
+ 8161801
459
+ 4287397
460
+ 6508273
461
+ 6707346
462
+ 8196196
463
+ 5608601
464
+ 6020852
465
+ 6105662
466
+ 6529304
467
+ 4024170
468
+ 8132569
469
+ 5236024
470
+ 5403472
471
+ 5397274
472
+ 6410230
473
+ 6939796
474
+ 8182990
475
+ 6538571
476
+ 3717823
477
+ 4141332
478
+ 4475895
479
+ 6015699
480
+ 6090450
481
+ 6594521
482
+ 6969505
483
+ 8131792
484
+ 7759380
485
+ 5532712
486
+ 5886827
487
+ 3174130
488
+ 3714303
489
+ 5016494
490
+ 5745123
491
+ 6091768
492
+ 8164430
493
+ 5878425
494
+ 4791647
495
+ 7517583
496
+ 5662890
497
+ 3974603
498
+ 4086799
499
+ 4282564
500
+ 5344043
501
+ 5414706
502
+ 5416591
503
+ 5808227
504
+ 0975861
505
+ 1538893
506
+ 1591036
507
+ 1838447
508
+ 5138501
509
+ 6715682
510
+ 5863991
511
+ 5135182
512
+ 5221262
513
+ 4922845
514
+ 6731643
515
+ 7907513
516
+ 8053350
517
+ 5228986
518
+ 6415154
519
+ 5141202
520
+ 4646096
521
+ 5167166
522
+ 4772438
523
+ 5309058
524
+ 5342240
525
+ 6400759
526
+ 6881639
527
+ 6075097
528
+ 6614341
529
+ 5468988
530
+ 5551297
531
+ 3503104
532
+ 3963399
533
+ 4274953
534
+ 6825998
535
+ 7434061
536
+ 7668958
537
+ 4870966
538
+ 5110222
539
+ 5418746
540
+ 5418901
541
+ 5489424
542
+ 6005690
543
+ 6106680
544
+ 6450841
545
+ 6022525
546
+ 5171532
547
+ 7504807
548
+ 7752278
549
+ 6420131
550
+ 4779688
551
+ 4898526
552
+ 5134283
553
+ 5262888
554
+ 4648476
555
+ 7521721
556
+ 5208120
557
+ 6792659
558
+ 8085896
559
+ 4848964
560
+ 7435248
561
+ 6161277
562
+ 7887662
563
+ 6742588
564
+ 5626956
565
+ 5618086
566
+ 5734885
567
+ 5090949
568
+ 5116239
569
+ 5124741
570
+ 5159732
571
+ 5219306
572
+ 5591519
573
+ 5676606
574
+ 5690219
575
+ 5786819
576
+ 6041528
577
+ 6072192
578
+ 7884624
579
+ 7981332
580
+ 8084930
581
+ 5612301
582
+ 4731346
583
+ 5623910
584
+ 6091875
585
+ 6281436
586
+ 5213783
587
+ 6009557
588
+ 7039607
589
+ 7679918
590
+ 5176028
591
+ 5560512
592
+ 5874435
593
+ 6520636
594
+ 5468132
595
+ 5556839
596
+ 6075388
597
+ 6167589
598
+ 6402939
599
+ 6439732
600
+ 6604418
601
+ 5205819
602
+ 5335110
603
+ 5384794
604
+ 6173780
605
+ 7613196
606
+ 5161886
607
+ 5426013
608
+ 5520987
609
+ 5559035
610
+ 5931478
611
+ 4540213
612
+ 5167812
613
+ 5364524
614
+ 5400717
615
+ 5547227
616
+ 5605539
617
+ 5622937
618
+ 5639412
619
+ 5795790
620
+ 6906068
621
+ 6947670
622
+ 7963695
623
+ 6178278
624
+ 5101120
625
+ 5862886
626
+ 5470473
627
+ 6002283
628
+ 5186421
629
+ 5196824
630
+ 5226107
631
+ 5231329
632
+ 5235885
633
+ 5245418
634
+ 5247129
635
+ 5328934
636
+ 5338196
637
+ 5344932
638
+ 5347088
639
+ 5355720
640
+ 5362812
641
+ 5365107
642
+ 5376109
643
+ 5376117
644
+ 5380762
645
+ 5393873
646
+ 5394467
647
+ 5395955
648
+ 5415840
649
+ 5425280
650
+ 5449732
651
+ 5503636
652
+ 5541083
653
+ 5743983
654
+ 5747840
655
+ 5792689
656
+ 5794851
657
+ 5839502
658
+ 5887460
659
+ 5926330
660
+ 6009768
661
+ 7039563
662
+ 8044928
663
+ 8216267
664
+ 3730523
665
+ 4786125
666
+ 5211966
667
+ 5469180
668
+ 6897698
669
+ 5138518
670
+ 5401485
671
+ 5753393
672
+ 6180767
673
+ 4106645
674
+ 6164679
675
+ 6391213
676
+ 6401090
677
+ 7373471
678
+ 6916874
679
+ 5103812
680
+ 5744044
681
+ 6021351
682
+ 8010147
683
+ 4813869
684
+ 7949719
685
+ 5349964
686
+ 5846693
687
+ 8064168
688
+ 6812336
689
+ 5674311
690
+ 5153043
691
+ 5159853
692
+ 5399855
693
+ 5424931
694
+ 5444239
695
+ 5727887
696
+ 5945666
697
+ 6092852
698
+ 2010236
699
+ 2097934
700
+ 3756637
701
+ 3874160
702
+ 3976877
703
+ 4301951
704
+ 5091159
705
+ 5420402
706
+ 5466778
707
+ 5476112
708
+ 6158436
709
+ 5545661
710
+ 6726156
711
+ 5920353
712
+ 6538842
713
+ 6519579
714
+ 6880547
715
+ 5206272
716
+ 6106472
717
+ 6516584
718
+ 6732221
719
+ 3872348
720
+ 5116412
721
+ 5563281
722
+ 3856904
723
+ 4428505
724
+ 5345839
725
+ 6094897
726
+ 6419864
727
+ 5945139
728
+ 4706163
729
+ 3725580
730
+ 4003539
731
+ 4027979
732
+ 4736809
733
+ 5025370
734
+ 5084254
735
+ 5211319
736
+ 5257759
737
+ 5429093
738
+ 5441857
739
+ 5505327
740
+ 5560235
741
+ 5602619
742
+ 5606581
743
+ 5793811
744
+ 5807776
745
+ 6010825
746
+ 6059919
747
+ 6081869
748
+ 6137061
749
+ 6549955
750
+ 6614527
751
+ 7246382
752
+ 4741669
753
+ 0908100
754
+ 4486339
755
+ 5162489
756
+ 5493532
757
+ 5636837
758
+ 6049972
759
+ 4047135
760
+ 4385386
761
+ 8248665
762
+ 1402356
763
+ 4967941
764
+ 5092529
765
+ 5214722
766
+ 5228115
767
+ 5291887
768
+ 5519267
769
+ 5537326
770
+ 5795976
771
+ 5796440
772
+ 5803348
773
+ 5810185
774
+ 5852507
775
+ 5909067
776
+ 5977933
777
+ 6796500
778
+ 6863557
779
+ 7428664
780
+ 8152906
781
+ 4404049
782
+ 2607396
783
+ 3183641
784
+ 3184217
785
+ 3449359
786
+ 3727592
787
+ 3734172
788
+ 3966722
789
+ 4137035
790
+ 4224913
791
+ 4902962
792
+ 4969387
793
+ 5163826
794
+ 5206856
795
+ 5239793
796
+ 5269996
797
+ 5311495
798
+ 5338031
799
+ 5370069
800
+ 5399410
801
+ 5425394
802
+ 5440378
803
+ 5446751
804
+ 5456903
805
+ 5458292
806
+ 5476100
807
+ 5482266
808
+ 5504203
809
+ 5504582
810
+ 5524801
811
+ 5551055
812
+ 5553070
813
+ 5554819
814
+ 5567170
815
+ 5568809
816
+ 5669114
817
+ 5717111
818
+ 5992402
819
+ 6000491
820
+ 6059906
821
+ 6146140
822
+ 6351943
823
+ 6467278
824
+ 6780508
825
+ 8197798
training_assets/gold_train_dataloader.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c53ce2f26aab328f08d2db38d11718bb3579048ced91a9acb6a607b95228eaa2
+ size 3586422
training_assets/request_solr.py ADDED
@@ -0,0 +1,64 @@
+ import httpx
+ import tqdm
+ from pydantic import BaseModel
+
+ '''
+ gold_sample_index = set()
+ with open('gold_sample_index.txt', 'r') as f:
+     for line in f:
+         gold_sample_index.add(line.strip())
+ '''
+
+ class SilverDataset(BaseModel):
+     '''
+     Class that builds the silver dataset used by the Augmented SBERT strategy.
+     ref: https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/data_augmentation/train_sts_indomain_bm25.py
+     '''
+     query_params: dict
+     duplicated: set = set()
+     solr_url: str = "http://localhost:8983/solr/sei_similaridade_augmented_sbert"
+
+     def get_ids_list(self):
+         query = f'{self.solr_url}/select?q=*&fl=id&rows=999999999'
+         r = httpx.get(query).json()['response']['docs']
+         return [doc['id'] for doc in r]
+
+     def get_data(self, id):
+         q_id = f"id:{id}"
+         self.query_params['q'] = q_id
+         r = httpx.post(f'{self.solr_url}/mlt', data=self.query_params).json()
+         maxscore = r['response']['maxScore']
+         response_docs = r['response']['docs']
+         response_docs = self.remove_duplicated(response_docs)
+         return {'query_id': id,
+                 'query_doc': r['match']['docs'][0][self.query_params['mlt.qf']],
+                 'docs': response_docs,
+                 'maxscore': maxscore}
+
+     def remove_duplicated(self, docs):
+         '''
+         Remove documents that also appear in the gold dataset.
+         '''
+         return [doc for doc in docs if doc['id'] not in self.duplicated]
+
+     @staticmethod
+     def create_sentence_pairs(queries):
+         '''
+         Create the sentence pairs for the silver dataset.
+         '''
+         pairs = set()
+         for query in queries:
+             for doc in query['docs']:
+                 pairs.add(
+                     (query['query_doc'],
+                      doc['assunto_text'],
+                      doc['score'] / query['maxscore']))
+         return pairs
+
+     def run(self):
+         queries = []
+         list_ids = self.get_ids_list()
+         for id in tqdm.tqdm(list_ids):
+             queries.append(self.get_data(id))
+         pairs = self.create_sentence_pairs(queries)
+         return pairs
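`create_sentence_pairs` above labels each pair with the document's BM25 score divided by the query's maxScore, so all silver labels fall in [0, 1]. A minimal sketch with a hypothetical MLT-style result (field names follow the class above; the ids, texts, and scores are invented):

```python
# Hypothetical query results, shaped like SilverDataset.get_data() output
queries = [{
    'query_id': '123',
    'query_doc': 'radio license renewal',
    'docs': [
        {'id': 'a', 'assunto_text': 'license renewal request', 'score': 8.0},
        {'id': 'b', 'assunto_text': 'spectrum auction notice', 'score': 2.0},
    ],
    'maxscore': 8.0,
}]

def create_sentence_pairs(queries):
    # Pair the query text with each retrieved document,
    # labeling with score / maxScore (the top hit gets label 1.0)
    pairs = set()
    for query in queries:
        for doc in query['docs']:
            pairs.add((query['query_doc'],
                       doc['assunto_text'],
                       doc['score'] / query['maxscore']))
    return pairs

pairs = create_sentence_pairs(queries)
print(sorted(p[2] for p in pairs))  # [0.25, 1.0]
```

Collecting the pairs into a set also deduplicates identical (query, document, label) triples across queries.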
training_assets/silver_cross_samples.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6a9119d292328000dd27b5f674e0cf86c708d1b9042a9b8911c03a6726c2e50
+ size 239072747
training_assets/silver_data.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68e0f9acd5aea86b2e45ae6eee6840c1df4f70705b21b1b9a535ecae0580e5fc
+ size 303024365
training_assets/train_augmented_bert.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
vocab.txt ADDED
The diff for this file is too large to render. See raw diff