File size: 5,014 Bytes
6ac6d57 ab42b99 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
---
license: cc
---
This is a classifier fine-tuned from CamemBERT that takes a text and a question as input and returns 1 if the text is helpful for answering the question, and 0 otherwise.
The input should be formatted as `tokenized_paragraph + sep_token + tokenized_question`.
class QuestionAnswerDataset(Dataset):
    """Binary-relevance dataset for a paragraph/question CamemBERT classifier.

    Each item is built as ``paragraph_tokens + <sep> + question_tokens``.
    On every access a fair coin decides whether a positive (label 1) or a
    negative (label 0) question is sampled for the paragraph, so the same
    index can yield different examples across epochs (sampling is driven by
    the global NumPy RNG).

    Args:
        dataframe: frame with columns ``'paragraph'`` (str) and
            ``'positive_questions'`` / ``'negative_questions'`` — assumed to
            be non-empty lists of str (empty lists would make
            ``np.random.randint(0)`` raise).
        tokenizer: HF-style callable tokenizer exposing ``sep_token_id``.
        max_length: hard cap on the concatenated sequence length.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        paragraph = row['paragraph']

        # Fair coin: sample a positive or a negative question for this paragraph.
        label = int(np.random.random() > 0.5)
        pool = row['positive_questions'] if label else row['negative_questions']
        question = pool[np.random.randint(len(pool))]

        tokenized_paragraph = self.tokenizer(
            paragraph,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        tokenized_question = self.tokenizer(
            question,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Reserve room for the question plus separator overhead; truncate the
        # paragraph if the pair would exceed max_length.  The budget is clamped
        # at 0: previously, when the question alone filled the window, the
        # negative slice bound silently dropped tokens from the *end* of the
        # paragraph instead of truncating it.
        question_len = tokenized_question['input_ids'].shape[1]
        budget = max(self.max_length - question_len - 2, 0)
        if tokenized_paragraph['input_ids'].shape[1] > budget:
            tokenized_paragraph['input_ids'] = tokenized_paragraph['input_ids'][:, :budget]
            tokenized_paragraph['attention_mask'] = tokenized_paragraph['attention_mask'][:, :budget]

        # Join the two segments with an explicit separator token.
        sep_token = torch.tensor([[self.tokenizer.sep_token_id]]).to(
            tokenized_paragraph['input_ids'].device)
        input_ids = torch.cat(
            (tokenized_paragraph['input_ids'], sep_token, tokenized_question['input_ids']),
            dim=1)
        attention_mask = torch.cat(
            (tokenized_paragraph['attention_mask'], torch.ones_like(sep_token),
             tokenized_question['attention_mask']),
            dim=1)

        # Final hard cap — the question segment alone may occupy the window.
        if input_ids.size(1) > self.max_length:
            input_ids = input_ids[:, :self.max_length]
            attention_mask = attention_mask[:, :self.max_length]

        return input_ids.squeeze(0), attention_mask.squeeze(0), torch.tensor(label)
# Load the CamemBERT-large tokenizer, shared by paragraphs and questions.
tokenizer = AutoTokenizer.from_pretrained("camembert/camembert-large")
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): `df` is assumed to be defined earlier in the file with
# 'paragraph', 'positive_questions' and 'negative_questions' columns — confirm.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Wrap each split in the on-the-fly positive/negative sampling dataset.
train_dataset = QuestionAnswerDataset(train_df, tokenizer)
test_dataset = QuestionAnswerDataset(test_df, tokenizer)
def custom_collate_fn(batch, pad_token_id=None):
    """Pad a batch of variable-length examples into rectangular tensors.

    Args:
        batch: list of ``(input_ids, attention_mask, label)`` triples as
            produced by ``QuestionAnswerDataset.__getitem__`` (1-D tensors
            plus a 0-d label tensor).
        pad_token_id: id used to pad ``input_ids``; defaults to the
            module-level ``tokenizer``'s pad token for backward compatibility
            with existing DataLoader call sites.

    Returns:
        ``(input_ids_padded, attention_masks_padded, labels)`` — two
        ``(batch, max_len)`` tensors and a ``(batch,)`` label tensor.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    # torch.stack avoids the copy-construct warning raised by calling
    # torch.tensor() on a list of existing tensors.
    labels = torch.stack([torch.as_tensor(item[2]) for item in batch])
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    # Padded positions get attention 0 so the model ignores them.
    attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    return input_ids_padded, attention_masks_padded, labels
# DataLoaders — batches are padded dynamically by custom_collate_fn.
# Fix: the eval loader was shuffle=True; shuffling test data has no benefit
# and makes evaluation order non-deterministic, so it is now shuffle=False.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate_fn)

# Smoke-check: print the first two batches to eyeball shapes and padding.
for i, (input_ids, attention_masks, labels) in enumerate(train_dataloader):
    print(f"Batch {i + 1}")
    print("Input IDs:", input_ids)
    print("Input IDs Shape:", input_ids.shape)
    print("Attention Masks:", attention_masks)
    print("Attention Masks Shape:", attention_masks.shape)
    print("Labels:", labels)
    print("Labels Shape:", labels.shape)
    print("-" * 50)
    if i == 1:  # inspect only the first two batches
        break
# Model (for binary classification)
# NOTE(review): num_labels=1 produces a single logit, which requires
# BCEWithLogitsLoss with *float* labels during training — but the dataset
# above yields integer labels via torch.tensor(label).  Confirm the training
# loop casts labels to float, or switch to num_labels=2 + CrossEntropyLoss.
camembertModel = AutoModelForSequenceClassification.from_pretrained("camembert/camembert-large", num_labels=1)
|