|
|
--- |
|
|
license: cc |
|
|
--- |
|
|
|
|
|
This is a classifier fine-tuned from CamemBERT that takes as input a text and a question, and returns 1 if the text is helpful to answer the question and 0 otherwise.
|
|
The input should be formatted as `tokenized_paragraph + sep_token + tokenized_question`.
|
|
|
|
|
class QuestionAnswerDataset(Dataset):
    """Dataset of (paragraph, question) pairs for binary relevance classification.

    Each row of ``dataframe`` must provide:
      - ``'paragraph'``: str
      - ``'positive_questions'``: sequence of questions answerable from the paragraph
      - ``'negative_questions'``: sequence of questions NOT answerable from it

    ``__getitem__`` flips a fair coin to sample either a positive (label 1) or a
    negative (label 0) question, tokenizes paragraph and question separately,
    and returns ``paragraph_ids + [sep] + question_ids`` truncated to
    ``max_length``, plus the attention mask and the label.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        # dataframe: pandas DataFrame with the columns described above.
        # tokenizer: HF-style callable returning {'input_ids', 'attention_mask'}.
        # max_length: hard cap on the concatenated sequence length.
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, label) for row ``idx``.

        Sampling is random per call, so the same ``idx`` can yield different
        (question, label) pairs across epochs — this is intentional negative
        sampling, not a bug.
        """
        row = self.dataframe.iloc[idx]
        paragraph = row['paragraph']

        # Fair coin: positive (relevant) question or negative one.
        label = 1 if np.random.random() > 0.5 else 0
        pool = row['positive_questions'] if label else row['negative_questions']
        question = pool[np.random.randint(len(pool))]

        # Tokenize paragraph and question independently (each capped at
        # max_length on its own; the joint cap is enforced below).
        tokenized_paragraph = self.tokenizer(
            paragraph,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        tokenized_question = self.tokenizer(
            question,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Exactly ONE separator token is inserted between the two encodings,
        # so reserve 1 slot (the original reserved 2 — an off-by-one that
        # wasted a token of paragraph context).  If the pair would exceed
        # max_length, trim the paragraph and keep the question intact.
        # NOTE(review): this trim can drop the paragraph's trailing special
        # token emitted by the tokenizer — confirm that is acceptable.
        para_len = tokenized_paragraph['input_ids'].shape[1]
        quest_len = tokenized_question['input_ids'].shape[1]
        if para_len + quest_len + 1 > self.max_length:
            keep = self.max_length - quest_len - 1
            tokenized_paragraph['input_ids'] = tokenized_paragraph['input_ids'][:, :keep]
            tokenized_paragraph['attention_mask'] = tokenized_paragraph['attention_mask'][:, :keep]

        sep_token = torch.tensor([[self.tokenizer.sep_token_id]]).to(tokenized_paragraph['input_ids'].device)

        # Concatenate: paragraph ids + [SEP] + question ids (batch dim kept
        # until the final squeeze).  The separator is attended to (mask = 1).
        tokenized_input_ids = torch.cat(
            (tokenized_paragraph['input_ids'], sep_token, tokenized_question['input_ids']),
            dim=1)
        tokenized_attention_mask = torch.cat(
            (tokenized_paragraph['attention_mask'], torch.ones_like(sep_token), tokenized_question['attention_mask']),
            dim=1)

        # Safety net: the question alone may exceed the budget; never return
        # more than max_length tokens.
        if tokenized_input_ids.size(1) > self.max_length:
            tokenized_input_ids = tokenized_input_ids[:, :self.max_length]
            tokenized_attention_mask = tokenized_attention_mask[:, :self.max_length]

        return tokenized_input_ids.squeeze(0), tokenized_attention_mask.squeeze(0), torch.tensor(label)
|
|
|
|
|
|
|
|
# Tokenizer
# CamemBERT-large tokenizer; its sep/pad token ids are reused by the dataset
# above and the collate function below.
tokenizer = AutoTokenizer.from_pretrained("camembert/camembert-large")

#split test_train test_size = 0.2
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): `df` must be defined earlier in the file (not visible here)
# with columns 'paragraph', 'positive_questions', 'negative_questions'.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Dataset
train_dataset = QuestionAnswerDataset(train_df, tokenizer)
test_dataset = QuestionAnswerDataset(test_df, tokenizer)
|
|
|
def custom_collate_fn(batch):
    """Pad a list of variable-length samples into uniform batch tensors.

    Args:
        batch: list of ``(input_ids, attention_mask, label)`` tuples as
            produced by ``QuestionAnswerDataset.__getitem__`` (1-D id/mask
            tensors of varying length, 0-D label tensor).

    Returns:
        ``(input_ids, attention_masks, labels)`` — ids are right-padded with
        the tokenizer's pad id, masks with 0, labels stacked into a 1-D tensor.
    """
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    # torch.stack, not torch.tensor(list-of-tensors): building a tensor from a
    # list of 0-d tensors is slow and raises a UserWarning on recent torch.
    labels = torch.stack([torch.as_tensor(item[2]) for item in batch])

    # Relies on the module-level `tokenizer` defined above for the pad id.
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return input_ids_padded, attention_masks_padded, labels
|
|
|
|
|
|
|
|
# DataLoader
# batch_size=64; padding to the longest sequence in each batch is done by
# custom_collate_fn rather than by the tokenizer.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
# NOTE(review): shuffle=True on the test loader is unusual for evaluation —
# confirm it is intentional (harmless for the inspection loop below).
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
|
|
|
|
|
# Assuming 'myDataloader' is your DataLoader
# Smoke test: print the contents and shapes of the first two batches to
# verify the dataset + collate pipeline before training.
for i, (input_ids, attention_masks, labels) in enumerate(train_dataloader):
    print(f"Batch {i+1}")
    print("Input IDs:", input_ids)
    print("Input IDs Shape:", input_ids.shape)
    print("Attention Masks:", attention_masks)
    print("Attention Masks Shape:", attention_masks.shape)
    print("Labels:", labels)
    print("Labels Shape:", labels.shape)
    print("-" * 50)

    # Optionally, stop after the first few batches
    if i == 1:  # Change this number to control how many batches to print
        break
|
|
|
|
|
|
|
|
# Model (for binary classification)
# num_labels=1 -> a single-logit classification head; with the 0/1 labels
# produced above this implies a BCE-with-logits objective downstream —
# TODO(review): confirm the training loop uses the matching loss.
camembertModel = AutoModelForSequenceClassification.from_pretrained("camembert/camembert-large", num_labels=1)
|
|
|
|
|
|