In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install PyArabic



In [3]:
!cp /content/drive/MyDrive/preprocess.py /content/preprocess.py

In [4]:
from transformers import ElectraForQuestionAnswering, ElectraForSequenceClassification, AutoTokenizer, pipeline
from preprocess import ArabertPreprocessor

# Define ArabertPreprocessor if not already defined
prep_object = ArabertPreprocessor("araelectra-base-discriminator")

# Preprocess the question and context
question = ('ماذا اكلت اليوم  ؟')
context = ('''
اليوم اكلت تفاحة''')

# a) Get predictions
qa_modelname = 'ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA'
qa_pipe = pipeline('question-answering', model=qa_modelname, tokenizer=qa_modelname)

QA_input = {
    'question': question,
    'context': context
}
qa_res = qa_pipe(QA_input)
threshold = 0.5 #hyperparameter can be tweaked
## note classification results label0 probability it can be answered label1 probability can't be answered
## if label1 probability > threshold then consider the output of qa_res is empty string else take the qa_res
# b) Load model & tokenizer
qa_model = ElectraForQuestionAnswering.from_pretrained(qa_modelname)
tokenizer = AutoTokenizer.from_pretrained(qa_modelname)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


In [5]:
# Example of evaluating classification results
print("الجواب:")
print(qa_res['answer'])


الجواب:
تفاحة


In [6]:
!pip install tnkeeh



In [7]:
import tnkeeh as tn

tn.clean_data(file_path = '/content/drive/MyDrive/arabic222_qna_222dataset_3.txt', save_path = '/content/cleaned_data.txt',)

Saving to /content/cleaned_data.txt


In [8]:
with open("/content/cleaned_data.txt", "r", encoding="utf-8") as file:
    data = file.readlines()


In [9]:
contexts = []
questions = []
answers = []

for i in range(0, len(data), 6):
    if len(data) > i+5:
        context = data[i+1].strip()
        question = data[i+3].strip()
        answer = data[i+5].strip()

        if context and question and answer and context != "سؤال :" and question != 'جواب :' and answer != 'السياق :':
            contexts.append(context)
            questions.append(question)
            answers.append(answer)


In [10]:
!pip install accelerate -U



In [11]:
from transformers import ElectraForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch

# Define model and tokenizer
model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ElectraForQuestionAnswering.from_pretrained(model_name)

class QADataset(Dataset):
    def __init__(self, contexts, questions, answers, tokenizer, max_length):
        self.encodings = tokenizer(contexts, questions, truncation=True, padding=True, max_length=max_length)
        self.answers = []
        for i, (context, answer) in enumerate(zip(contexts, answers)):
            answer_start_idx = context.find(answer)
            if answer_start_idx == -1:  # Answer not found in context
                answer_token_start_idx = answer_token_end_idx = -1
            else:
                answer_end_idx = answer_start_idx + len(answer)
                answer_token_start_idx = self.encodings.char_to_token(i, answer_start_idx)
                answer_token_end_idx = self.encodings.char_to_token(i, answer_end_idx)
            self.answers.append((answer_token_start_idx, answer_token_end_idx))

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        start_pos, end_pos = self.answers[idx]
        if start_pos == -1 or end_pos == -1:  # Answer not found in context
            item["start_positions"] = torch.tensor(-100)  # Assign a tensor with special value for 'ignored' spans
            item["end_positions"] = torch.tensor(-100)  # Assign a tensor with special value for 'ignored' spans
        else:
            item["start_positions"] = torch.tensor(start_pos)
            item["end_positions"] = torch.tensor(end_pos) if end_pos is not None else torch.tensor(start_pos)  # Use start position if end position is None
        return item



train_dataset = QADataset(contexts, questions, answers, tokenizer, max_length=512)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",  # Specify output directory
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


Step,Training Loss
500,1.654
1000,1.1397


TrainOutput(global_step=1191, training_loss=1.330872261053769, metrics={'train_runtime': 286.4473, 'train_samples_per_second': 33.21, 'train_steps_per_second': 4.158, 'total_flos': 2272099824055152.0, 'train_loss': 1.330872261053769, 'epoch': 3.0})

In [12]:
# Save the fine-tuned model
trainer.save_model("/content/model")

In [None]:
from transformers import ElectraForQuestionAnswering, ElectraForSequenceClassification, AutoTokenizer, pipeline
from preprocess import ArabertPreprocessor

# Define ArabertPreprocessor if not already defined
prep_object = ArabertPreprocessor("araelectra-base-discriminator")

# Preprocess the question and context
question = ('ماذا اكلت اليوم  ؟')
context = ('''
اليوم اكلت تفاحة''')

# a) Get predictions
qa_modelname = 'ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA'
qa_pipe = pipeline('question-answering', model=qa_modelname, tokenizer=qa_modelname)

QA_input = {
    'question': question,
    'context': context
}
qa_res = qa_pipe(QA_input)
threshold = 0.5 #hyperparameter can be tweaked
## note classification results label0 probability it can be answered label1 probability can't be answered
## if label1 probability > threshold then consider the output of qa_res is empty string else take the qa_res
# b) Load model & tokenizer
qa_model = ElectraForQuestionAnswering.from_pretrained(qa_modelname)
tokenizer = AutoTokenizer.from_pretrained(qa_modelname)