# marblex / app.py
# Author: Oranblock — "Update app.py" (commit d6bb7e9, verified)
import os
import json
import logging
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import HfFolder
# Configure root logging once at import time; all module functions use logging.* directly.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Redirect the transformers download cache to a writable location.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers releases
# in favour of HF_HOME — confirm the installed version still honours it.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
def compute_metrics(pred):
    """Compute evaluation metrics from a Trainer prediction output.

    Args:
        pred: An ``EvalPrediction``-like object exposing ``label_ids``
            (true labels) and ``predictions`` (per-class scores; the
            predicted class is taken as the argmax over the last axis).

    Returns:
        dict: ``accuracy`` plus weighted-average ``f1``, ``precision``
        and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 silences the UndefinedMetricWarning for classes that
    # never appear in preds; the returned values are unchanged (the default
    # 'warn' behaviour also substitutes 0).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
def _load_config(path='config.json'):
    """Load the JSON training configuration from *path*."""
    # Explicit encoding so behaviour does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _load_marblex_dataset(config):
    """Load the MarbleX dataset and infer the label count; returns (dataset, num_labels)."""
    logging.info("Loading the MarbleX dataset")
    dataset = load_dataset("Oranblock/marblex_dataset", "config1")  # Replace "config1" with the appropriate config name
    logging.info(f"Dataset columns: {dataset['train'].column_names}")
    # The number of classes is derived from the distinct values of the
    # configured target column in the train split.
    unique_labels = dataset['train'].unique(config['target_column'])
    num_labels = len(unique_labels)
    logging.info(f"Number of unique labels: {num_labels}")
    logging.info(f"Dataset loaded. Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    return dataset, num_labels


def _make_tokenize_fn(tokenizer, text_columns):
    """Build the batched map() function that joins the feature columns and tokenizes them."""
    def tokenize_function(examples):
        # Stack the configured columns row-wise and hand the resulting
        # list-of-lists to the tokenizer.
        # NOTE(review): this assumes the tokenizer accepts each row's column
        # values as text (it will treat 2-element rows as sentence pairs) —
        # verify against the actual dataset schema.
        features = np.stack([examples[col] for col in text_columns], axis=1)
        return tokenizer(features.tolist(), padding="max_length", truncation=True)
    return tokenize_function


def _build_trainer(model, tokenizer, tokenized_datasets, config):
    """Construct the Trainer with per-epoch eval/save and best-model reload."""
    logging.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config['num_train_epochs'],
        per_device_train_batch_size=config['per_device_train_batch_size'],
        per_device_eval_batch_size=config['per_device_eval_batch_size'],
        warmup_ratio=config['warmup_ratio'],
        weight_decay=config['weight_decay'],
        learning_rate=config['learning_rate'],
        fp16=config['fp16'],
        # Eval/save each epoch so load_best_model_at_end can pick the best
        # checkpoint (by eval loss, the TrainingArguments default metric).
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=config['push_to_hub'],
        hub_model_id=config['hub_model_id'],
        logging_dir='./logs',
        logging_steps=100,
    )
    logging.info("Initializing Trainer")
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],  # Use 'test' split for evaluation
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )


def setup_training():
    """End-to-end fine-tuning driver.

    Reads hyperparameters from ./config.json, loads the MarbleX dataset,
    fine-tunes a sequence-classification model, evaluates it on the test
    split and (optionally) pushes the result to the Hugging Face Hub.

    Raises:
        FileNotFoundError / json.JSONDecodeError: if config.json is
            missing or malformed.
        KeyError: if a required configuration key is absent.
    """
    logging.info("Starting the training setup process")
    config = _load_config()
    logging.info(f"Loaded configuration: {config}")

    dataset, num_labels = _load_marblex_dataset(config)

    logging.info(f"Loading tokenizer and model: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    model = AutoModelForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=num_labels,
    )

    logging.info("Tokenizing the dataset")
    # NOTE(review): the Trainer expects the target column to be named
    # 'labels' (or 'label'); if config['target_column'] differs, a
    # rename_column step would be needed here — confirm against the dataset.
    tokenized_datasets = dataset.map(
        _make_tokenize_fn(tokenizer, config['text_columns']), batched=True
    )
    logging.info("Dataset tokenization completed")

    trainer = _build_trainer(model, tokenizer, tokenized_datasets, config)

    logging.info("Starting the training process")
    trainer.train()

    logging.info("Evaluating the model")
    eval_results = trainer.evaluate()
    logging.info(f"Evaluation results: {eval_results}")

    if config['push_to_hub']:
        logging.info("Pushing model to Hugging Face Hub")
        trainer.push_to_hub()
        logging.info(f"Model pushed to {config['hub_model_id']}")

    logging.info("Training process completed")
if __name__ == "__main__":
    # Persist the Hub token (when supplied via the environment) before
    # training starts, so that any push-to-hub steps can authenticate.
    token = os.environ.get('HF_TOKEN')
    if not token:
        logging.warning("HF_TOKEN not found in environment variables")
    else:
        HfFolder.save_token(token)
        logging.info("Hugging Face token set")
    setup_training()