|
|
|
|
|
""" |
|
|
Prothom Alo Language Model Trainer |
|
|
Fine-tunes a small language model on Prothom Alo news articles |
|
|
Converts to Safetensors format for distribution |
|
|
""" |
|
|
|
|
|
import os |
|
|
import torch |
|
|
import json |
|
|
import logging |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional, Tuple |
|
|
|
|
|
|
|
|
from datasets import load_from_disk |
|
|
from transformers import ( |
|
|
AutoTokenizer, |
|
|
AutoModelForCausalLM, |
|
|
Trainer, |
|
|
TrainingArguments, |
|
|
DataCollatorForLanguageModeling, |
|
|
DataCollator |
|
|
) |
|
|
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING |
|
|
from torch.utils.data import Dataset |
|
|
import safetensors.torch |
|
|
from safetensors import safe_open |
|
|
|
|
|
|
|
|
# Configure process-wide logging once at import time: INFO level with a
# "timestamp - level - message" layout, then create this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
|
|
|
class ProthomAloDataset(Dataset):
    """Map-style dataset that tokenizes Prothom Alo articles on access.

    Each record is rendered as a ``Title: ...`` line, a blank line and a
    ``Content: ...`` section, then tokenized to exactly ``max_length`` tokens
    (truncated or padded).

    Args:
        dataset: Indexable collection of records exposing the ``title`` and
            ``content_clean`` keys.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch axis.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, dataset, tokenizer, max_length: int = 512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize article ``idx`` and return model-ready tensors.

        Returns:
            A dict with 1-D ``input_ids``, ``attention_mask`` and ``labels``
            tensors. ``labels`` is an independent copy of ``input_ids`` with
            padded positions set to ``-100``.
        """
        item = self.dataset[idx]
        text = f"Title: {item['title']}\n\nContent: {item['content_clean']}"

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Labels get their own storage so in-place edits never touch
        # input_ids, and padded positions use -100 (the ignore index of the
        # cross-entropy loss) so padding does not contribute to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
|
|
|
|
|
class ProthomAloModelTrainer:
    """Fine-tune a causal language model on Prothom Alo articles and package it.

    Expected call order: ``load_dataset`` -> ``setup_model`` -> ``train`` ->
    ``convert_to_safetensors`` -> ``create_model_card``.
    """

    def __init__(self, model_name: str = "distilgpt2", max_length: int = 512):
        """Store the configuration; tokenizer and model are loaded later.

        Args:
            model_name: Model identifier or local path passed to
                ``from_pretrained``.
            max_length: Token length every example is truncated/padded to.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = None  # populated by load_dataset()
        self.model = None  # populated by setup_model()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    def load_dataset(
        self, dataset_path: str
    ) -> Tuple["ProthomAloDataset", "ProthomAloDataset", object]:
        """Load the on-disk dataset and the tokenizer, and wrap the splits.

        Args:
            dataset_path: Directory previously written with ``save_to_disk``;
                it must contain ``train``, ``validation`` and ``test`` splits.

        Returns:
            ``(train_dataset, val_dataset, raw_dataset)`` where the first two
            are tokenizing wrappers and the last is the object returned by
            ``load_from_disk``.
        """
        logger.info(f"Loading dataset from: {dataset_path}")

        dataset = load_from_disk(dataset_path)
        logger.info(f"Dataset loaded: {len(dataset['train'])} train, {len(dataset['validation'])} validation, {len(dataset['test'])} test")

        logger.info(f"Loading tokenizer: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Fixed-length padding needs a pad token; fall back to EOS when the
        # tokenizer does not define one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        train_dataset = ProthomAloDataset(dataset['train'], self.tokenizer, self.max_length)
        val_dataset = ProthomAloDataset(dataset['validation'], self.tokenizer, self.max_length)

        return train_dataset, val_dataset, dataset

    def setup_model(self) -> None:
        """Load the base model and prepare it for training.

        Raises:
            RuntimeError: If the tokenizer has not been loaded yet (the
                embedding matrix is resized to the tokenizer's vocabulary).
        """
        # Fail before the (potentially slow) model download rather than with
        # a TypeError from len(None) afterwards.
        if self.tokenizer is None:
            raise RuntimeError(
                "Tokenizer not loaded; call load_dataset() before setup_model()"
            )

        logger.info(f"Loading model: {self.model_name}")

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
        )

        # Keep the embedding matrix in sync with the tokenizer vocabulary.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Trade compute for memory when the model supports it.
        if hasattr(self.model, 'gradient_checkpointing_enable'):
            self.model.gradient_checkpointing_enable()

        logger.info(f"Model loaded with {self.model.num_parameters():,} parameters")

    def train(self, train_dataset, val_dataset, output_dir: str = "prothomalo_model",
              epochs: int = 3, batch_size: int = 2, learning_rate: float = 5e-5) -> str:
        """Fine-tune the model and save the final weights and tokenizer.

        Args:
            train_dataset: Dataset used for optimisation.
            val_dataset: Dataset evaluated at the end of every epoch.
            output_dir: Directory for checkpoints and the final model.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Initial learning rate.

        Returns:
            Path of the saved final model (``<output_dir>/final_model``).

        Raises:
            RuntimeError: If the model or tokenizer has not been set up.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "Model/tokenizer not initialised; call load_dataset() and "
                "setup_model() before train()"
            )

        logger.info(f"Starting training for {epochs} epochs")
        logger.info(f"Training config: batch_size={batch_size}, learning_rate={learning_rate}")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=0.01,
            logging_steps=1,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            save_total_limit=2,
            report_to="none",
            dataloader_num_workers=0,
            warmup_steps=10,
            max_grad_norm=1.0,
            fp16=False,
        )

        # mlm=False selects the causal (next-token) language-modeling objective.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        trainer.train()

        model_path = f"{output_dir}/final_model"
        trainer.save_model(model_path)
        self.tokenizer.save_pretrained(model_path)

        logger.info(f"Model training completed! Saved to: {model_path}")
        return model_path

    def convert_to_safetensors(self, model_path: str, output_path: str = "prothomalo_model.safetensors") -> str:
        """Export the fine-tuned weights as a single Safetensors file.

        Args:
            model_path: Directory containing the saved fine-tuned model.
            output_path: Destination ``.safetensors`` file.

        Returns:
            ``output_path``.
        """
        logger.info(f"Converting model to Safetensors format: {output_path}")

        model = AutoModelForCausalLM.from_pretrained(model_path)
        state_dict = model.state_dict()

        # Give the LM head its own storage before saving -- presumably because
        # it shares memory with the input embeddings (weight tying), which
        # save_file refuses to serialise.
        for key in list(state_dict.keys()):
            if 'lm_head.weight' in key:
                state_dict[key] = state_dict[key].clone()

        safetensors.torch.save_file(state_dict, output_path, metadata={"format": "pt"})

        logger.info(f"Model converted to Safetensors: {output_path}")
        logger.info(f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

        # Sanity-check the file on CPU so verification does not depend on a
        # CUDA device being present.
        with safe_open(output_path, framework="pt", device="cpu") as f:
            keys = list(f.keys())
            logger.info(f"Safetensors contains {len(keys)} tensors")

        return output_path

    def _render_model_card(self, split_sizes: Dict[str, int], training_config: Dict) -> str:
        """Return the Markdown text of the model card.

        Args:
            split_sizes: Article counts keyed by ``train``, ``validation``
                and ``test``.
            training_config: JSON-serialisable training configuration.
        """
        n_train = split_sizes['train']
        n_val = split_sizes['validation']
        n_test = split_sizes['test']

        # Literal braces inside the embedded code samples are doubled so the
        # f-string emits them verbatim instead of evaluating them here.
        return f"""# Prothom Alo Fine-tuned Language Model

## Model Details

- **Model Type**: Causal Language Model
- **Base Model**: {self.model_name}
- **Fine-tuned on**: Prothom Alo News Articles
- **Languages**: English and Bengali
- **Training Date**: {datetime.now().strftime('%Y-%m-%d')}

## Model Description

This model is fine-tuned on a curated dataset of Prothom Alo news articles, both English and Bengali content. The model has been trained to understand the writing style, topics, and language patterns of this major Bangladeshi news publication.

## Training Data

- **Source**: Prothom Alo news website (prothomalo.com, en.prothomalo.com)
- **Total Articles**: {n_train + n_val + n_test}
- **Languages**: English and Bengali
- **Categories**: News, Opinion, Politics, Business
- **Dataset Splits**:
  - Training: {n_train} articles
  - Validation: {n_val} articles
  - Test: {n_test} articles

## Training Configuration

```json
{json.dumps(training_config, indent=2)}
```

## Intended Uses & Limitations

### Intended Uses
- Text generation in the style of Prothom Alo news articles
- Content generation for Bangladeshi news context
- Research and educational purposes
- Language model fine-tuning examples

### Limitations
- This model is trained on a limited dataset
- May not generalize well to all news content
- Should be used with caution for factual content
- Requires human oversight for publication-quality content

## Usage

### Basic Text Generation

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model
tokenizer = AutoTokenizer.from_pretrained("./prothomalo_model")
model = AutoModelForCausalLM.from_pretrained("./prothomalo_model")

# Generate text
prompt = "The latest news from Bangladesh shows"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=200, num_return_sequences=1)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```

### Using Safetensors Format

```python
from safetensors import safe_open
import torch

# Load model weights
with safe_open("prothomalo_model.safetensors", framework="pt", device="cpu") as f:
    print(f"Available tensors: {{len(f.keys())}}")
    for key in list(f.keys())[:5]:  # Show first 5 keys
        tensor = f.get_tensor(key)
        print(f"{{key}}: {{tensor.shape}}")
```

## Ethical Considerations

- This model is trained on publicly available news content
- Should be used responsibly and ethically
- Not intended for misinformation or harmful content generation
- Please respect copyright and attribution when using generated content

## Citation

```bibtex
@model{{prothom-alo-finetuned-2024,
  title={{Prothom Alo Fine-tuned Language Model}},
  author={{MiniMax Agent}},
  year={{2024}},
  url={{https://huggingface.co/minimax/prothom-alo-model}}
}}
```

## License

This model is released for research and educational purposes. Please ensure compliance with Prothom Alo's terms of service and copyright policies when using this model.
"""

    def create_model_card(self, model_path: str, dataset_path: str, training_config: Dict) -> str:
        """Write ``MODEL_CARD.md`` into ``model_path`` and return its path.

        Args:
            model_path: Existing directory of the saved model.
            dataset_path: On-disk dataset used to report split sizes.
            training_config: JSON-serialisable training configuration that is
                embedded in the card.
        """
        dataset = load_from_disk(dataset_path)
        split_sizes = {
            split: len(dataset[split]) for split in ('train', 'validation', 'test')
        }

        model_card = self._render_model_card(split_sizes, training_config)

        model_card_path = f"{model_path}/MODEL_CARD.md"
        with open(model_card_path, 'w', encoding='utf-8') as f:
            f.write(model_card)

        logger.info(f"Model card created: {model_card_path}")
        return model_card_path
|
|
|
|
|
def _render_inference_script(model_path: str, safetensors_path: str) -> str:
    """Return the source code of a standalone inference script.

    The generated script loads the model from ``model_path``, inspects the
    Safetensors file at ``safetensors_path`` and generates a sample text.
    """
    # Braces that belong to the generated script's own f-strings are doubled
    # so they are emitted verbatim; only the two paths are interpolated.
    return f"""#!/usr/bin/env python3
\"\"\"
Prothom Alo Model Inference Script
\"\"\"

from transformers import AutoTokenizer, AutoModelForCausalLM
from safetensors import safe_open
import torch
import json

def load_model_tokenizer(model_path: str):
    \"\"\"Load model and tokenizer\"\"\"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return tokenizer, model

def load_safetensors_model(safetensors_path: str):
    \"\"\"Load model from Safetensors format\"\"\"
    with safe_open(safetensors_path, framework="pt", device="cpu") as f:
        print(f"Available tensors: {{len(f.keys())}}")
        for key in list(f.keys())[:5]:  # Show first 5 keys
            tensor = f.get_tensor(key)
            print(f"{{key}}: {{tensor.shape}}")

def generate_text(tokenizer, model, prompt: str, max_length: int = 200):
    \"\"\"Generate text from prompt\"\"\"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

if __name__ == "__main__":
    # Example usage
    model_path = {model_path!r}
    safetensors_path = {safetensors_path!r}

    # Load model
    print("Loading model...")
    tokenizer, model = load_model_tokenizer(model_path)

    # Test Safetensors
    print("Testing Safetensors format...")
    load_safetensors_model(safetensors_path)

    # Generate text
    prompt = "The latest news from Bangladesh indicates"
    print(f"\\nGenerating text for: {{prompt}}")

    generated = generate_text(tokenizer, model, prompt, max_length=150)
    print(f"Generated: {{generated}}")
"""


def main():
    """Run the full pipeline: load data, train, export, and document.

    Returns:
        A dict with the paths of the saved model (``model_path``), the
        Safetensors export (``safetensors_path``), the model card
        (``model_card``), the generated inference script
        (``inference_script``) and the ``config`` that was used.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    logger.info("Prothom Alo Model Trainer")
    logger.info("=" * 50)

    config = {
        "model_name": "distilgpt2",
        "max_length": 512,
        "epochs": 3,
        "batch_size": 2,
        "learning_rate": 5e-5,
        "dataset_path": "./enhanced_prothomalo",
        "output_dir": "./prothomalo_model",
        "safetensors_output": "./prothomalo_model.safetensors",
    }

    try:
        trainer = ProthomAloModelTrainer(
            model_name=config["model_name"],
            max_length=config["max_length"],
        )

        # The raw dataset object is not needed here; only the wrapped splits.
        train_dataset, val_dataset, _ = trainer.load_dataset(config["dataset_path"])

        trainer.setup_model()

        model_path = trainer.train(
            train_dataset,
            val_dataset,
            output_dir=config["output_dir"],
            epochs=config["epochs"],
            batch_size=config["batch_size"],
            learning_rate=config["learning_rate"],
        )

        safetensors_path = trainer.convert_to_safetensors(model_path, config["safetensors_output"])

        model_card_path = trainer.create_model_card(model_path, config["dataset_path"], config)

        # Point the generated script at the artifacts this run actually
        # produced (the model lives under <output_dir>/final_model).
        inference_path = f"{config['output_dir']}/inference.py"
        with open(inference_path, 'w', encoding='utf-8') as f:
            f.write(_render_inference_script(model_path, safetensors_path))

        logger.info("\nModel training and conversion completed!")
        logger.info(f"Model directory: {model_path}")
        logger.info(f"Safetensors file: {safetensors_path}")
        logger.info(f"Model card: {model_card_path}")
        logger.info(f"Inference script: {inference_path}")

        if os.path.exists(safetensors_path):
            size_mb = os.path.getsize(safetensors_path) / 1024 / 1024
            logger.info(f"Model size: {size_mb:.2f} MB")

        return {
            "model_path": model_path,
            "safetensors_path": safetensors_path,
            "model_card": model_card_path,
            "inference_script": inference_path,
            "config": config,
        }

    except Exception as e:
        # Top-level boundary: record the failure, then propagate it.
        logger.error(f"❌ Training failed: {e}")
        raise
|
|
|
|
|
if __name__ == "__main__":
    # main() re-raises on failure, so reaching the prints means the whole
    # pipeline finished successfully.
    result = main()
    print("\n✅ Training pipeline completed successfully!")
    print("Model ready for use and distribution!")