#!/usr/bin/env python3
"""
Prothom Alo Language Model Trainer

Fine-tunes a small language model on Prothom Alo news articles
Converts to Safetensors format for distribution
"""

import os
import torch
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# ML libraries
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    DataCollator
)
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
from torch.utils.data import Dataset
import safetensors.torch
from safetensors import safe_open

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ProthomAloDataset(Dataset):
    """Torch dataset that tokenizes Prothom Alo articles on access.

    Every item is built from the ``title`` and ``content_clean`` fields of
    the wrapped dataset row, then truncated/padded to ``max_length`` tokens.
    """

    def __init__(self, dataset, tokenizer, max_length: int = 512):
        """Store the row source, the tokenizer and the fixed sequence length.

        Args:
            dataset: Indexable collection of rows exposing ``title`` and
                ``content_clean`` keys.
            tokenizer: Callable tokenizer invoked once per item.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return model inputs plus labels.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        item = self.dataset[idx]

        # Combine title and content for training
        text = f"Title: {item['title']}\n\nContent: {item['content_clean']}"

        # Tokenize with proper truncation
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        # Drop only the leading axis (assumed to be a size-1 batch axis for a
        # single text). A bare squeeze() would also collapse the sequence axis
        # whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            # Independent copy so labels never alias the input storage.
            "labels": input_ids.clone()
        }


class ProthomAloModelTrainer:
    """Trainer class for Prothom Alo model fine-tuning"""

    def __init__(self, model_name: str = "distilgpt2", max_length: int = 512):
        """Record the configuration; tokenizer and model are loaded later.

        Args:
            model_name: Identifier passed to the ``from_pretrained`` loaders.
            max_length: Token length used when building the datasets.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        logger.info(f"Using device: {self.device}")

    def _load_tokenizer(self) -> None:
        """Load the tokenizer for ``self.model_name`` and ensure a pad token."""
        logger.info(f"Loading tokenizer: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Set pad token for language modeling
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def load_dataset(self, dataset_path: str) -> Tuple[Any, Any, Any]:
        """Load and prepare the Prothom Alo dataset.

        Args:
            dataset_path: Directory previously written by ``save_to_disk``;
                it must provide ``train``, ``validation`` and ``test`` splits.

        Returns:
            ``(train_dataset, val_dataset, raw_dataset)`` where the first two
            are ``ProthomAloDataset`` wrappers and the last is the object
            returned by ``load_from_disk``.
        """
        logger.info(f"Loading dataset from: {dataset_path}")

        dataset = load_from_disk(dataset_path)
        logger.info(f"Dataset loaded: {len(dataset['train'])} train, {len(dataset['validation'])} validation, {len(dataset['test'])} test")

        # Load tokenizer
        self._load_tokenizer()

        # Create custom datasets
        train_dataset = ProthomAloDataset(dataset['train'], self.tokenizer, self.max_length)
        val_dataset = ProthomAloDataset(dataset['validation'], self.tokenizer, self.max_length)

        return train_dataset, val_dataset, dataset

    def setup_model(self) -> None:
        """Load the causal LM and prepare it for training.

        The tokenizer is loaded on demand, so this method no longer fails
        with ``TypeError`` when it is called before ``load_dataset``.
        """
        logger.info(f"Loading model: {self.model_name}")

        if self.tokenizer is None:
            self._load_tokenizer()

        # Use a small efficient model
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )

        # Resize embeddings to account for new tokens if needed
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Enable gradient checkpointing for memory efficiency
        if hasattr(self.model, 'gradient_checkpointing_enable'):
            self.model.gradient_checkpointing_enable()

        logger.info(f"Model loaded with {self.model.num_parameters():,} parameters")

    def train(self, train_dataset, val_dataset, output_dir: str = "prothomalo_model",
              epochs: int = 3, batch_size: int = 2, learning_rate: float = 5e-5) -> str:
        """Train the model on Prothom Alo dataset.

        Args:
            train_dataset: Dataset used for optimisation.
            val_dataset: Dataset evaluated at the end of every epoch.
            output_dir: Directory for checkpoints and the final model.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Learning rate passed to ``TrainingArguments``.

        Returns:
            Path of the directory holding the final model and tokenizer.

        Raises:
            RuntimeError: If the model or tokenizer has not been loaded yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "Model and tokenizer must be loaded before training; "
                "call load_dataset() and setup_model() first."
            )

        logger.info(f"Starting training for {epochs} epochs")
        logger.info(f"Training config: batch_size={batch_size}, learning_rate={learning_rate}")

        # Training arguments optimized for small datasets
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=0.01,
            logging_steps=1,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            save_total_limit=2,
            report_to="none",  # Disable wandb/tensorboard
            dataloader_num_workers=0,  # Avoid multiprocessing issues
            warmup_steps=10,
            max_grad_norm=1.0,
            fp16=False,  # Avoid precision issues with small models
        )

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,  # We're doing causal LM, not masked LM
        )

        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )

        # Train the model
        logger.info("Starting training...")
        trainer.train()

        # Save the final model
        model_path = f"{output_dir}/final_model"
        trainer.save_model(model_path)
        self.tokenizer.save_pretrained(model_path)

        logger.info(f"Model training completed! Saved to: {model_path}")
        return model_path

    def convert_to_safetensors(self, model_path: str, output_path: str = "prothomalo_model.safetensors") -> str:
        """Convert the fine-tuned model to Safetensors format.

        Args:
            model_path: Directory containing the saved fine-tuned model.
            output_path: Destination ``.safetensors`` file.

        Returns:
            ``output_path``, after the file has been written and re-opened.
        """
        logger.info(f"Converting model to Safetensors format: {output_path}")

        # Load the model
        model = AutoModelForCausalLM.from_pretrained(model_path)

        # Get model state dict
        state_dict = model.state_dict()

        # Fix shared tensors issue by making a deep copy
        # In transformer models, lm_head.weight and transformer.wte.weight often share memory
        # We need to handle this properly for Safetensors
        for key in list(state_dict.keys()):
            if 'lm_head.weight' in key:
                # Make a copy to avoid shared memory issues
                state_dict[key] = state_dict[key].clone()

        # Save as Safetensors
        safetensors.torch.save_file(state_dict, output_path, metadata={"format": "pt"})

        logger.info(f"Model converted to Safetensors: {output_path}")
        logger.info(f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

        # Test loading. Open on the CPU: device=0 targets the first CUDA
        # device and therefore fails on machines without a GPU.
        with safe_open(output_path, framework="pt", device="cpu") as f:
            keys = list(f.keys())
            logger.info(f"Safetensors contains {len(keys)} tensors")

        return output_path

    def create_model_card(self, model_path: str, dataset_path: str, training_config: Dict) -> str:
        """Create a comprehensive model card.

        Args:
            model_path: Existing directory that receives ``MODEL_CARD.md``;
                it is also the path shown in the card's usage example.
            dataset_path: Dataset directory, re-loaded to report split sizes.
            training_config: JSON-serialisable training configuration that is
                embedded verbatim in the card.

        Returns:
            Path of the written model card file.
        """
        # Load dataset info
        dataset = load_from_disk(dataset_path)
        train_count = len(dataset['train'])
        val_count = len(dataset['validation'])
        test_count = len(dataset['test'])
        training_date = datetime.now().strftime('%Y-%m-%d')
        config_json = json.dumps(training_config, indent=2)

        # NOTE: this template is an f-string. Braces that belong to the
        # embedded sample code must be doubled, otherwise Python evaluates
        # them here (names such as `f`, `key` and `tensor` do not exist yet).
        model_card = f"""# Prothom Alo Fine-tuned Language Model

## Model Details

- **Model Type**: Causal Language Model
- **Base Model**: {self.model_name}
- **Fine-tuned on**: Prothom Alo News Articles
- **Languages**: English and Bengali
- **Training Date**: {training_date}

## Model Description

This model is fine-tuned on a curated dataset of Prothom Alo news articles, both English and Bengali content. The model has been trained to understand the writing style, topics, and language patterns of this major Bangladeshi news publication.

## Training Data

- **Source**: Prothom Alo news website (prothomalo.com, en.prothomalo.com)
- **Total Articles**: {train_count + val_count + test_count}
- **Languages**: English and Bengali
- **Categories**: News, Opinion, Politics, Business
- **Dataset Splits**:
  - Training: {train_count} articles
  - Validation: {val_count} articles
  - Test: {test_count} articles

## Training Configuration

```json
{config_json}
```

## Intended Uses & Limitations

### Intended Uses
- Text generation in the style of Prothom Alo news articles
- Content generation for Bangladeshi news context
- Research and educational purposes
- Language model fine-tuning examples

### Limitations
- This model is trained on a limited dataset
- May not generalize well to all news content
- Should be used with caution for factual content
- Requires human oversight for publication-quality content

## Usage

### Basic Text Generation

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model
tokenizer = AutoTokenizer.from_pretrained("{model_path}")
model = AutoModelForCausalLM.from_pretrained("{model_path}")

# Generate text
prompt = "The latest news from Bangladesh shows"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=200, num_return_sequences=1)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```

### Using Safetensors Format

```python
from safetensors import safe_open
import torch

# Load model weights
with safe_open("prothomalo_model.safetensors", framework="pt", device="cpu") as f:
    print(f"Available tensors: {{len(f.keys())}}")
    for key in list(f.keys())[:5]:  # Show first 5 keys
        tensor = f.get_tensor(key)
        print(f"{{key}}: {{tensor.shape}}")
```

## Ethical Considerations

- This model is trained on publicly available news content
- Should be used responsibly and ethically
- Not intended for misinformation or harmful content generation
- Please respect copyright and attribution when using generated content

## Citation

```bibtex
@model{{prothom-alo-finetuned-2024,
  title={{Prothom Alo Fine-tuned Language Model}},
  author={{MiniMax Agent}},
  year={{2024}},
  url={{https://huggingface.co/minimax/prothom-alo-model}}
}}
```

## License

This model is released for research and educational purposes. Please ensure compliance with Prothom Alo's terms of service and copyright policies when using this model.
"""

        model_card_path = f"{model_path}/MODEL_CARD.md"
        with open(model_card_path, 'w', encoding='utf-8') as f:
            f.write(model_card)

        logger.info(f"Model card created: {model_card_path}")
        return model_card_path


def main():
    """Run the full pipeline: load data, train, convert, document.

    Returns:
        Dict with the paths of the produced artifacts (``model_path``,
        ``safetensors_path``, ``model_card``, ``inference_script``) and the
        ``config`` that was used.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    # NOTE(review): the emoji prefixes in the log/print messages below look
    # mis-encoded (mojibake). They are kept byte-for-byte here; confirm the
    # intended characters before repairing them.
    logger.info("šŸš€ Prothom Alo Model Trainer")
    logger.info("=" * 50)

    # Configuration
    config = {
        "model_name": "distilgpt2",  # Small, efficient model
        "max_length": 512,
        "epochs": 3,
        "batch_size": 2,
        "learning_rate": 5e-5,
        "dataset_path": "./enhanced_prothomalo",
        "output_dir": "./prothomalo_model",
        "safetensors_output": "./prothomalo_model.safetensors"
    }

    try:
        # Initialize trainer
        trainer = ProthomAloModelTrainer(
            model_name=config["model_name"],
            max_length=config["max_length"]
        )

        # Load dataset
        train_dataset, val_dataset, _raw_dataset = trainer.load_dataset(config["dataset_path"])

        # Setup model
        trainer.setup_model()

        # Train model
        model_path = trainer.train(
            train_dataset,
            val_dataset,
            output_dir=config["output_dir"],
            epochs=config["epochs"],
            batch_size=config["batch_size"],
            learning_rate=config["learning_rate"]
        )

        # Convert to Safetensors
        safetensors_path = trainer.convert_to_safetensors(model_path, config["safetensors_output"])

        # Create model card
        model_card_path = trainer.create_model_card(model_path, config["dataset_path"], config)

        # Create inference script.
        # The template is an f-string: only model_path / safetensors_path are
        # interpolated, every brace of the generated code is doubled so it is
        # emitted literally. The generated script points at the directory the
        # final model was actually saved to (output_dir itself only holds
        # checkpoints) and opens the safetensors file on the CPU.
        inference_script = f"""#!/usr/bin/env python3
\"\"\"
Prothom Alo Model Inference Script
\"\"\"

from transformers import AutoTokenizer, AutoModelForCausalLM
from safetensors import safe_open
import torch
import json

def load_model_tokenizer(model_path: str):
    \"\"\"Load model and tokenizer\"\"\"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return tokenizer, model

def load_safetensors_model(safetensors_path: str):
    \"\"\"Load model from Safetensors format\"\"\"
    with safe_open(safetensors_path, framework="pt", device="cpu") as f:
        print(f"Available tensors: {{len(f.keys())}}")
        for key in list(f.keys())[:5]:  # Show first 5 keys
            tensor = f.get_tensor(key)
            print(f"{{key}}: {{tensor.shape}}")

def generate_text(tokenizer, model, prompt: str, max_length: int = 200):
    \"\"\"Generate text from prompt\"\"\"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

if __name__ == "__main__":
    # Example usage
    model_path = {model_path!r}
    safetensors_path = {safetensors_path!r}

    # Load model
    print("Loading model...")
    tokenizer, model = load_model_tokenizer(model_path)

    # Test Safetensors
    print("Testing Safetensors format...")
    load_safetensors_model(safetensors_path)

    # Generate text
    prompt = "The latest news from Bangladesh indicates"
    print(f"\\nGenerating text for: {{prompt}}")

    generated = generate_text(tokenizer, model, prompt, max_length=150)
    print(f"Generated: {{generated}}")
"""

        inference_path = f"{config['output_dir']}/inference.py"
        with open(inference_path, 'w', encoding='utf-8') as f:
            f.write(inference_script)

        # Summary
        logger.info("\nšŸŽ‰ Model training and conversion completed!")
        logger.info(f"šŸ“ Model directory: {model_path}")
        logger.info(f"šŸ”’ Safetensors file: {safetensors_path}")
        logger.info(f"šŸ“– Model card: {model_card_path}")
        logger.info(f"šŸš€ Inference script: {inference_path}")

        # File sizes
        if os.path.exists(safetensors_path):
            size_mb = os.path.getsize(safetensors_path) / 1024 / 1024
            logger.info(f"šŸ“Š Model size: {size_mb:.2f} MB")

        return {
            "model_path": model_path,
            "safetensors_path": safetensors_path,
            "model_card": model_card_path,
            "inference_script": inference_path,
            "config": config
        }

    except Exception as e:
        logger.error(f"āŒ Training failed: {e}")
        raise


if __name__ == "__main__":
    result = main()
    print("\nāœ… Training pipeline completed successfully!")
    print("šŸš€ Model ready for use and distribution!")