"""
Script pour créer un modèle HuggingFace fonctionnel à partir de zéro
ou adapter un modèle existant
"""
|
import json
import os

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    GPT2LMHeadModel,
    GPT2Tokenizer,
)
| |
|
def create_basic_model():
    """Build a working GPT-2-sized model, tokenizer and config from scratch.

    Returns:
        tuple: ``(model, tokenizer, config)`` — a freshly initialized
        ``GPT2LMHeadModel`` (random weights), the pretrained GPT-2 tokenizer,
        and the config used to build the model.
    """
    config = AutoConfig.from_pretrained("gpt2")

    # Standard GPT-2 (small) architecture hyper-parameters.
    architecture = {
        "vocab_size": 50257,
        "n_positions": 1024,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_layer": 12,
        "n_head": 12,
    }
    for attribute, value in architecture.items():
        setattr(config, attribute, value)

    model = GPT2LMHeadModel(config)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so padding/generation works.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, config
|
def save_complete_model(model, tokenizer, config, save_path="./Tenro_V4.1_complete"):
    """Save the model and tokenizer with every file a HuggingFace repo needs.

    Writes the model/tokenizer files via ``save_pretrained`` plus an explicit
    ``generation_config.json``, then lists the created files with their sizes.

    Args:
        model: object exposing ``save_pretrained(path)``.
        tokenizer: object exposing ``save_pretrained(path)`` and the
            ``bos_token_id`` / ``eos_token_id`` / ``pad_token_id`` attributes.
        config: model config (currently unused; kept for interface stability).
        save_path: target directory for all generated files.
    """
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # GPT-2 has no dedicated BOS/PAD tokens; fall back to EOS for both.
    generation_config = {
        "_from_model_config": True,
        "bos_token_id": tokenizer.bos_token_id or tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "max_length": 1024,
        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
    }

    # Explicit encoding: JSON must be written the same way on every platform.
    config_file = os.path.join(save_path, "generation_config.json")
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(generation_config, f, indent=2)

    print(f"Modèle complet sauvegardé dans: {save_path}")
    print("Fichiers créés:")
    for file in os.listdir(save_path):
        size = os.path.getsize(os.path.join(save_path, file))
        print(f" - {file}: {size/1024:.1f} KB")
| |
|
def upload_to_huggingface(model, tokenizer, repo_name):
    """Push the model and tokenizer to the HuggingFace Hub.

    Best-effort: on any failure the error is printed together with a hint
    to authenticate first; nothing is raised to the caller.
    """
    try:
        for artifact in (model, tokenizer):
            artifact.push_to_hub(repo_name)
        print(f"Modèle uploadé vers: https://huggingface.co/{repo_name}")
    except Exception as e:
        print(f"Erreur upload: {e}")
        print("Assurez-vous d'être connecté avec: huggingface-cli login")
| |
|
def load_your_existing_data(file_path):
    """Best-effort loader for pre-existing data files.

    Dispatches on the file extension:
      - ``.json``: parsed JSON object,
      - ``.bin``: object deserialized with ``torch.load`` (CPU-mapped),
      - anything else: raw text content.

    Returns:
        The loaded data, or ``None`` when the file cannot be read
        (the error is printed, not raised).
    """
    try:
        if file_path.endswith('.json'):
            # Explicit encoding so JSON parses identically on every platform.
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        if file_path.endswith('.bin'):
            # SECURITY: torch.load unpickles arbitrary objects — only use on
            # trusted files (consider weights_only=True for plain weights).
            return torch.load(file_path, map_location='cpu')
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Impossible de charger {file_path}: {e}")
        return None
| |
|
if __name__ == "__main__":
    print("🚀 Création d'un modèle HuggingFace complet...")

    model, tokenizer, config = create_basic_model()

    # Try to pick up any pre-existing data for this model name.
    existing_data = load_your_existing_data("Tenro_V4.1.1")
    if existing_data:
        print("✅ Données existantes chargées")

    save_complete_model(model, tokenizer, config)

    # Quick smoke test: generate a short completion with the fresh model.
    print("\n🧪 Test du modèle:")
    inputs = tokenizer("Hello, I am Tenro", return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Texte généré: {generated_text}")

    # Optional interactive upload to the Hub.
    upload_choice = input("\nVoulez-vous uploader vers HuggingFace? (y/n): ")
    if upload_choice.lower() == 'y':
        repo_name = input("Nom du repo (ex: votre_nom/Tenro_V4.1): ")
        upload_to_huggingface(model, tokenizer, repo_name)

    print("\n✅ Terminé! Vous avez maintenant un modèle complet et fonctionnel.")