import sys, os, zipfile, shutil, time, traceback, threading, uvicorn
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from datetime import datetime
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch

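# Configuration: the inclusive range of tokenized parquet chunks to train on,
# the base model, the dataset repo to read from, and the repo the final zip is
# uploaded to. HF_TOKEN is read from the environment.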
START_NUMBER = 0
END_NUMBER = 9
MODEL_NAME = "TURKCELL/Turkcell-LLM-7b-v1"
TOKENIZED_DATASET_ID = "UcsTurkey/turkish-train-tokenized"
ZIP_UPLOAD_REPO = "UcsTurkey/trained-zips"
HF_TOKEN = os.environ.get("HF_TOKEN")
BATCH_SIZE = 1
EPOCHS = 2
MAX_LENGTH = 2048
OUTPUT_DIR = "/data/output"
ZIP_FOLDER = "/data/zip_temp"
zip_name = f"trained_model_{START_NUMBER:03d}_{END_NUMBER:03d}.zip"
ZIP_PATH = os.path.join(ZIP_FOLDER, zip_name)

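# Minimal health endpoint served from a background thread so the container
# stays responsive (e.g. for a Hugging Face Space health check on port 7860)
# while training blocks the main thread.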
app = FastAPI()


@app.get("/")
def health():
    return JSONResponse(content={"status": "ok"})


def run_health_server():
    uvicorn.run(app, host="0.0.0.0", port=7860)


threading.Thread(target=run_health_server, daemon=True).start()

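# Timestamped logger that flushes stdout so messages show up immediately in
# the container logs.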
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}")
    sys.stdout.flush()

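# Load the slow tokenizer and make sure a pad token exists, falling back to
# the EOS token if the model does not ship one.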
| log("🛠️ Ortam hazırlanıyor...") |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
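# Load the base model in bfloat16 and align its pad token id with the tokenizer.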
| log("🧠 Model indiriliyor...") |
| base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16) |
| base_model.config.pad_token_id = tokenizer.pad_token_id |
|
|
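# Wrap the base model with a LoRA adapter so only the adapter weights are
# trained. target_modules is not set, so PEFT falls back to its built-in
# defaults for this model architecture (it raises an error for unknown ones).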
| log("🎯 LoRA adapter uygulanıyor...") |
| peft_config = LoraConfig( |
| task_type=TaskType.CAUSAL_LM, |
| r=64, |
| lora_alpha=16, |
| lora_dropout=0.1, |
| bias="none", |
| fan_in_fan_out=False |
| ) |
| model = get_peft_model(base_model, peft_config) |
| model.print_trainable_parameters() |
|
|
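# List the dataset repo and select the pre-tokenized chunk_*.parquet files in
# the configured range.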
| log("📦 Parquet dosyaları listeleniyor...") |
| api = HfApi() |
| files = api.list_repo_files(repo_id=TOKENIZED_DATASET_ID, repo_type="dataset", token=HF_TOKEN) |
| selected_files = sorted([f for f in files if f.startswith("chunk_") and f.endswith(".parquet")])[START_NUMBER:END_NUMBER+1] |
|
|
| if not selected_files: |
| log("⚠️ Parquet bulunamadı. Eğitim iptal.") |
| exit(0) |
|
|
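# Shared training arguments for every chunk. bf16 training assumes hardware
# with bfloat16 support (e.g. an Ampere or newer GPU).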
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-4,
    disable_tqdm=True,
    logging_strategy="steps",
    logging_steps=10,
    report_to=[],
    bf16=True,
    fp16=False
)

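# Causal-LM collator: with mlm=False the labels are a copy of input_ids, with
# padding positions masked out of the loss.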
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

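# Train on each selected chunk in turn. The same PEFT model is reused, so the
# adapter keeps accumulating updates across chunks; a chunk that fails is
# logged and skipped.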
for file in selected_files:
    try:
        log(f"\n📄 Loading: {file}")
        dataset = load_dataset(
            path=TOKENIZED_DATASET_ID,
            data_files={"train": file},
            split="train",
            token=HF_TOKEN
        )
        log(f"🔍 {len(dataset)} examples")
        if len(dataset) == 0:
            continue

        first_row = dataset[0]
        decoded_prompt = tokenizer.decode(first_row["input_ids"], skip_special_tokens=True)
        log(f"📌 Sample prompt: {decoded_prompt[:200]}...")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            data_collator=collator
        )
        log("🚀 Training starting...")
        trainer.train()
        log("✅ Training finished.")
    except Exception as e:
        log(f"❌ Error: {file} → {e}")
        traceback.print_exc()

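# Save the LoRA adapter and tokenizer to a temporary folder, then zip them
# under an "output/" prefix inside the archive.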
| log("📦 Model zipleniyor...") |
| try: |
| tmp_dir = os.path.join(ZIP_FOLDER, "temp_save") |
| os.makedirs(tmp_dir, exist_ok=True) |
| model.save_pretrained(tmp_dir) |
| tokenizer.save_pretrained(tmp_dir) |
|
|
| with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED) as zipf: |
| for root, _, files in os.walk(tmp_dir): |
| for file in files: |
| filepath = os.path.join(root, file) |
| arcname = os.path.relpath(filepath, tmp_dir) |
| zipf.write(filepath, arcname=os.path.join("output", arcname)) |
| log(f"✅ Zip oluşturuldu: {ZIP_PATH}") |
| except Exception as e: |
| log(f"❌ Zipleme hatası: {e}") |
| traceback.print_exc() |
|
|
| |
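# Upload the finished zip to the Hugging Face Hub repo configured above.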
try:
    log("☁️ Uploading to Hugging Face...")
    api.upload_file(
        path_or_fileobj=ZIP_PATH,
        path_in_repo=zip_name,
        repo_id=ZIP_UPLOAD_REPO,
        repo_type="model",
        token=HF_TOKEN
    )
    log("✅ Upload finished.")
except Exception as e:
    log(f"❌ Upload error: {e}")
    traceback.print_exc()

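# Keep the process (and the health endpoint) alive after training completes.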
| log("⏸️ Eğitim tamamlandı. Servis bekleme modunda...") |
| while True: |
| time.sleep(60) |
|
|