Upload folder using huggingface_hub
Browse files- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_10.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_20.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_30.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_40.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_50.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_10.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_20.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_30.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_40.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_50.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_10.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_20.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_30.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_40.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_50.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__10.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__20.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__30.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__40.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__50.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_10.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_20.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_30.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_40.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_50.jsonl +0 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/args.json +1 -1
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/log.txt +97 -0
- gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/rougeL_results.jsonl +25 -3
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_10.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_20.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_30.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_40.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dialogsum_50.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_10.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_20.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_30.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_40.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_dolly_50.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_10.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_20.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_30.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_40.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_self-inst_50.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__10.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__20.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__30.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__40.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_sinst_11__50.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_10.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_20.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_30.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_40.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/answers_vicuna_50.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/args.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"model_path": "/workspace/WCTKD/model_hub/gpt2/gpt2-xl", "ckpt_name": null, "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 1, "n_nodes": 1, "teacher_model_path": null, "teacher_model_fp16": false, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "model_dtype": "fp16", "M_global_path": null, "embedding_projection_path": null, "task": "eval_main", "do_train": false, "do_valid": false, "do_eval": true, "base_path": "/workspace/WCTKD", "load": null, "save_dir": "/workspace/WCTKD/outputs/gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001", "log_interval": 10, "save_interval": 1000, "eval_interval": 1000, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "keep_best_n_checkpoints": 3, "criterion": "cross_entropy", "eval_tqdm": false, "report_logits": false, "only_save_projector": false, "debug": false, "data_dir": "/workspace/WCTKD/data/
|
|
|
|
| 1 |
+
{"model_path": "/workspace/WCTKD/model_hub/gpt2/gpt2-xl", "ckpt_name": null, "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 1, "n_nodes": 1, "teacher_model_path": null, "teacher_model_fp16": false, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "model_dtype": "fp16", "M_global_path": null, "embedding_projection_path": null, "task": "eval_main", "do_train": false, "do_valid": false, "do_eval": true, "base_path": "/workspace/WCTKD", "load": null, "save_dir": "/workspace/WCTKD/outputs/gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001", "log_interval": 10, "save_interval": 1000, "eval_interval": 1000, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "keep_best_n_checkpoints": 3, "criterion": "cross_entropy", "eval_tqdm": false, "report_logits": false, "only_save_projector": false, "debug": false, "data_dir": "/workspace/WCTKD/data/vicuna", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": "vicuna", "prompt_type": null, "num_workers": 0, "max_prompt_length": 256, "min_prompt_length": 128, "json_data": true, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "pretrain_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": false, "only_prompt": false, "batch_size": 32, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 512, "seed": 50, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "num_epochs": null, "training_epochs": 10000, "gradient_accumulation_steps": 1, "gradient_checkpointing": false, "attn_dtype": null, "lr": null, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_rate": 0.5, "kd_temperature": 1.0, "wctkd_alpha": 0.5, "wctkd_beta": 0.5, "wctkd_gamma": 0.5, "wctkd_hidden_gamma": 0.5, "wctkd_top_k": 8, "kd_objective": "forward_kl", "teacher_temperature": 1.0, "label_smoothing": 0.0, "adaptive_kl_alpha": 0.5, "skew_lambda": 0.1, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "noam", "scheduler_name": "constant_trm", "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "eval_gen_repeat_times": 3, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 64, "peft_lora_dropout": 0.1, "peft_name": null, "peft_path": "/workspace/WCTKD/outputs/gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/epoch15_step21435_loss2.7989_rougel28.4516", "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "/workspace/WCTKD/configs/deepspeed/ds_config_bf16.json", "deepscale": false, "deepscale_config": null, "projector_config_path": null, "projector_path": null, "projector_lr": 0.001, "pretrained_projector": null, "pretrained_projector_lr": 0.001, "vocab_alignment_path": null, "teacher_to_student_token_mapping": null, "teacher_to_student_id_mapping": null, "student_to_teacher_token_mapping": null, "student_to_teacher_id_mapping": null, "rank": 0, "world_size": 1}
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/log.txt
CHANGED
|
@@ -116,3 +116,100 @@ test | name: vicuna | {'exact_match': 0.0, 'rougeL': 16.6115} | lm_loss 2.0626 |
|
|
| 116 |
|
| 117 |
|
| 118 |
============================== EXP at 2025-12-24 01:25:25 ==============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
============================== EXP at 2025-12-24 01:25:25 ==============================
|
| 119 |
+
test | name: dolly | {'exact_match': 3.4, 'rougeL': 26.0349} | lm_loss 2.5585 | avg. gen lenth: 60.91 | seed 10
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
============================== EXP at 2025-12-24 01:29:25 ==============================
|
| 123 |
+
test | name: dolly | {'exact_match': 3.0, 'rougeL': 26.1929} | lm_loss 2.5585 | avg. gen lenth: 59.664 | seed 20
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
============================== EXP at 2025-12-24 01:33:16 ==============================
|
| 127 |
+
test | name: dolly | {'exact_match': 3.0, 'rougeL': 25.5445} | lm_loss 2.5585 | avg. gen lenth: 58.576 | seed 30
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
============================== EXP at 2025-12-24 01:37:02 ==============================
|
| 131 |
+
test | name: dolly | {'exact_match': 3.0, 'rougeL': 25.6587} | lm_loss 2.5585 | avg. gen lenth: 61.124 | seed 40
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
============================== EXP at 2025-12-24 01:40:39 ==============================
|
| 135 |
+
test | name: dolly | {'exact_match': 3.0, 'rougeL': 26.5581} | lm_loss 2.5585 | avg. gen lenth: 65.348 | seed 50
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
============================== EXP at 2025-12-24 01:44:43 ==============================
|
| 139 |
+
test | name: sinst/11_ | {'exact_match': 0.059, 'rougeL': 26.7044} | lm_loss 4.2191 | avg. gen lenth: 38.44923258559622 | seed 10
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
============================== EXP at 2025-12-24 01:54:04 ==============================
|
| 143 |
+
test | name: sinst/11_ | {'exact_match': 0.0, 'rougeL': 26.1725} | lm_loss 4.2191 | avg. gen lenth: 38.03719008264463 | seed 20
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
============================== EXP at 2025-12-24 02:02:58 ==============================
|
| 147 |
+
test | name: sinst/11_ | {'exact_match': 0.059, 'rougeL': 26.5216} | lm_loss 4.2191 | avg. gen lenth: 39.32762691853601 | seed 30
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
============================== EXP at 2025-12-24 02:12:30 ==============================
|
| 151 |
+
test | name: sinst/11_ | {'exact_match': 0.0, 'rougeL': 26.9048} | lm_loss 4.2191 | avg. gen lenth: 38.23258559622196 | seed 40
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
============================== EXP at 2025-12-24 02:21:28 ==============================
|
| 155 |
+
test | name: sinst/11_ | {'exact_match': 0.1181, 'rougeL': 26.531} | lm_loss 4.2191 | avg. gen lenth: 39.62219598583235 | seed 50
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
============================== EXP at 2025-12-24 02:31:15 ==============================
|
| 159 |
+
test | name: self-inst | {'exact_match': 1.2397, 'rougeL': 16.0528} | lm_loss 3.4319 | avg. gen lenth: 78.46280991735537 | seed 10
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
============================== EXP at 2025-12-24 02:33:36 ==============================
|
| 163 |
+
test | name: self-inst | {'exact_match': 1.2397, 'rougeL': 16.0542} | lm_loss 3.4319 | avg. gen lenth: 76.2603305785124 | seed 20
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
============================== EXP at 2025-12-24 02:35:56 ==============================
|
| 167 |
+
test | name: self-inst | {'exact_match': 1.2397, 'rougeL': 15.7347} | lm_loss 3.4319 | avg. gen lenth: 69.29752066115702 | seed 30
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
============================== EXP at 2025-12-24 02:38:12 ==============================
|
| 171 |
+
test | name: self-inst | {'exact_match': 1.2397, 'rougeL': 15.6599} | lm_loss 3.4319 | avg. gen lenth: 72.7603305785124 | seed 40
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
============================== EXP at 2025-12-24 02:40:20 ==============================
|
| 175 |
+
test | name: self-inst | {'exact_match': 0.8264, 'rougeL': 15.3959} | lm_loss 3.4319 | avg. gen lenth: 73.81404958677686 | seed 50
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
============================== EXP at 2025-12-24 02:42:34 ==============================
|
| 179 |
+
test | name: dialogsum | {'exact_match': 0.0, 'rougeL': 12.1416} | lm_loss nan | avg. gen lenth: 95.612 | seed 10
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
============================== EXP at 2025-12-24 02:56:32 ==============================
|
| 183 |
+
test | name: dialogsum | {'exact_match': 0.0, 'rougeL': 12.3286} | lm_loss nan | avg. gen lenth: 93.00066666666666 | seed 20
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
============================== EXP at 2025-12-24 03:10:17 ==============================
|
| 187 |
+
test | name: dialogsum | {'exact_match': 0.0, 'rougeL': 11.9797} | lm_loss nan | avg. gen lenth: 93.68733333333333 | seed 30
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
============================== EXP at 2025-12-24 03:24:07 ==============================
|
| 191 |
+
test | name: dialogsum | {'exact_match': 0.0, 'rougeL': 12.1308} | lm_loss nan | avg. gen lenth: 97.186 | seed 40
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
============================== EXP at 2025-12-24 03:38:05 ==============================
|
| 195 |
+
test | name: dialogsum | {'exact_match': 0.0, 'rougeL': 12.1682} | lm_loss nan | avg. gen lenth: 97.30133333333333 | seed 50
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
============================== EXP at 2025-12-24 03:52:00 ==============================
|
| 199 |
+
test | name: vicuna | {'exact_match': 0.0, 'rougeL': 17.0913} | lm_loss 2.0758 | avg. gen lenth: 131.475 | seed 10
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
============================== EXP at 2025-12-24 03:52:57 ==============================
|
| 203 |
+
test | name: vicuna | {'exact_match': 0.0, 'rougeL': 16.1359} | lm_loss 2.0758 | avg. gen lenth: 120.2125 | seed 20
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
============================== EXP at 2025-12-24 03:53:51 ==============================
|
| 207 |
+
test | name: vicuna | {'exact_match': 0.0, 'rougeL': 15.9061} | lm_loss 2.0758 | avg. gen lenth: 122.5875 | seed 30
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
============================== EXP at 2025-12-24 03:54:49 ==============================
|
| 211 |
+
test | name: vicuna | {'exact_match': 0.0, 'rougeL': 16.7801} | lm_loss 2.0758 | avg. gen lenth: 124.475 | seed 40
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
============================== EXP at 2025-12-24 03:55:47 ==============================
|
| 215 |
+
test | name: vicuna | {'exact_match': 0.0, 'rougeL': 16.7056} | lm_loss 2.0758 | avg. gen lenth: 119.975 | seed 50
|
gpt2/gpt2-xl/wctkd/criterion=wctkd__forward_kl-lora-rank=256-alpha=8-dropout=0.1-bf16__teacher=qwen__kd^rate=0.5__kd^temp=2.0__wctkd^alpha=0.5__wctkd^beta=0.2__wctkd^gamma=0.3__wctkd^hidden_gamma=0.5__wctkd^top_k=16__epoch=15__bsz=4x2x1=8__lr=0.001/rougeL_results.jsonl
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
// {"dataname": "dolly", "seed": 10, "rougeL": 25.5657}
|
| 2 |
-
// {"dataname": "dolly", "seed": 20, "rougeL": 25.2568}
|
| 3 |
-
// {"dataname": "dolly", "seed": 30, "rougeL": 25.893}
|
| 4 |
// {"dataname": "dolly", "seed": 10, "rougeL": 25.9126}
|
| 5 |
// {"dataname": "dolly", "seed": 20, "rougeL": 25.8155}
|
| 6 |
// {"dataname": "dolly", "seed": 30, "rougeL": 26.0073}
|
|
@@ -26,3 +23,28 @@
|
|
| 26 |
// {"dataname": "vicuna", "seed": 30, "rougeL": 15.9872}
|
| 27 |
// {"dataname": "vicuna", "seed": 40, "rougeL": 16.6899}
|
| 28 |
// {"dataname": "vicuna", "seed": 50, "rougeL": 16.6115}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
// {"dataname": "dolly", "seed": 10, "rougeL": 25.9126}
|
| 2 |
// {"dataname": "dolly", "seed": 20, "rougeL": 25.8155}
|
| 3 |
// {"dataname": "dolly", "seed": 30, "rougeL": 26.0073}
|
|
|
|
| 23 |
// {"dataname": "vicuna", "seed": 30, "rougeL": 15.9872}
|
| 24 |
// {"dataname": "vicuna", "seed": 40, "rougeL": 16.6899}
|
| 25 |
// {"dataname": "vicuna", "seed": 50, "rougeL": 16.6115}
|
| 26 |
+
{"dataname": "dolly", "seed": 10, "rougeL": 26.0349}
|
| 27 |
+
{"dataname": "dolly", "seed": 20, "rougeL": 26.1929}
|
| 28 |
+
{"dataname": "dolly", "seed": 30, "rougeL": 25.5445}
|
| 29 |
+
{"dataname": "dolly", "seed": 40, "rougeL": 25.6587}
|
| 30 |
+
{"dataname": "dolly", "seed": 50, "rougeL": 26.5581}
|
| 31 |
+
{"dataname": "sinst_11_", "seed": 10, "rougeL": 26.7044}
|
| 32 |
+
{"dataname": "sinst_11_", "seed": 20, "rougeL": 26.1725}
|
| 33 |
+
{"dataname": "sinst_11_", "seed": 30, "rougeL": 26.5216}
|
| 34 |
+
{"dataname": "sinst_11_", "seed": 40, "rougeL": 26.9048}
|
| 35 |
+
{"dataname": "sinst_11_", "seed": 50, "rougeL": 26.531}
|
| 36 |
+
{"dataname": "self-inst", "seed": 10, "rougeL": 16.0528}
|
| 37 |
+
{"dataname": "self-inst", "seed": 20, "rougeL": 16.0542}
|
| 38 |
+
{"dataname": "self-inst", "seed": 30, "rougeL": 15.7347}
|
| 39 |
+
{"dataname": "self-inst", "seed": 40, "rougeL": 15.6599}
|
| 40 |
+
{"dataname": "self-inst", "seed": 50, "rougeL": 15.3959}
|
| 41 |
+
{"dataname": "dialogsum", "seed": 10, "rougeL": 12.1416}
|
| 42 |
+
{"dataname": "dialogsum", "seed": 20, "rougeL": 12.3286}
|
| 43 |
+
{"dataname": "dialogsum", "seed": 30, "rougeL": 11.9797}
|
| 44 |
+
{"dataname": "dialogsum", "seed": 40, "rougeL": 12.1308}
|
| 45 |
+
{"dataname": "dialogsum", "seed": 50, "rougeL": 12.1682}
|
| 46 |
+
{"dataname": "vicuna", "seed": 10, "rougeL": 17.0913}
|
| 47 |
+
{"dataname": "vicuna", "seed": 20, "rougeL": 16.1359}
|
| 48 |
+
{"dataname": "vicuna", "seed": 30, "rougeL": 15.9061}
|
| 49 |
+
{"dataname": "vicuna", "seed": 40, "rougeL": 16.7801}
|
| 50 |
+
{"dataname": "vicuna", "seed": 50, "rougeL": 16.7056}
|