File size: 2,761 Bytes
00f6d1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dd9831
 
 
 
 
 
 
 
 
 
00f6d1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
{
  "repo_id": "faeea/custom-gopt-252-eval",
  "description": "Bundle of the locally downloaded best validation Streaming GOPT checkpoint plus the Whisper and Charsiu models required by the evaluation pipeline.",
  "artifacts": [
    {
      "path": "streaming_gopt_best/best_audio_model.pth",
      "type": "streaming_gopt_weights",
      "purpose": "Pronunciation scoring model selected by validation phone MSE."
    },
    {
      "path": "streaming_gopt_best/config.json",
      "type": "streaming_gopt_config",
      "purpose": "Network shape and training arguments used to restore the model."
    },
    {
      "path": "streaming_gopt_best/result.csv",
      "type": "training_metrics",
      "purpose": "Per-epoch train/validation metrics."
    },
    {
      "path": "streaming_gopt_best/test_metrics.json",
      "type": "test_metrics",
      "purpose": "Held-out test metrics for the best validation checkpoint."
    },
    {
      "path": "whisper_best_model",
      "type": "transformers_whisper_model",
      "purpose": "ASR model used to build streaming ASR-driven GOPT chunks."
    },
    {
      "path": "charsiu_en_w2v2_tiny_fc_10ms",
      "type": "charsiu_aligner_model",
      "purpose": "Frame-level phone alignment model used by preprocessing."
    },
    {
      "path": "examples/eval_streaming_gopt_test.py",
      "type": "example_script",
      "purpose": "Minimal evaluation script for val/test split using the bundled best GOPT checkpoint."
    },
    {
      "path": "examples/infer_one_audio.py",
      "type": "example_script",
      "purpose": "Minimal one-audio local inference script that prints the overall utterance score."
    },
    {
      "path": "streaming_gopt_best/inference_assets.json",
      "type": "inference_metadata",
      "purpose": "Normalization stats and phone-id mapping required for one-audio local inference."
    }
  ],
  "best_validation_summary": {
    "selection_metric": "phone_val_mse",
    "best_epoch": 15,
    "phone_val_mse": 0.04897475987672806,
    "phone_val_pcc": 0.24482591936290432,
    "utt_val_pcc_total": 0.6696266195414817,
    "word_val_pcc_total": 0.24699408632096473
  },
  "test_summary": {
    "phone_test_mse": 0.04749840870499611,
    "phone_test_pcc": 0.3332444625995981,
    "utt_test_pcc": [
      0.6184778675115561,
      -0.005628494483717365,
      0.7233702728305461,
      0.7387418272039076,
      0.6823243104620896
    ],
    "word_test_pcc": [
      0.3099214468824142,
      0.022433912396827224,
      0.321762854413528
    ]
  },
  "code_dependencies": {
    "custom_gopt_repo": "https://github.com/hf49w/custom-gopt.git",
    "charsiu_repo": "https://github.com/lingjzhu/charsiu",
    "charsiu_repo_commit": "13a69f2a22ca0c0962b75cc693399b0ae23a12c9"
  }
}