aweilo committed
Commit 68de334 · verified · 1 Parent(s): 004f5ef

Upload 29 files

Files changed (29)
  1. pretrained_model/Put pre-trained weights here.txt +0 -0
  2. pretrained_model/audio2mesh.pt +3 -0
  3. pretrained_model/denoising_unet.pth +3 -0
  4. pretrained_model/image_encoder/config.json +23 -0
  5. pretrained_model/image_encoder/pytorch_model.bin +3 -0
  6. pretrained_model/motion_module.pth +3 -0
  7. pretrained_model/pose_guider.pth +3 -0
  8. pretrained_model/reference_unet.pth +3 -0
  9. pretrained_model/sd-vae-ft-mse/.gitattributes +33 -0
  10. pretrained_model/sd-vae-ft-mse/README.md +83 -0
  11. pretrained_model/sd-vae-ft-mse/config.json +29 -0
  12. pretrained_model/sd-vae-ft-mse/diffusion_pytorch_model.bin +3 -0
  13. pretrained_model/sd-vae-ft-mse/diffusion_pytorch_model.safetensors +3 -0
  14. pretrained_model/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json +20 -0
  15. pretrained_model/stable-diffusion-v1-5/model_index.json +32 -0
  16. pretrained_model/stable-diffusion-v1-5/unet/config.json +36 -0
  17. pretrained_model/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin +3 -0
  18. pretrained_model/stable-diffusion-v1-5/v1-inference.yaml +70 -0
  19. pretrained_model/wav2vec2-base-960h/.gitattributes +18 -0
  20. pretrained_model/wav2vec2-base-960h/README.md +128 -0
  21. pretrained_model/wav2vec2-base-960h/config.json +77 -0
  22. pretrained_model/wav2vec2-base-960h/feature_extractor_config.json +8 -0
  23. pretrained_model/wav2vec2-base-960h/model.safetensors +3 -0
  24. pretrained_model/wav2vec2-base-960h/preprocessor_config.json +8 -0
  25. pretrained_model/wav2vec2-base-960h/pytorch_model.bin +3 -0
  26. pretrained_model/wav2vec2-base-960h/special_tokens_map.json +1 -0
  27. pretrained_model/wav2vec2-base-960h/tf_model.h5 +3 -0
  28. pretrained_model/wav2vec2-base-960h/tokenizer_config.json +1 -0
  29. pretrained_model/wav2vec2-base-960h/vocab.json +1 -0
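
To fetch these files locally, something like the following sketch should work with `huggingface_hub` (the repo id is a placeholder for the actual repository, and the short commit hash shown above is assumed to resolve as a revision):

```python
# Hypothetical sketch: download this commit's files with huggingface_hub.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="user/repo",                     # placeholder for the actual repo id
    revision="68de334",                      # the commit shown above
    allow_patterns=["pretrained_model/*"],   # fnmatch: matches nested paths too
)
print(local_dir)
```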
pretrained_model/Put pre-trained weights here.txt ADDED
File without changes
pretrained_model/audio2mesh.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04996bebdad780a33642b0046036dae5d3c6db76e8f4ef5860e551fb9a1f0a1a
+ size 382031763
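
Each large binary in this commit is stored as a Git LFS pointer like the one above. A minimal verification sketch, assuming the actual blob has been downloaded to the path shown (the `oid`/`size` values are copied from the diff above):

```python
# Verify a downloaded blob against its Git LFS pointer's oid/size fields.
import hashlib

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:04996bebdad780a33642b0046036dae5d3c6db76e8f4ef5860e551fb9a1f0a1a
size 382031763
"""

fields = dict(line.split(" ", 1) for line in pointer.splitlines())
expected_oid = fields["oid"].split(":", 1)[1]
expected_size = int(fields["size"])

h = hashlib.sha256()
size = 0
with open("pretrained_model/audio2mesh.pt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
        size += len(chunk)

assert size == expected_size and h.hexdigest() == expected_oid
```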
pretrained_model/denoising_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddc4990e0d33dd5393190e8609fd7b32bfc0b5c386763624a3bff8038e0c054c
+ size 3438374981
pretrained_model/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
+   "architectures": [
+     "CLIPVisionModelWithProjection"
+   ],
+   "attention_dropout": 0.0,
+   "dropout": 0.0,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 1024,
+   "image_size": 224,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dim": 768,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1"
+ }
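
The config above describes a CLIP ViT-L/14 vision tower (hidden size 1024, 24 layers, patch size 14) with a 768-dim projection head. A hedged loading sketch, assuming the weights were fetched into `pretrained_model/image_encoder/`:

```python
import torch
from transformers import CLIPVisionModelWithProjection

encoder = CLIPVisionModelWithProjection.from_pretrained("pretrained_model/image_encoder")
pixels = torch.randn(1, 3, 224, 224)     # num_channels=3, image_size=224
with torch.no_grad():
    out = encoder(pixel_values=pixels)
print(out.image_embeds.shape)             # (1, 768): projection_dim
print(out.last_hidden_state.shape)        # (1, 257, 1024): 1 CLS + (224/14)^2 patches
```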
pretrained_model/image_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
+ size 1215993967
pretrained_model/motion_module.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:954aee616a81f143e0316d210445d1933cca05a397c760661b7046738c4c1f06
+ size 1817900817
pretrained_model/pose_guider.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6e8a6c72efcbb4ed49421baeb2c218b6047e94f5f0c90b554748019f757e64f
+ size 670182863
pretrained_model/reference_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e3603f7d44917ca3330a4a4be81b9578b308e9b9e3398fd6b8a1c4c86c474bd
+ size 3438324501
pretrained_model/sd-vae-ft-mse/.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ diffusion_pytorch_model.safetensors filter=lfs diff=lfs merge=lfs -text
pretrained_model/sd-vae-ft-mse/README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ license: mit
+ tags:
+ - stable-diffusion
+ - stable-diffusion-diffusers
+ inference: false
+ ---
+ # Improved Autoencoders
+
+ ## Usage
+ These weights are intended to be used with the [🧨 diffusers library](https://github.com/huggingface/diffusers). If you are looking for the model to use with the original [CompVis Stable Diffusion codebase](https://github.com/CompVis/stable-diffusion), [come here](https://huggingface.co/stabilityai/sd-vae-ft-mse-original).
+
+ #### How to use with 🧨 diffusers
+ You can integrate this fine-tuned VAE decoder into your existing `diffusers` workflows by passing a `vae` argument to the `StableDiffusionPipeline`:
+ ```py
+ from diffusers.models import AutoencoderKL
+ from diffusers import StableDiffusionPipeline
+
+ model = "CompVis/stable-diffusion-v1-4"
+ vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
+ pipe = StableDiffusionPipeline.from_pretrained(model, vae=vae)
+ ```
+
+ ## Decoder Finetuning
+ We publish two kl-f8 autoencoder versions, finetuned from the original [kl-f8 autoencoder](https://github.com/CompVis/latent-diffusion#pretrained-autoencoding-models) on a 1:1 ratio of [LAION-Aesthetics](https://laion.ai/blog/laion-aesthetics/) and LAION-Humans, an unreleased subset containing only SFW images of humans. The intent was to fine-tune on the Stable Diffusion training set (the autoencoder was originally trained on OpenImages) but also to enrich the dataset with images of humans, to improve the reconstruction of faces.
+ The first, _ft-EMA_, was resumed from the original checkpoint, trained for 313,198 steps, and uses EMA weights. It uses the same loss configuration as the original checkpoint (L1 + LPIPS).
+ The second, _ft-MSE_, was resumed from _ft-EMA_, also uses EMA weights, and was trained for another 280k steps with a different loss that puts more emphasis on MSE reconstruction (MSE + 0.1 * LPIPS). It produces somewhat "smoother" outputs. The batch size for both versions was 192 (16 A100s, batch size 12 per GPU).
+ To keep compatibility with existing models, only the decoder part was finetuned; the checkpoints can be used as a drop-in replacement for the existing autoencoder.
+
+ _Original kl-f8 VAE vs f8-ft-EMA vs f8-ft-MSE_
+
+ ## Evaluation
+ ### COCO 2017 (256x256, val, 5000 images)
+ | Model | train steps | rFID | PSNR | SSIM | PSIM | Link | Comments |
+ |----------|-------------|------|--------------|---------------|---------------|------|----------|
+ | original | 246803 | 4.99 | 23.4 +/- 3.8 | 0.69 +/- 0.14 | 1.01 +/- 0.28 | https://ommer-lab.com/files/latent-diffusion/kl-f8.zip | as used in SD |
+ | ft-EMA | 560001 | 4.42 | 23.8 +/- 3.9 | 0.69 +/- 0.13 | 0.96 +/- 0.27 | https://huggingface.co/stabilityai/sd-vae-ft-ema-original/resolve/main/vae-ft-ema-560000-ema-pruned.ckpt | slightly better overall, with EMA |
+ | ft-MSE | 840001 | 4.70 | 24.5 +/- 3.7 | 0.71 +/- 0.13 | 0.92 +/- 0.27 | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.ckpt | resumed with EMA from ft-EMA, emphasis on MSE (rec. loss = MSE + 0.1 * LPIPS), smoother outputs |
+
+ ### LAION-Aesthetics 5+ (256x256, subset, 10000 images)
+ | Model | train steps | rFID | PSNR | SSIM | PSIM | Link | Comments |
+ |----------|-------------|------|--------------|---------------|---------------|------|----------|
+ | original | 246803 | 2.61 | 26.0 +/- 4.4 | 0.81 +/- 0.12 | 0.75 +/- 0.36 | https://ommer-lab.com/files/latent-diffusion/kl-f8.zip | as used in SD |
+ | ft-EMA | 560001 | 1.77 | 26.7 +/- 4.8 | 0.82 +/- 0.12 | 0.67 +/- 0.34 | https://huggingface.co/stabilityai/sd-vae-ft-ema-original/resolve/main/vae-ft-ema-560000-ema-pruned.ckpt | slightly better overall, with EMA |
+ | ft-MSE | 840001 | 1.88 | 27.3 +/- 4.7 | 0.83 +/- 0.11 | 0.65 +/- 0.34 | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.ckpt | resumed with EMA from ft-EMA, emphasis on MSE (rec. loss = MSE + 0.1 * LPIPS), smoother outputs |
+
+ ### Visual
+ _Visualization of reconstructions on 256x256 images from the COCO2017 validation dataset._
+
+ <p align="center">
+ <br>
+ <b>256x256: ft-EMA (left), ft-MSE (middle), original (right)</b>
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00025_merged.png" />
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00011_merged.png" />
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00037_merged.png" />
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00043_merged.png" />
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00053_merged.png" />
+ </p>
+
+ <p align="center">
+ <img src="https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00029_merged.png" />
+ </p>
pretrained_model/sd-vae-ft-mse/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.4.2",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 256,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
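
This is the standard kl-f8 `AutoencoderKL` configuration: four encoder blocks give an 8x spatial downsampling with 4 latent channels. A minimal round-trip sketch, assuming the weights above are available locally:

```python
import torch
from diffusers.models import AutoencoderKL

vae = AutoencoderKL.from_pretrained("pretrained_model/sd-vae-ft-mse")
image = torch.randn(1, 3, 256, 256)                   # in_channels=3, sample_size=256
with torch.no_grad():
    latents = vae.encode(image).latent_dist.sample()  # (1, 4, 32, 32): f=8, latent_channels=4
    recon = vae.decode(latents).sample                # (1, 3, 256, 256)
# For Stable Diffusion, latents are additionally scaled by 0.18215 (the
# scale_factor in v1-inference.yaml below).
print(latents.shape, recon.shape)
```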
pretrained_model/sd-vae-ft-mse/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+ size 334707217
pretrained_model/sd-vae-ft-mse/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
+ size 334643276
pretrained_model/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
pretrained_model/stable-diffusion-v1-5/model_index.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.6.0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
+   ],
+   "scheduler": [
+     "diffusers",
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "diffusers",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
pretrained_model/stable-diffusion-v1-5/unet/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.6.0",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "out_channels": 4,
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ]
+ }
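
This is the standard SD v1 UNet: 4-channel latents at 64x64 conditioned on 768-dim CLIP text embeddings. A hedged sketch of a single forward pass on dummy inputs, assuming a local copy of the folder above (the text embeddings here are random stand-ins):

```python
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "pretrained_model/stable-diffusion-v1-5", subfolder="unet"
)
latents = torch.randn(1, 4, 64, 64)   # in_channels=4, sample_size=64
text_emb = torch.randn(1, 77, 768)    # cross_attention_dim=768
with torch.no_grad():
    noise_pred = unet(latents, timestep=10, encoder_hidden_states=text_emb).sample
print(noise_pred.shape)                # (1, 4, 64, 64): out_channels=4
```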
pretrained_model/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4
+ size 3438354725
pretrained_model/stable-diffusion-v1-5/v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false # Note: different from the one we trained before
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+
+     scheduler_config: # 10000 warmup steps
+       target: ldm.lr_scheduler.LambdaLinearScheduler
+       params:
+         warm_up_steps: [ 10000 ]
+         cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+         f_start: [ 1.e-6 ]
+         f_max: [ 1. ]
+         f_min: [ 1. ]
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
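
This YAML targets the original CompVis codebase rather than `diffusers`. A sketch of how such a config is typically instantiated there, assuming the `ldm` package from https://github.com/CompVis/stable-diffusion is importable (the checkpoint path is a hypothetical stand-in):

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load("pretrained_model/stable-diffusion-v1-5/v1-inference.yaml")
model = instantiate_from_config(config.model)   # builds the LatentDiffusion model
state = torch.load("v1-5-pruned-emaonly.ckpt", map_location="cpu")["state_dict"]  # hypothetical path
model.load_state_dict(state, strict=False)
model.eval()
```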
pretrained_model/wav2vec2-base-960h/.gitattributes ADDED
@@ -0,0 +1,18 @@
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
pretrained_model/wav2vec2-base-960h/README.md ADDED
@@ -0,0 +1,128 @@
+ ---
+ language: en
+ datasets:
+ - librispeech_asr
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - hf-asr-leaderboard
+ license: apache-2.0
+ widget:
+ - example_title: Librispeech sample 1
+   src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
+ - example_title: Librispeech sample 2
+   src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
+ model-index:
+ - name: wav2vec2-base-960h
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: LibriSpeech (clean)
+       type: librispeech_asr
+       config: clean
+       split: test
+       args:
+         language: en
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 3.4
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: LibriSpeech (other)
+       type: librispeech_asr
+       config: other
+       split: test
+       args:
+         language: en
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 8.6
+ ---
+
+ # Wav2Vec2-Base-960h
+
+ [Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
+
+ The base model, pretrained and fine-tuned on 960 hours of Librispeech 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz.
+
+ [Paper](https://arxiv.org/abs/2006.11477)
+
+ Authors: Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
+
+ **Abstract**
+
+ We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks the speech input in the latent space and solves a contrastive task defined over a quantization of the latent representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech recognition with limited amounts of labeled data.
+
+ The original model can be found under https://github.com/pytorch/fairseq/tree/master/examples/wav2vec#wav2vec-20.
+
+
+ # Usage
+
+ To transcribe audio files, the model can be used as a standalone acoustic model as follows:
+
+ ```python
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ from datasets import load_dataset
+ import torch
+
+ # load model and processor
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+
+ # load dummy dataset and read sound files
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+
+ # preprocess the raw waveform (batch size 1)
+ input_values = processor(ds[0]["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values
+
+ # retrieve logits
+ logits = model(input_values).logits
+
+ # take argmax and decode
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)
+ ```
+
+ ## Evaluation
+
+ This code snippet shows how to evaluate **facebook/wav2vec2-base-960h** on LibriSpeech's "clean" and "other" test data.
+
+ ```python
+ from datasets import load_dataset
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import torch
+ from jiwer import wer
+
+
+ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
+
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+
+ def map_to_pred(batch):
+     # with batched=True and batch_size=1, batch["audio"] is a list of one example
+     input_values = processor(batch["audio"][0]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values
+     with torch.no_grad():
+         logits = model(input_values.to("cuda")).logits
+
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+     batch["transcription"] = transcription
+     return batch
+
+ result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["audio"])
+
+ print("WER:", wer(result["text"], result["transcription"]))
+ ```
+
+ *Result (WER)*:
+
+ | "clean" | "other" |
+ |---------|---------|
+ | 3.4 | 8.6 |
pretrained_model/wav2vec2-base-960h/config.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-base-960h",
+   "activation_dropout": 0.1,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "codevector_dim": 256,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_prob": 0.05,
+   "model_type": "wav2vec2",
+   "num_attention_heads": 12,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "num_negatives": 100,
+   "pad_token_id": 0,
+   "proj_codevector_dim": 256,
+   "transformers_version": "4.7.0.dev0",
+   "vocab_size": 32
+ }
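
A detail worth noting in the config above: the seven feature-extractor conv layers' strides multiply to the model's hop size, which fixes the output frame rate. A quick check:

```python
import math

conv_stride = [5, 2, 2, 2, 2, 2, 2]          # from the config above
hop = math.prod(conv_stride)                  # 320 samples between output frames
print(hop / 16000 * 1000, "ms per frame")     # 20.0 ms at the 16 kHz sampling rate
```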
pretrained_model/wav2vec2-base-960h/feature_extractor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_dim": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
pretrained_model/wav2vec2-base-960h/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8aa76ab2243c81747a1f832954586bc566090c83a0ac167df6f31f0fa917d74a
+ size 377607901
pretrained_model/wav2vec2-base-960h/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
pretrained_model/wav2vec2-base-960h/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c34f9827b034a1b9141dbf6f652f8a60eda61cdf5771c9e05bfa99033c92cd96
+ size 377667514
pretrained_model/wav2vec2-base-960h/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
pretrained_model/wav2vec2-base-960h/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:412742825972a6e2e877255ccd8b3416e618df15a7f1e5e4f736aa3632ce33b5
+ size 377840624
pretrained_model/wav2vec2-base-960h/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "return_attention_mask": false, "do_normalize": true}
pretrained_model/wav2vec2-base-960h/vocab.json ADDED
@@ -0,0 +1 @@
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}