Duplicate from riffusion/riffusion-model-v1

Browse files

Co-authored-by: Seth Forsgren <sethforsgren@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +34 -0
README.md +88 -0
feature_extractor/preprocessor_config.json +20 -0
model_index.json +32 -0
riffusion-model-v1.ckpt +3 -0
safety_checker/config.json +179 -0
safety_checker/pytorch_model.bin +3 -0
scheduler/scheduler_config.json +12 -0
seed_images/.DS_Store +0 -0
seed_images/agile.png +0 -0
seed_images/cut32_epic.png +0 -0
seed_images/cut32_folksy.png +0 -0
seed_images/cut32_funk.png +0 -0
seed_images/cut32_hustle.png +0 -0
seed_images/cut32_latin.png +0 -0
seed_images/cut32_lounge.png +0 -0
seed_images/cut32_pressure.png +0 -0
seed_images/cut32_rock.png +0 -0
seed_images/cut32_stomp.png +0 -0
seed_images/cut32_wild.png +0 -0
seed_images/cut_epic.png +0 -0
seed_images/cut_folksy.png +0 -0
seed_images/cut_funk.png +0 -0
seed_images/cut_hustle.png +0 -0
seed_images/cut_latin.png +0 -0
seed_images/cut_lounge.png +0 -0
seed_images/cut_pressure.png +0 -0
seed_images/cut_rock.png +0 -0
seed_images/cut_stomp.png +0 -0
seed_images/cut_wild.png +0 -0
seed_images/epic.png +0 -0
seed_images/folksy.png +0 -0
seed_images/funk.png +0 -0
seed_images/hustle.png +0 -0
seed_images/latin.png +0 -0
seed_images/lounge.png +0 -0
seed_images/marim.png +0 -0
seed_images/mask_beat_lines_80.png +0 -0
seed_images/mask_gradient_dark.png +0 -0
seed_images/mask_gradient_top_70.png +0 -0
seed_images/mask_gradient_top_fifth_75.png +0 -0
seed_images/mask_top_third_75.png +0 -0
seed_images/mask_top_third_95.png +0 -0
seed_images/motorway.png +0 -0
seed_images/og_beat.png +0 -0
seed_images/pressure.png +0 -0
seed_images/rock.png +0 -0
seed_images/stomp.png +0 -0
seed_images/vibes.png +0 -0
seed_images/wild.png +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+---
+license: creativeml-openrail-m
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- text-to-audio
+inference: true
+extra_gated_prompt: >-
+  This model is open access and available to all, with a CreativeML OpenRAIL-M
+  license further specifying rights and usage.
+  The CreativeML OpenRAIL License specifies:
+  1. You can't use the model to deliberately produce nor share illegal or
+  harmful outputs or content
+  2. Riffusion claims no rights on the outputs you generate, you are free to use
+  them and are accountable for their use which must not go against the
+  provisions set in the license
+  3. You may re-distribute the weights and use the model commercially and/or as
+  a service. If you do, please be aware you have to include the same use
+  restrictions as the ones in the license and share a copy of the CreativeML
+  OpenRAIL-M to all your users (please read the license entirely and carefully)
+  Please read the full license carefully here:
+  https://huggingface.co/spaces/CompVis/stable-diffusion-license
+extra_gated_heading: Please read the LICENSE to access this model
+duplicated_from: riffusion/riffusion-model-v1
+---
+# Riffusion
+Riffusion is an app for real-time music generation with stable diffusion.
+Read about it at https://www.riffusion.com/about and try it at https://www.riffusion.com/.
+* Web app: https://github.com/hmartiro/riffusion-app
+* Inference server: https://github.com/hmartiro/riffusion-inference
+* Model checkpoint: https://huggingface.co/riffusion/riffusion-model-v1
+This repository contains the model files, including:
+ * a diffusers formated library
+ * a compiled checkpoint file
+ * a traced unet for improved inference speed
+ * a seed image library for use with riffusion-app
+# Riffusion v1 Model
+Riffusion is a latent text-to-image diffusion model capable of generating spectrogram images given any text input. These spectrograms can be converted into audio clips.
+The model was created by [Seth Forsgren](https://sethforsgren.com/) and [Hayk Martiros](https://haykmartiros.com/) as a hobby project.
+You can use the Riffusion model directly, or try the [Riffusion web app](https://www.riffusion.com/).
+The Riffusion model was created by fine-tuning the **Stable-Diffusion-v1-5** checkpoint. Read about Stable Diffusion here [🤗's Stable Diffusion blog](https://huggingface.co/blog/stable_diffusion).
+## Model Details
+- **Developed by:** Seth Forsgren, Hayk Martiros
+- **Model type:** Diffusion-based text-to-image generation model
+- **Language(s):** English
+- **License:** [The CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which our license is based.
+- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([CLIP ViT-L/14](https://arxiv.org/abs/2103.00020)) as suggested in the [Imagen paper](https://arxiv.org/abs/2205.11487).
+## Direct Use
+The model is intended for research purposes only. Possible research areas and
+tasks include
+- Generation of artworks, audio, and use in creative processes.
+- Applications in educational or creative tools.
+- Research on generative models.
+## Citation
+If you build on this work, please cite it as follows:
+```
+@software{Forsgren_Martiros_2022,
+  author = {Forsgren, Seth* and Martiros, Hayk*},
+  title = {{Riffusion - Stable diffusion for real-time music generation}},
+  url = {https://riffusion.com/about},
+  year = {2022}
+}
+```

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "crop_size": 224,
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "size": 224
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "StableDiffusionPipeline",
+  "_diffusers_version": "0.4.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPFeatureExtractor"
+  ],
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "PNDMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

riffusion-model-v1.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99a6eb51c18e16a6121180f3daa69344e571618b195533f67ae94be4eb135a57
+size 14580603928

safety_checker/config.json ADDED Viewed

	@@ -0,0 +1,179 @@

+{
+  "_commit_hash": "4bb648a606ef040e7685bde262611766a5fdd67b",
+  "_name_or_path": "CompVis/stable-diffusion-safety-checker",
+  "architectures": [
+    "StableDiffusionSafetyChecker"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 768,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.24.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": {
+    "hidden_size": 768,
+    "intermediate_size": 3072,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.24.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "intermediate_size": 4096,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14
+  }
+}

safety_checker/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16d28f2b37109f222cdc33620fdd262102ac32112be0352a7f77e9614b35a394
+size 1216064769

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.4.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "num_train_timesteps": 1000,
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "trained_betas": null
+}