import streamlit as st
import os
import tempfile
from moviepy.editor import ImageSequenceClip, concatenate_videoclips
from PIL import Image
import torch
from diffusers import AudioLDMPipeline
from transformers import AutoProcessor, ClapModel, BlipProcessor, BlipForConditionalGeneration
# make Space compatible with CPU duplicates
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
# load the diffusers pipeline
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
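# torch.compile (PyTorch 2.x) compiles the UNet once; the first generation pays a one-time compilation cost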
pipe.unet = torch.compile(pipe.unet)
# CLAP model (only required for automatic scoring)
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
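# shared RNG so the seed chosen in the UI makes music generation reproducible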
generator = torch.Generator(device)
# Load the BLIP model and processor for image captioning
image_caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
image_caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
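# Note: the captioning model is never moved to `device`, so captioning runs on the CPU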
# Streamlit app setup
st.set_page_config(
    page_title="Text to Media",
    page_icon="📷 🎵",
)
st.title("Générateur de Diaporama Vidéo et Musique")
# Select the images
uploaded_files = st.file_uploader("Sélectionnez des images (PNG, JPG, JPEG)", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

if uploaded_files:
    # Create a temporary directory to store the images
    temp_dir = tempfile.mkdtemp()

    # Save the uploaded images to the temporary directory
    image_paths = []
    descriptions = []  # store the generated captions
    for i, uploaded_file in enumerate(uploaded_files):
        image_path = os.path.join(temp_dir, uploaded_file.name)
        with open(image_path, 'wb') as f:
            f.write(uploaded_file.read())
        image_paths.append(image_path)
        # Generate a caption for each image with BLIP
        try:
            image = Image.open(image_path).convert("RGB")
            inputs = image_caption_processor(image, return_tensors="pt")
            out = image_caption_model.generate(**inputs)
            caption = image_caption_processor.decode(out[0], skip_special_tokens=True)
            descriptions.append(caption)
        except Exception:
            descriptions.append("Erreur lors de la génération de la légende")
    # Display the images with their captions
    for i, image_path in enumerate(image_paths):
        st.image(image_path, caption=f"Description : {descriptions[i]}", use_column_width=True)
    # Create a slideshow video from the images
    st.header("Création d'une Diapositive Vidéo")

    # Choose how long each image is displayed (in seconds) with a slider
    image_duration = st.slider("Sélectionnez la durée d'affichage de chaque image (en secondes)", 1, 10, 4)

    # Frame rate (one frame per image, derived from the display duration)
    frame_rate = 1 / image_duration
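    # e.g. a 4-second duration gives fps = 0.25, so each image is a single frame held for 4 s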
    image_clips = [ImageSequenceClip([image_path], fps=frame_rate, durations=[image_duration]) for image_path in image_paths]
    final_clip = concatenate_videoclips(image_clips, method="compose")
    final_clip_path = os.path.join(temp_dir, "slideshow.mp4")
    final_clip.write_videofile(final_clip_path, codec='libx264', fps=frame_rate)
    # Display the video
    st.video(open(final_clip_path, 'rb').read())
    # Generate music from the captions
    st.header("Génération de Musique à partir des Descriptions")

    # Use the generated captions as the text prompt for the music
    music_input = "\n".join(descriptions)
    st.text_area("Descriptions pour la musique", music_input, height=200)

    # Music generation settings
    seed = st.number_input("Seed", value=45)
    duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
    guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
    n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
    def score_waveforms(text, waveforms):
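        """Score each candidate waveform against the text prompt with CLAP and return the best match."""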
        inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
        inputs = {key: inputs[key].to(device) for key in inputs}
        with torch.no_grad():
            logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
        probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
        waveform = waveforms[most_probable]
        return waveform
    if st.button("Générer de la musique"):
        waveforms = pipe(
            music_input,
            audio_length_in_s=duration,
            guidance_scale=guidance_scale,
            num_inference_steps=100,
            num_waveforms_per_prompt=n_candidates if n_candidates else 1,
            generator=generator.manual_seed(int(seed)),
        )["audios"]
        if waveforms.shape[0] > 1:
            waveform = score_waveforms(music_input, waveforms)
        else:
            waveform = waveforms[0]

        # Display the audio player
        st.audio(waveform, format="audio/wav", sample_rate=16000)