import os
import subprocess
import sys

# Clone required repositories
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))

# Clone repositories before importing
clone_repositories()
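
# Note: appending each repo to sys.path makes its top-level package
# (e.g. IndicTransToolkit) importable below without a pip install.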

import streamlit as st
import torch
import librosa
import matplotlib.pyplot as plt
from PIL import Image
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
import stanza
import numpy as np
from IndicTransToolkit import IndicProcessor

class TransGen:
    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        # Load the IndicTrans2 translation model in 4-bit to reduce VRAM usage;
        # bitsandbytes places the quantized weights on the GPU automatically.
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model, trust_remote_code=True, quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Text-to-image pipeline for the first frame, image-to-image for refinements;
        # use the same half-precision dtype for both pipelines.
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(
            stable_diff_model, scheduler=scheduler, torch_dtype=torch.float16
        )
        self.pipe = self.pipe.to("cuda")
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            stable_diff_model, torch_dtype=torch.float16
        )
        self.img2img_pipe = self.img2img_pipe.to('cuda')
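        # Sketch (assumes a recent diffusers release): the two pipelines above hold
        # separate copies of the same weights; to save VRAM you could instead share
        # components between them:
        #   self.img2img_pipe = StableDiffusionImg2ImgPipeline(**self.pipe.components)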

    def translate(self, input_sentences):
        # Normalize the batch and tag it with source/target language codes.
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # the quantized model lives on the GPU
        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        # Restore entities/formatting stripped during preprocessing.
        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations
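
    # Illustrative usage (the output shown is hypothetical):
    #   transgen = TransGen()
    #   transgen.translate(["एक लड़का बगीचे में खेल रहा है"])
    #   -> ['A boy is playing in the garden']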

    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        # Sanitize the slider values coming from the UI.
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))

        if prev_image is not None:
            # Refine the previous frame so consecutive images stay visually coherent.
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image

        # First frame: plain text-to-image.
        return self.pipe(prompt, guidance_scale=guidance_scale).images[0]
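
    # Note on `strength`: in the diffusers img2img pipeline it controls how much
    # noise is added to `prev_image` before denoising; near 0.0 the output stays
    # close to the previous frame, while 1.0 effectively ignores it.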

    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image

def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        # The language hint is a generation argument, not a model-loading argument.
        generate_kwargs={"language": "hi"}
    )
    waveform, sample_rate = torchaudio.load(audio_path)
    # Whisper expects mono 16 kHz input: downmix stereo and resample if needed.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
    return result["text"]

# Download Stanza resources for Hindi tokenization and POS tagging
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')

def POS_policy(input_text):
    """Return the index of the last NOUN or VERB in the final sentence,
    or 0 if none is found. Used to decide when the growing transcript
    contains enough content to trigger image generation."""
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    i = len(words) - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0
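
# Illustrative behaviour (hypothetical input; the exact index depends on the tagger):
#   POS_policy("लड़का बगीचे में")  -> index of the last NOUN/VERB token in the sentence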

@st.cache_resource
def load_transgen():
    # Cache the heavy models across Streamlit reruns instead of reloading
    # them every time the script executes.
    return TransGen()

def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')

    cur_sent = ''
    prev_idx = 0
    prev_image = None
    generated_images = []
    transgen = load_transgen()

    # Feed the transcript word by word; whenever the POS policy finds a new
    # NOUN/VERB anchor, generate an image conditioned on the previous one.
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, prev_image = transgen.run(
                [cur_sent],
                base_strength,
                base_guidance_scale,
                prev_image
            )
            generated_images.append({
                'sentence': cur_sent,
                'image': prev_image
            })
    return generated_images

def main():
    st.title("Audio to Image Generation App")

    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")

    # Strength and guidance-scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)

    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())

        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)

        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])

if __name__ == "__main__":
    main()
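
# Launch with (the filename is hypothetical):
#   streamlit run app.py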