Spaces:

VDNT11
/

AIML_project

Sleeping

App Files Files Community

VDNT11 committed on Nov 22, 2024

Commit

b10f48a

verified ·

1 Parent(s): c3e8af7

Update app.py

Browse files

Files changed (1) hide show

app.py +197 -64

app.py CHANGED Viewed

@@ -1,82 +1,215 @@
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import streamlit as st
-from pydub import AudioSegment
 import os
-import soundfile as sf
-import uuid
-# Set device and dtype
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-@st.cache_resource
-def load_model():
-    # Use a specific Hindi-optimized Whisper model
-    model_id = "openai/whisper-large-v2"  # or consider a multilingual model
-    # For Hindi, you might want to specify additional parameters
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True,
-        use_safetensors=True,
     )
     model.to(device)
-    # Use the processor from the same model
     processor = AutoProcessor.from_pretrained(model_id)
-    # Create pipeline with language specification
-    pipe = pipeline(
         "automatic-speech-recognition",
         model=model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         torch_dtype=torch_dtype,
         device=device,
-        generate_kwargs={"language": "hi"}  # Specify Hindi language
     )
-    return pipe, processor
-# Load model and processor
-pipe, processor = load_model()
-# Streamlit UI
-st.title("Hindi Audio to Text Transcription")
-uploaded_file = st.file_uploader(
-    "Upload a .wav audio file for transcription", type=["wav"]
-)
-if uploaded_file is not None:
-    st.info("Processing uploaded file...")
-    temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
-    with open(temp_filename, "wb") as f:
-        f.write(uploaded_file.read())
-    # Preprocess the audio
-    sound = AudioSegment.from_file(temp_filename)
-    sound = sound.set_channels(1)  # Convert to mono
-    sound.export(temp_filename, format="wav")  # Save the processed file
-    audio, _ = sf.read(temp_filename)  # Read audio data
-    # Preprocess the audio for the model
-    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Perform transcription
-    with torch.no_grad():
-        outputs = pipe.model.generate(**inputs)
-        transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    # Display the transcription
-    st.success("Transcription complete!")
-    st.markdown(f"### Transcription:\n\n{transcription}")
-    os.remove(temp_filename)  # Clean up temporary file
-else:
-    st.warning("Please upload a .wav file to start transcription.")

 import streamlit as st
+import torch
+import librosa
+import matplotlib.pyplot as plt
+from PIL import Image
 import os
+# Speech recognition, translation, image generation and POS-tagging dependencies
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import torchaudio
+import torch
+from transformers import (
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+)
+from IndicTransToolkit import IndicProcessor
+from transformers import BitsAndBytesConfig
+from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+from diffusers import StableDiffusionImg2ImgPipeline
+import stanza
+class TransGen:
+    """Translate Indic text to English and turn the translation into an image.
+
+    Bundles three models: an IndicTrans2 seq2seq translator loaded with 4-bit
+    bitsandbytes quantization, a Stable Diffusion text-to-image pipeline and a
+    Stable Diffusion image-to-image pipeline used to evolve a previous frame.
+    """
+
+    def __init__(self, translation_model="ai4bharat/indictrans2-indic-en-1B",
+                 stable_diff_model="stabilityai/stable-diffusion-2-base",
+                 src_lang='hin_Deva', tgt_lang='eng_Latn'):
+        """Load the translator and both diffusion pipelines.
+
+        Args:
+            translation_model: Hub id of the seq2seq translation checkpoint.
+            stable_diff_model: Hub id of the Stable Diffusion checkpoint used
+                for both the text-to-image and image-to-image pipelines.
+            src_lang: Source language tag passed to ``IndicProcessor``.
+            tgt_lang: Target language tag passed to ``IndicProcessor``.
+
+        Both diffusion pipelines are moved to ``"cuda"`` unconditionally, so a
+        CUDA device is required.
+        """
+        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(translation_model, trust_remote_code=True, quantization_config=self.bnb_config)
+        self.ip = IndicProcessor(inference=True)
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        # Text-to-image pipeline: Euler scheduler, bfloat16 weights.
+        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
+        self.pipe = StableDiffusionPipeline.from_pretrained(stable_diff_model, scheduler=scheduler, torch_dtype=torch.bfloat16)
+        self.pipe = self.pipe.to("cuda")
+        # Image-to-image pipeline: checkpoint's default scheduler, float16 weights.
+        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(stable_diff_model, torch_dtype=torch.float16)
+        self.img2img_pipe = self.img2img_pipe.to('cuda')
+
+    def translate(self, input_sentences):
+        """Translate a batch of sentences from ``src_lang`` to ``tgt_lang``.
+
+        Args:
+            input_sentences: List of source-language strings.
+
+        Returns:
+            The result of ``IndicProcessor.postprocess_batch`` on the decoded
+            beam-search output (one translation per input sentence).
+        """
+        batch = self.ip.preprocess_batch(
+            input_sentences,
+            src_lang=self.src_lang,
+            tgt_lang=self.tgt_lang,
+        )
+        # The tokenizer builds CPU tensors; generate() needs them on the same
+        # device as the model weights, so move the whole encoding across.
+        inputs = self.tokenizer(
+            batch,
+            truncation=True,
+            padding="longest",
+            return_tensors="pt",
+            return_attention_mask=True,
+        ).to(self.model.device)
+        with torch.no_grad():
+            generated_tokens = self.model.generate(
+                **inputs,
+                use_cache=True,
+                min_length=0,
+                max_length=256,
+                num_beams=5,
+                num_return_sequences=1,
+            )
+        # Decode with the tokenizer switched to its target-language mode.
+        with self.tokenizer.as_target_tokenizer():
+            generated_tokens = self.tokenizer.batch_decode(
+                generated_tokens.detach().cpu().tolist(),
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True,
+            )
+        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
+        return translations
+
+    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
+        """Render ``prompt``, optionally starting from a previous image.
+
+        Args:
+            prompt: English text prompt.
+            prev_image: Image to evolve with the image-to-image pipeline, or
+                ``None`` to generate from scratch with the text-to-image one.
+            strength: Image-to-image strength; ``None`` means 1.0 and the value
+                is clamped to ``[0.0, 1.0]``. Unused when ``prev_image`` is None.
+            guidance_scale: Classifier-free guidance scale; ``None`` means 7.5.
+
+        Returns:
+            The first image produced by the selected pipeline.
+        """
+        strength = float(strength) if strength is not None else 1.0
+        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
+        strength = max(0.0, min(1.0, strength))
+        if prev_image is not None:
+            return self.img2img_pipe(
+                prompt,
+                image=prev_image,
+                strength=strength,
+                guidance_scale=guidance_scale,
+                negative_prompt='generate text in image'
+            ).images[0]
+        # Honour the caller's guidance scale on the first frame too; it used
+        # to be dropped here so only image-to-image calls respected it.
+        return self.pipe(prompt, guidance_scale=guidance_scale).images[0]
+
+    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
+        """Translate ``input_sentences`` and render the first translation.
+
+        Args:
+            input_sentences: Non-empty list of source-language strings; only
+                the first translation is used as the image prompt.
+            strength: Forwarded to :meth:`generate_image`.
+            guidance_scale: Forwarded to :meth:`generate_image`.
+            prev_image: Optional previous frame to evolve.
+
+        Returns:
+            Tuple ``(translated_sentence, image)``.
+        """
+        translations = self.translate(input_sentences)
+        sentence = translations[0]
+        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
+        return sentence, image
+# Initialize global variables
+stanza.download('hi')
+
+
+@st.cache_resource
+def _load_transgen():
+    """Build the TransGen model bundle once per server process.
+
+    Streamlit re-executes this script on every widget interaction; without
+    the cache the translation and diffusion models would be reloaded each time.
+    """
+    return TransGen()
+
+
+transgen = _load_transgen()
+@st.cache_resource
+def _load_whisper_pipeline():
+    """Build the Whisper large-v3 ASR pipeline once and reuse it across calls."""
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model_id = "openai/whisper-large-v3"
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
     )
     model.to(device)
     processor = AutoProcessor.from_pretrained(model_id)
+    return pipeline(
         "automatic-speech-recognition",
         model=model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         torch_dtype=torch_dtype,
         device=device,
+        # The language hint is a generation argument; as `model_kwargs` it was
+        # never forwarded to generate(), leaving Whisper to auto-detect.
+        generate_kwargs={"language": "hi"},
     )
+
+
+def transcribe_audio_to_hindi(audio_path: str) -> str:
+    """Transcribe the audio file at ``audio_path`` to Hindi text.
+
+    The audio is loaded with torchaudio, down-mixed to mono, resampled to
+    16 kHz when needed and passed to the cached Whisper pipeline.
+
+    Args:
+        audio_path: Path to an audio file readable by ``torchaudio.load``.
+
+    Returns:
+        The ``"text"`` field of the pipeline result.
+    """
+    whisper_pipe = _load_whisper_pipeline()
+    waveform, sample_rate = torchaudio.load(audio_path)
+    # torchaudio returns (channels, frames). Average the channels so stereo
+    # files become the single-channel signal the ASR pipeline expects; for
+    # mono input this is equivalent to dropping the channel axis.
+    waveform = waveform.mean(dim=0)
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+    result = whisper_pipe(waveform.cpu().numpy(), return_timestamps=True)
+    return result["text"]
+nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')
+def POS_policy(input):
+    """Locate the last noun or verb in the final sentence of ``input``.
+
+    Args:
+        input: Hindi text to tag. (The name shadows the builtin but is kept
+            so existing keyword callers continue to work.)
+
+    Returns:
+        The index, within the last sentence's word list, of the right-most
+        word tagged ``NOUN`` or ``VERB``. Returns 0 when there is none; index 0
+        itself is never examined, so 0 always means "no trigger word".
+    """
+    doc = nlp(input)
+    # Guard against text that yields no sentences (e.g. empty or whitespace)
+    # instead of failing with IndexError on sentences[-1].
+    if not doc.sentences:
+        return 0
+    words = doc.sentences[-1].words
+    # Scan right-to-left and stop before index 0, which is reserved as the
+    # "not found" sentinel that callers test against.
+    for i in range(len(words) - 1, 0, -1):
+        if words[i].upos in ('NOUN', 'VERB'):
+            return i
+    return 0
+def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
+    """Transcribe an audio file and generate an image sequence from it.
+
+    The transcript is replayed word by word. Each time ``POS_policy`` reports
+    a new non-zero noun/verb position for the sentence built so far, the
+    partial sentence is translated and rendered, using the previously
+    generated image (if any) as the starting point for the next one.
+
+    Args:
+        audio_path: Path of the audio file to transcribe.
+        base_strength: Image-to-image strength forwarded to ``transgen.run``.
+        base_guidance_scale: Guidance scale forwarded to ``transgen.run``.
+
+    Returns:
+        List of dicts with keys ``'sentence'`` (the partial source sentence)
+        and ``'image'`` (the image generated for it), in generation order.
+    """
+    text_tot = transcribe_audio_to_hindi(audio_path)
+    st.write(f'Transcripted sentence: {text_tot}')
+    cur_sent = ''
+    prev_idx = 0
+    # Explicit "no previous frame yet" state instead of probing locals().
+    prev_image = None
+    generated_images = []
+    for word in text_tot.split():
+        cur_sent += word + ' '
+        str_idx = POS_policy(cur_sent)
+        # Skip when no noun/verb was found or its position has not changed.
+        if str_idx == 0 or str_idx == prev_idx:
+            continue
+        prev_idx = str_idx
+        _, prev_image = transgen.run(
+            [cur_sent],
+            base_strength,
+            base_guidance_scale,
+            prev_image
+        )
+        generated_images.append({
+            'sentence': cur_sent,
+            'image': prev_image
+        })
+    return generated_images
+def main():
+    """Render the Streamlit UI: upload a WAV file, tune parameters, show images."""
+    import tempfile
+
+    st.title("Audio to Image Generation App")
+    # File uploader
+    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")
+    # Strength and Guidance Scale sliders
+    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
+    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)
+    if uploaded_file is None:
+        return
+    # Write the upload to a uniquely named temp file so concurrent sessions do
+    # not overwrite each other. delete=False lets the file be closed before
+    # it is reopened by path for transcription.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(uploaded_file.getvalue())
+        audio_path = f.name
+    try:
+        # Generate images
+        st.write("Generating Images...")
+        generated_images = generate_images_from_audio(audio_path, base_strength, base_guidance_scale)
+    finally:
+        # Always remove the temp file, even if generation fails.
+        os.remove(audio_path)
+    # Display generated images
+    st.write("Generated Images:")
+    for img_data in generated_images:
+        st.image(img_data['image'], caption=img_data['sentence'])
+if __name__ == "__main__":
+    main()