import os
import subprocess
import sys

# Clone required repositories
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))

# Clone repositories before importing
clone_repositories()
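
# Note: appending each repo to sys.path makes its top-level package
# (e.g. IndicTransToolkit) importable below without a pip install.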

import streamlit as st
import torch
import librosa
import matplotlib.pyplot as plt
from PIL import Image
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
import stanza
import numpy as np
from IndicTransToolkit import IndicProcessor

class TransGen:
    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        # Load the IndicTrans2 translation model in 4-bit to reduce VRAM usage;
        # bitsandbytes places the quantized weights on the GPU automatically.
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model, trust_remote_code=True, quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Text-to-image pipeline for the first frame, image-to-image for refinements;
        # use the same half-precision dtype for both pipelines.
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(
            stable_diff_model, scheduler=scheduler, torch_dtype=torch.float16
        )
        self.pipe = self.pipe.to("cuda")
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            stable_diff_model, torch_dtype=torch.float16
        )
        self.img2img_pipe = self.img2img_pipe.to('cuda')
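        # Sketch (assumes a recent diffusers release): the two pipelines above hold
        # separate copies of the same weights; to save VRAM you could instead share
        # components between them:
        #   self.img2img_pipe = StableDiffusionImg2ImgPipeline(**self.pipe.components)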

    def translate(self, input_sentences):
        # Normalize the batch and tag it with source/target language codes.
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # the quantized model lives on the GPU
        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        # Restore entities/formatting stripped during preprocessing.
        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations
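
    # Illustrative usage (the output shown is hypothetical):
    #   transgen = TransGen()
    #   transgen.translate(["एक लड़का बगीचे में खेल रहा है"])
    #   -> ['A boy is playing in the garden']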

    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        # Sanitize the slider values coming from the UI.
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))

        if prev_image is not None:
            # Refine the previous frame so consecutive images stay visually coherent.
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image

        # First frame: plain text-to-image.
        return self.pipe(prompt, guidance_scale=guidance_scale).images[0]
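
    # Note on `strength`: in the diffusers img2img pipeline it controls how much
    # noise is added to `prev_image` before denoising; near 0.0 the output stays
    # close to the previous frame, while 1.0 effectively ignores it.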

    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image

def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        # The language hint is a generation argument, not a model-loading argument.
        generate_kwargs={"language": "hi"}
    )
    waveform, sample_rate = torchaudio.load(audio_path)
    # Whisper expects mono 16 kHz input: downmix stereo and resample if needed.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
    return result["text"]

# Download Stanza resources for Hindi tokenization and POS tagging
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')

def POS_policy(input_text):
    """Return the index of the last NOUN or VERB in the final sentence,
    or 0 if none is found. Used to decide when the growing transcript
    contains enough content to trigger image generation."""
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    i = len(words) - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0
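
# Illustrative behaviour (hypothetical input; the exact index depends on the tagger):
#   POS_policy("लड़का बगीचे में")  -> index of the last NOUN/VERB token in the sentence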

@st.cache_resource
def load_transgen():
    # Cache the heavy models across Streamlit reruns instead of reloading
    # them every time the script executes.
    return TransGen()

def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')

    cur_sent = ''
    prev_idx = 0
    prev_image = None
    generated_images = []
    transgen = load_transgen()

    # Feed the transcript word by word; whenever the POS policy finds a new
    # NOUN/VERB anchor, generate an image conditioned on the previous one.
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, prev_image = transgen.run(
                [cur_sent],
                base_strength,
                base_guidance_scale,
                prev_image
            )
            generated_images.append({
                'sentence': cur_sent,
                'image': prev_image
            })
    return generated_images

def main():
    st.title("Audio to Image Generation App")

    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")

    # Strength and guidance-scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)

    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())

        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)

        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])

if __name__ == "__main__":
    main()
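
# Launch with (the filename is hypothetical):
#   streamlit run app.py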