from transformers import pipeline
# from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
# import torch
# from datasets import load_dataset
# import soundfile as sf
# import matplotlib.pyplot as plt


class Trans:
    """Base wrapper around a Hugging Face pipeline.

    Lifecycle: construct with a model name, call ``initiate()`` to load the
    pipeline (sets ``self.state`` and, in subclasses, ``self.pipe``), then
    ``predict()`` to run it and ``output()`` to extract the result.
    Subclasses override ``initiate``/``output``/``process``.
    """

    def __init__(self, model_name):
        # Name/path of the Hugging Face model this wrapper will load.
        self.model_name = model_name
        # False until initiate() has been called; predict()/output() refuse to run before that.
        self.state = False

    def initiate(self, **kwargs):
        # Base class only flips the flag; subclasses also build self.pipe here.
        self.state = True

    def predict(self, input):
        """Run the loaded pipeline on *input* and cache the raw result.

        Raises:
            Exception: if initiate() has not been called first.

        NOTE: relies on a subclass having set ``self.pipe`` in initiate();
        the base initiate() does not, so calling predict() on a bare Trans
        raises AttributeError.
        """
        if not self.state:
            raise Exception('Model not initiated')
        self.response = self.pipe(input)

    def output(self):
        # Result extraction is model-specific; subclasses implement it.
        pass

    def process(self):
        # One-shot initiate/predict/output convenience; subclasses implement it.
        pass


class Recognizer(Trans):
    """Automatic speech recognition: audio file in, transcribed text out."""

    def __init__(self, model_name):
        super().__init__(model_name)

    def initiate(self, **kwargs):
        """Load the ASR pipeline for ``self.model_name``."""
        self.state = True
        self.pipe = pipeline("automatic-speech-recognition", self.model_name, **kwargs)

    # def predict(self, file):
    #     if self.state:
    #         self.response = self.pipe(file)
    #     else:
    #         raise Exception('Model not initiated')

    def output(self):
        """Return the transcribed text, or None (with a message) if predict() never ran.

        Raises:
            Exception: if initiate() has not been called.
        """
        if self.state:
            try:
                return self.response['text']
            except AttributeError:
                # self.response only exists after a successful predict().
                print('Error: No file was transcribed')
        else:
            raise Exception('Error: Model not initiated')

    def process(self, input):
        """Convenience wrapper: initiate, transcribe *input*, return the text."""
        self.initiate()
        self.predict(input=input)
        return self.output()


# class Speaker(Trans):
#     def __init__(self, model_name):
#         super().__init__(model_name)
#     def initiate(self, **kwargs):
#         self.state = True
#         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.model_name)
#         self.processor = SpeechT5Processor.from_pretrained(self.model_name)
#         self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
#     def predict(self, text):
#         if self.state:
#             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#             self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#             inputs = self.processor(text=text, return_tensors="pt")
#             self.response = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
#         else:
#             raise Exception('Model not initiated')
#     def save(self, file_name="speech.wav"):
#         if self.state:
#             try:
#                 sf.write(file_name,
#                     self.response.numpy(), samplerate=16000)
#             except AttributeError:
#                 print('Error: No text was converted to audio')
#         else:
#             raise Exception('Error: Model not initiated')
#     def spectogram(self):
#         plt.figure()
#         plt.imshow(self.response.T)
#         plt.show()
#     def process(self, input, file_name="speech.wav"):
#         self.initiate()
#         self.predict(text=input)
#         self.save(file_name)


class Translator(Trans):
    """Text translation: source-language text in, translated text out."""

    def __init__(self, model_name):
        super().__init__(model_name)

    def initiate(self, **kwargs):
        """Load the translation pipeline for ``self.model_name``."""
        self.state = True
        self.pipe = pipeline("translation", self.model_name, **kwargs)

    def output(self):
        """Return the translated text, or None (with a message) if predict() never ran.

        Raises:
            Exception: if initiate() has not been called.
        """
        if self.state:
            try:
                # Translation pipelines return a list of dicts, one per input.
                return self.response[0]['translation_text']
            except AttributeError:
                # Fixed copy-pasted message from Recognizer ("No file was transcribed"):
                # this class translates text, it does not transcribe files.
                print('Error: No text was translated')
        else:
            raise Exception('Error: Model not initiated')

    def process(self, input):
        """Convenience wrapper: initiate, translate *input*, return the text."""
        self.initiate()
        self.predict(input=input)
        return self.output()


# Example usage:
# r = Recognizer('openai/whisper-tiny.en')
# text_eng = r.process('preamble10.wav')
# print(text_eng)
# t = Translator("Helsinki-NLP/opus-mt-en-fr")
# fre_text = t.process(text_eng)
# print(fre_text)
# s = Speaker("microsoft/speecht5_tts")
# s.process(fre_text)
# with open("output.wav", "rb") as f:
#     data = f.read()
# pipe = pipeline("automatic-speech-recognition", "openai/whisper-tiny.en")
# print(pipe('preamble10.wav'))