# speechtotext/recognizer.py
# Author: Cathaltwo — "first commit of files" (commit e29c878)
from transformers import pipeline#,SpeechT5ForTextToSpeech,SpeechT5Processor,SpeechT5HifiGan
# import torch
# from datasets import load_dataset
# import soundfile as sf
# import matplotlib.pyplot as plt
class Trans:
    """Base class for transformer-pipeline tasks.

    Tracks whether a model has been initiated before allowing predictions.
    Subclasses are expected to build ``self.pipe`` in ``initiate`` and to
    interpret the raw pipeline response in ``output``/``process``.
    """

    def __init__(self, model_name):
        # Model identifier consumed by the subclass's pipeline constructor.
        self.model_name = model_name
        # Flips to True once initiate() has been called.
        self.state = False

    def initiate(self, **kwargs):
        """Mark the model as ready; subclasses also build ``self.pipe`` here."""
        self.state = True

    def predict(self, input):
        """Run the pipeline on *input* and store the raw result in ``self.response``.

        Raises:
            Exception: if ``initiate`` has not been called first.
        """
        if not self.state:
            raise Exception('Model not initiated')
        self.response = self.pipe(input)

    def output(self):
        """Subclass hook: extract the useful value from ``self.response``."""
        pass

    def process(self):
        """Subclass hook: one-shot initiate/predict/output convenience."""
        pass
class Recognizer(Trans):
    """Speech-to-text wrapper around a transformers ASR pipeline."""

    def __init__(self, model_name):
        super().__init__(model_name)

    def initiate(self, **kwargs):
        """Build the automatic-speech-recognition pipeline and mark ready.

        Extra keyword arguments are forwarded to ``transformers.pipeline``.
        """
        self.state = True
        self.pipe = pipeline("automatic-speech-recognition", self.model_name, **kwargs)

    def output(self):
        """Return the transcribed text from the last ``predict`` call.

        Returns None (after printing a message) when ``predict`` was never
        called, keeping the original best-effort behavior.

        Raises:
            Exception: if the model was never initiated.
        """
        if not self.state:
            raise Exception('Error: Model not initiated')
        try:
            return self.response['text']
        except AttributeError:
            # self.response does not exist until predict() runs.
            print('Error: No file was transcribed')

    def process(self, input):
        """Convenience: initiate, transcribe *input* (audio file path or
        array accepted by the pipeline), and return the text."""
        self.initiate()
        self.predict(input=input)
        return self.output()
# class Speaker(Trans):
# def __init__(self,model_name):
# super().__init__(model_name)
# def initiate(self,**kwargs):
# self.state=True
# self.model=SpeechT5ForTextToSpeech.from_pretrained(self.model_name)
# self.processor = SpeechT5Processor.from_pretrained(self.model_name)
# self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# def predict(self,text):
# if self.state:
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# inputs= self.processor(text=text, return_tensors="pt")
# self.response= self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
# else:
# raise Exception('Model not initiated')
# def save(self,file_name="speech.wav"):
# if self.state:
# try:
# sf.write(file_name, self.response.numpy(), samplerate=16000)
# except AttributeError:
# print('Error: No text was converted to audio')
# else:
# raise Exception('Error: Model not initiated')
# def spectogram(self):
# plt.figure()
# plt.imshow(self.response.T)
# plt.show()
# def process(self,input,file_name="speech.wav"):
# self.initiate()
# self.predict(text=input)
# self.save(file_name)
class Translator(Trans):
    """Text-translation wrapper around a transformers translation pipeline."""

    def __init__(self, model_name):
        super().__init__(model_name)

    def initiate(self, **kwargs):
        """Build the translation pipeline and mark ready.

        Extra keyword arguments are forwarded to ``transformers.pipeline``.
        """
        self.state = True
        self.pipe = pipeline("translation", self.model_name, **kwargs)

    def output(self):
        """Return the translated text from the last ``predict`` call.

        Returns None (after printing a message) when ``predict`` was never
        called, keeping the original best-effort behavior.

        Raises:
            Exception: if the model was never initiated.
        """
        if not self.state:
            raise Exception('Error: Model not initiated')
        try:
            return self.response[0]['translation_text']
        except AttributeError:
            # Fixed message: was copy-pasted from Recognizer and wrongly
            # referred to a file being transcribed.
            print('Error: No text was translated')

    def process(self, input):
        """Convenience: initiate, translate *input*, and return the text."""
        self.initiate()
        self.predict(input=input)
        return self.output()
# r=Recognizer('openai/whisper-tiny.en')
# text_eng=r.process('preamble10.wav')
# print(text_eng)
# t=Translator("Helsinki-NLP/opus-mt-en-fr")
# fre_text=t.process(text_eng)
# print(fre_text)
# s=Speaker("microsoft/speecht5_tts")
# s.process(fre_text)
# with open("output.wav", "rb") as f:
# data = f.read()
# pipe = pipeline("automatic-speech-recognition", "openai/whisper-tiny.en")
# print(pipe('preamble10.wav'))