"""Gradio demo: transcribe an audio clip, then classify the transcript.

The script loads a speech-recognition model from ``./asr`` and a
text-classification model from ``./tc``, wraps each in a ``transformers``
pipeline, and wires them into a small Gradio Blocks UI.
"""

from transformers import (
    pipeline,
    AutoModel,
    AutoTokenizer,
    AutoFeatureExtractor,
    Wav2Vec2ForCTC,
    AutoModelForSequenceClassification,
)
import gradio as gr

# --- Speech recognition pipeline, loaded from the local "./asr" directory ---
model = Wav2Vec2ForCTC.from_pretrained("./asr")
tokenizer = AutoTokenizer.from_pretrained("./asr")
feature_extractor = AutoFeatureExtractor.from_pretrained("./asr")
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

# --- Text classification pipeline, loaded from the local "./tc" directory ---
# NOTE: `model` and `tokenizer` are rebound here. The `asr` pipeline above
# already holds its own references, so it is unaffected by the rebinding.
model = AutoModelForSequenceClassification.from_pretrained("./tc")
tokenizer = AutoTokenizer.from_pretrained("./tc")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# NOTE(review): an earlier revision built these pipelines from hub models
# instead of local checkpoints, e.g.
# pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
# and pipeline("text-classification") -- confirm whether that fallback is
# still wanted.


def speech_to_text(speech):
    """Transcribe an audio input with the ``asr`` pipeline.

    Args:
        speech: The value supplied by the ``gr.Audio(type="filepath")``
            component; ``None`` (or empty) when no audio has been provided.

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string when
        there is no audio to transcribe.
    """
    # Clicking the button without recording/uploading anything hands the
    # callback no audio; return an empty transcript instead of letting the
    # pipeline raise on a missing input.
    if not speech:
        return ""
    return asr(speech)["text"]


def text_to_sentiment(text):
    """Classify ``text`` with the ``classifier`` pipeline.

    Args:
        text: Current content of the transcript textbox.

    Returns:
        The ``"label"`` of the first classification result, or ``None`` when
        ``text`` is empty or whitespace-only so that no label is displayed.
    """
    # This handler is bound to the textbox's change event, so it also runs
    # when the box is cleared; skip classification when there is nothing to
    # classify.
    if text is None or not text.strip():
        return None
    return classifier(text)[0]["label"]


demo = gr.Blocks()

with demo:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()

    b1 = gr.Button("Recognize Speech")

    # The button fills the textbox with the transcript; any change to the
    # textbox then refreshes the label.
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    text.change(text_to_sentiment, inputs=text, outputs=label)

demo.launch()