# imports for the model and dataset handling
from datasets import Audio, load_dataset, Dataset
import torch
from transformers import pipeline
from pathlib import Path
# for the UI by Gradio
import gradio as gr
import pandas as pd

# initializing the values for device and dtype
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    TORCH_DTYPE = torch.float16
else:
    DEVICE = "cpu"
    TORCH_DTYPE = torch.float32

# MODEL_NAME = 'openai/whisper-large-v3'
MODEL_NAME = 'openai/whisper-small'

pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device=DEVICE,
)
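# A minimal sanity check (the file name "sample.wav" is a hypothetical local path):
# the ASR pipeline also accepts a plain file path and returns a dict with a "text" key, e.g.
#   print(pipe("sample.wav", generate_kwargs={"task": "transcribe"})["text"])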
def convert_audio_2_array(files):
    file_paths = files
    complaint_data = Dataset.from_dict({
        "audio": [file_paths]  # path to be changed based on where the uploaded file is stored
    }).cast_column("audio", Audio(sampling_rate=16000))
    # after the cast, each "audio" entry is a dict with "array", "sampling_rate" and "path"
    file_name = Path(file_paths).name
    print(f"file_paths: \n {file_paths} and \n file_name: {file_name} and \n complaint_data: \n {complaint_data}")
    return file_name, complaint_data
def v2t_convertor(files):
    file_name, inputs_dict = convert_audio_2_array(files)
    sample = inputs_dict[0]  # selecting only one input
    org_complain_dict = pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    print(f'{org_complain_dict}')
    org_complain = org_complain_dict['text']
    # lang_token = pipe.model.generate(sample, max_new_tokens=1)[0, 1]
    # language_code = pipe.tokenizer.decode(lang_token)
    language_code = "hi"  # hard coded for the time being
    eng_complain = pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task": "translate"})['text']
    print(f"org_complain: \n {org_complain} \nand\n eng_complain:\n {eng_complain} \n language_code: {language_code}")
    # one row matching the DataFrame columns defined in upload_file
    return [[file_name, org_complain, eng_complain, language_code]]
def upload_file(files):  # the actual transcription and translation happen here
    """
    Takes the file that comes from the UI and converts it to the respective
    format to be sent to the model for transcription.
    """
    # Define the column names
    columns = ["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"]
    # Define the data as lists. Here the data is sent as a single row of the table.
    # data = [["ca_1.wav", "बिना किसी पूर्व सूचना के विलंबित या रद्द की गई ट्रिनिक", "without any prior information or any delay or delay in the train journey", "hindi"]]
    data = v2t_convertor(files)
    # Create the DataFrame output
    if data is not None:
        df = pd.DataFrame(data, columns=columns)
    else:
        raise ValueError("Data is None. Cannot create DataFrame.")
    return df
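# For local testing without the UI, upload_file can be called directly with the
# path to an audio file (hypothetical path shown), e.g.:
#   print(upload_file("samples/ca_1.wav"))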
with gr.Blocks() as demo:
    gr.Markdown("# V2T Engine")
    with gr.Accordion("Steps to run the App"):
        gr.Markdown("1. Click \"Click to Upload a File\" to open a dialog box and browse your local files.")
        gr.Markdown("2. The uploaded audio will be transcribed in the original language and translated into English.")
        gr.Markdown("3. The transcriptions and translations will be displayed in a table, which is how they will be stored in the database.")
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio"], file_count="single")
    df_output = gr.Dataframe(
        headers=["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"],
        datatype=["str", "str", "str", "str"],
        row_count=1,
        col_count=(4, "fixed"),
        wrap=True
    )
    # uploading an audio file sends it to upload_file and shows the result in the table
    upload_button.upload(upload_file, upload_button, df_output, show_progress=True)

if __name__ == "__main__":
    demo.launch(debug=True)