baharbhz committed on
Commit
5124efb
·
verified ·
1 Parent(s): 55a8763

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -16
app.py CHANGED
@@ -37,16 +37,28 @@ output_dir = "extracted_model"
37
  subprocess.run(["unzip", zip_file, "-d", output_dir], check=True)
38
 
39
 
 
 
 
 
 
 
40
  # Function for inference from an audio file path
41
  def infer_from_audio_file(audio_file_path, model, processor, device="cpu"):
42
  # Load audio file
43
  audio, sampling_rate = librosa.load(audio_file_path, sr=16000)
44
 
45
- # Process the audio using the feature extractor from the processor
46
- inputs = processor(audio, sampling_rate=sampling_rate).input_values[0]
47
- input_features = [{"input_values": inputs}]
 
 
 
 
 
 
48
 
49
- batch = processor.pad(
50
  input_features,
51
  padding=True,
52
  max_length=None,
@@ -54,22 +66,26 @@ def infer_from_audio_file(audio_file_path, model, processor, device="cpu"):
54
  return_tensors="pt",
55
  )
56
 
57
- # Move inputs to the correct device
58
- input_values = batch.input_values.to(device)
59
 
60
- # Ensure the model is in evaluation mode
61
- model.eval()
62
 
63
- with torch.no_grad():
64
- # Make predictions
65
- outputs = model(input_values)
66
- logits = outputs.logits
 
 
 
 
67
 
68
- # Decode the predictions
69
- pred_ids = torch.argmax(logits, dim=-1)
70
- pred_str = processor.batch_decode(pred_ids.cpu().numpy())
71
 
72
- return pred_str[0] # Return the decoded transcription of the audio
 
 
73
 
74
 
75
  tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")
 
37
  subprocess.run(["unzip", zip_file, "-d", output_dir], check=True)
38
 
39
 
40
# Function to split audio into chunks
def split_audio(audio, sampling_rate, chunk_size=30):
    """Slice *audio* into consecutive pieces of at most *chunk_size* seconds.

    The last piece may be shorter when the signal length is not an exact
    multiple of the chunk length. An empty signal yields an empty list.
    """
    samples_per_chunk = chunk_size * sampling_rate
    pieces = []
    start = 0
    while start < len(audio):
        pieces.append(audio[start:start + samples_per_chunk])
        start += samples_per_chunk
    return pieces
45
+
46
# Function for inference from an audio file path
def infer_from_audio_file(audio_file_path, model, processor, device="cpu"):
    """Transcribe an audio file with a CTC speech-recognition model.

    The audio is loaded at a fixed 16 kHz sampling rate, split into
    chunks of at most 30 seconds via ``split_audio``, transcribed chunk
    by chunk, and the partial transcriptions are joined with single
    spaces.

    Args:
        audio_file_path: Path to an audio file readable by ``librosa.load``.
        model: Torch model whose output exposes ``.logits``
            (presumably a Wav2Vec2-style CTC model — TODO confirm).
        processor: Object providing ``__call__`` (feature extraction),
            ``pad`` and ``batch_decode`` (e.g. a Wav2Vec2Processor).
        device: Torch device string the padded inputs are moved to.

    Returns:
        str: The full transcription of the file.
    """
    # Load audio file, resampled to 16 kHz
    audio, sampling_rate = librosa.load(audio_file_path, sr=16000)

    # Split audio into chunks of at most 30 seconds
    chunks = split_audio(audio, sampling_rate)

    # Put the model in evaluation mode once, before the loop.
    # (Hoisted: the original called .eval() on every chunk, which is
    # loop-invariant and redundant.)
    model.eval()

    transcriptions = []

    for chunk in chunks:
        # Extract input features for this chunk
        inputs = processor(chunk, sampling_rate=sampling_rate).input_values[0]
        input_features = [{"input_values": inputs}]

        batch = processor.pad(
            input_features,
            padding=True,
            max_length=None,
            return_tensors="pt",
        )

        # Move inputs to the correct device
        input_values = batch.input_values.to(device)

        with torch.no_grad():
            # Forward pass; no gradients needed for inference
            outputs = model(input_values)
            logits = outputs.logits

        # Greedy decoding: pick the highest-scoring token per frame
        pred_ids = torch.argmax(logits, dim=-1)
        pred_str = processor.batch_decode(pred_ids.cpu().numpy())

        transcriptions.append(pred_str[0])

    # Concatenate the per-chunk transcriptions into one string
    return ' '.join(transcriptions)
89
 
90
 
91
  tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")