Spaces:

JuanjoJ55
/

Audio_Summarizer

Sleeping

App Files Files Community

JuanjoSG5 committed on Nov 18, 2024

Commit

39cf578

1 Parent(s): 651c146

feat: increased the efficiency of the transcription

Browse files

Files changed (3) hide show

README.md +4 -3
app.py +29 -17
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -9,14 +9,15 @@ app_file: app.py
 pinned: false
 short_description: Transcribes an audio and creates a summary
 ---
 # Limitations
 I have tested the application with audio files of varying lengths. Initially, I attempted processing audios of 1 to 2 hours,
 but due to hardware constraints, my PC was unable to handle files of that size effectively.
-After testing, I found that the application operates best with audio files under 20 minutes, although this 20 minutes should be consider the longest length I would recommend, since the app processes shorter audios much more effectively. For example, a stereo audio file that is around 20 minutes long usually takes about 15 to 18 minutes to process. This processing time may vary depending on the capabilities of your PC.
-For users with high-performance computers, it may be possible to process longer audio files. However, for consistent and reliable results, I recommend audios around the length of 10 to 15 minutes.
 # Main Use

 pinned: false
 short_description: Transcribes an audio and creates a summary
 ---
 # Limitations
 I have tested the application with audio files of varying lengths. Initially, I attempted processing audios of 1 to 2 hours,
 but due to hardware constraints, my PC was unable to handle files of that size effectively.
+S
+After testing, I found that the application operates best with audio files under 20 minutes, although 20 minutes should be considered the longest length I would recommend, since the app processes shorter audios much more effectively. For example, a stereo audio file that is around 20 minutes long usually takes about 10 to 12 minutes to process, but again, I wouldn't recommend using this model for such audio files. This processing time may vary depending on the capabilities of your PC.
+For users with high-performance computers, it may be possible to process longer audio files. However, for consistent and reliable results, I recommend audios around the length of 10 to 15 minutes; processing usually takes about 3 minutes for a 10-minute file and around 5 minutes for a 15-minute file.
 # Main Use

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, BartForConditionalGeneration
 import torch
-import librosa
 # Load BART tokenizer and model for summarization
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
@@ -16,24 +16,37 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 summarizer.to(device)
 def transcribe_and_summarize(audioFile):
-    # Load audio as an array
-    audio, sampling_rate = librosa.load(audioFile, sr=16000)  # Ensure it's 16kHz for Wav2Vec2
-    values = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_values
-    # Move tensors to GPU if available
-    values = values.to(device)
-    # Transcription
-    with torch.no_grad():
-        logits = model(values).logits
-        predictedIDs = torch.argmax(logits, dim=-1)
-        transcription = processor.batch_decode(predictedIDs, skip_special_tokens=True)[0]
-    # Summarization
-    inputs = tokenizer(transcription, return_tensors="pt", truncation=True, max_length=1024)
-    inputs = inputs.to(device)  # Move inputs to GPU
     result = summarizer.generate(
         inputs["input_ids"],
         min_length=10,
@@ -41,12 +54,12 @@ def transcribe_and_summarize(audioFile):
         no_repeat_ngram_size=2,
         encoder_no_repeat_ngram_size=2,
         repetition_penalty=2.0,
-        num_beams=4,
         early_stopping=True,
     )
     summary = tokenizer.decode(result[0], skip_special_tokens=True)
-    return transcription, summary
 # Gradio interface
 iface = gr.Interface(
@@ -58,4 +71,3 @@ iface = gr.Interface(
 )
 iface.launch()

 import gradio as gr
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, BartForConditionalGeneration
 import torch
+import torchaudio  # Replace librosa for faster audio processing
 # Load BART tokenizer and model for summarization
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model.to(device)
 summarizer.to(device)
+model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+summarizer = torch.quantization.quantize_dynamic(summarizer, {torch.nn.Linear}, dtype=torch.qint8)
 def transcribe_and_summarize(audioFile):
+    # Load audio using torchaudio
+    audio, sampling_rate = torchaudio.load(audioFile)
+    # Resample audio to 16kHz if necessary
+    if sampling_rate != 16000:
+        resample_transform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+        audio = resample_transform(audio)
+    audio = audio.squeeze()
+    # Process audio in chunks for large files
+    chunk_size = int(16000 * 30)  # 30-second chunks at 16 kHz
+    transcription = ""
+    for i in range(0, len(audio), chunk_size):
+        chunk = audio[i:i+chunk_size].numpy()
+        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_values.to(device)
+        # Transcription
+        with torch.no_grad():
+            logits = model(inputs).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            transcription += processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + " "
+    # Summarization
+    inputs = tokenizer(transcription, return_tensors="pt", truncation=True, max_length=1024).to(device)
     result = summarizer.generate(
         inputs["input_ids"],
         min_length=10,
         no_repeat_ngram_size=2,
         encoder_no_repeat_ngram_size=2,
         repetition_penalty=2.0,
+        num_beams=2,  # Reduced beams for faster inference
         early_stopping=True,
     )
     summary = tokenizer.decode(result[0], skip_special_tokens=True)
+    return transcription.strip(), summary.strip()
 # Gradio interface
 iface = gr.Interface(
 )
 iface.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 gradio
 transformers
 torch
-librosa

 gradio
 transformers
 torch
+torchaudio