File size: 3,172 Bytes
fc9f129
 
 
 
9c9499a
 
fc9f129
 
9c9499a
fc9f129
 
 
 
 
 
 
 
 
9c9499a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc9f129
 
 
9c9499a
938237f
 
 
9c9499a
 
 
 
 
fc9f129
 
 
 
9c9499a
fc9f129
 
9c9499a
fc9f129
 
 
 
 
 
 
 
 
9c9499a
 
 
fc9f129
9c9499a
 
 
 
 
938237f
 
 
 
fc9f129
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
import torch
import librosa
import soundfile as sf
import io

# Load models
# Load models once per session; st.cache_resource keeps them across reruns.
@st.cache_resource
def load_models():
    """Load and cache the speech-recognition and summarization models.

    Returns:
        tuple: (Wav2Vec2Processor, Wav2Vec2ForCTC, summarization pipeline).
    """
    asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    text_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return asr_processor, asr_model, text_summarizer

processor, model, summarizer = load_models()

# Function to convert audio to text
def audio_to_text(audio_file):
    try:
        # Read the audio file from BytesIO
        audio_bytes = audio_file.read()
        audio_file.seek(0)  # Reset the file pointer

        # Use soundfile to read the audio data
        with io.BytesIO(audio_bytes) as f:
            data, samplerate = sf.read(f)

        # Resample to 16kHz if necessary
        if samplerate != 16000:
            data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)

        # Convert to input values for the model
        input_values = processor(data, return_tensors="pt", sampling_rate=16000).input_values

        # Perform inference
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode the output
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        return transcription
    except Exception as e:
        st.error(f"Error processing audio: {e}")
        return None

# Function to summarize text
def summarize_text(text):
    try:
        if len(text.strip()) < 10:  # Check if the text is too short
            st.warning("The transcribed text is too short to summarize.")
            return None
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        st.error(f"Error summarizing text: {e}")
        return None

# Streamlit app
# Streamlit app
def main():
    """Render the UI: upload an audio file, transcribe it, then summarize."""
    st.title("Audio Summarization App")
    st.write("Upload an audio file (WAV or MP3) to get a summary of its content.")

    # File uploader — guard clause: nothing to do until a file arrives
    uploaded = st.file_uploader("Upload Audio File", type=["wav", "mp3"])
    if uploaded is None:
        return

    st.audio(uploaded, format="audio/wav")

    # Wait for the user to explicitly request processing
    if not st.button("Generate Summary"):
        return

    with st.spinner("Processing audio..."):
        # Convert audio to text
        transcription = audio_to_text(uploaded)
        if not transcription:
            st.error("Failed to transcribe the audio. Please check the file format and try again.")
            return

        st.subheader("Transcribed Text:")
        st.write(transcription)

        # Summarize the transcription
        summary = summarize_text(transcription)
        if summary:
            st.subheader("Summary:")
            st.write(summary)
        else:
            st.warning("No summary generated. The transcribed text may be too short or unclear.")

if __name__ == "__main__":
    main()