Uzairabbasi committed on
Commit
1b6c88c
·
verified ·
1 Parent(s): c597daf

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +116 -0
  3. input.mp4 +3 -0
  4. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ input.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
4
+ from pydub import AudioSegment
5
+ import librosa
6
+ import ffmpeg
7
+ import os
8
+ import re
9
+ import tempfile
10
+
11
@st.cache_resource
def load_model():
    """Load the Whisper speech model and the BART summarizer.

    The function is wrapped in ``st.cache_resource`` so Streamlit caches the
    returned objects instead of reloading them on every call.

    Returns:
        A 5-tuple ``(processor, model, summarizer_tokenizer,
        summarizer_model, device)`` where ``model`` is the Whisper model
        cast to half precision and both models live on ``device``.
    """
    whisper_checkpoint = "openai/whisper-medium"
    summarizer_checkpoint = "facebook/bart-large-cnn"

    # Prefer the GPU when one is visible to torch.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = AutoProcessor.from_pretrained(whisper_checkpoint)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_checkpoint)
    # Half precision is applied regardless of device; callers must feed
    # half-precision input features to this model.
    model = model.to(device).half()

    summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_checkpoint)
    summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_checkpoint)
    summarizer_model = summarizer_model.to(device)

    return processor, model, summarizer_tokenizer, summarizer_model, device
19
+
20
def extract_audio(video_path, output_audio_path):
    """Extract a video's audio track to a mono, 16 kHz file using FFmpeg.

    Args:
        video_path: Path of the source video; it must already exist.
        output_audio_path: Destination path; an existing file is overwritten.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If FFmpeg exits with an error. The message carries
            FFmpeg's stderr and the original ``ffmpeg.Error`` is chained as
            the cause.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    # NOTE(review): ``preset`` looks like a video-encoder option and is
    # probably unused for an audio-only output -- confirm before removing.
    stream = (
        ffmpeg.input(video_path)
        .output(output_audio_path, ac=1, ar=16000, preset="ultrafast")
        .overwrite_output()
    )
    try:
        stream.run(quiet=True)
    except ffmpeg.Error as e:
        # Decode defensively: stderr may be missing or contain bytes that are
        # not valid UTF-8, and a decode failure here would hide the real error.
        details = e.stderr.decode("utf-8", errors="replace") if e.stderr else ""
        raise RuntimeError(f"FFmpeg error: {details}") from e
27
+
28
def split_audio(audio_path, chunk_duration_ms=5000):
    """Load an audio file and cut it into consecutive fixed-length chunks.

    Args:
        audio_path: Path of the audio file to load with ``AudioSegment``.
        chunk_duration_ms: Length of each chunk; must be positive. The last
            chunk may be shorter than this.

    Returns:
        A list of ``AudioSegment`` slices covering the audio in order.

    Raises:
        ValueError: If ``chunk_duration_ms`` is not positive.
    """
    # Validate before decoding the file: a negative step would otherwise
    # silently yield no chunks, and zero would fail only after the load.
    if chunk_duration_ms <= 0:
        raise ValueError("chunk_duration_ms must be a positive number of milliseconds")

    audio = AudioSegment.from_file(audio_path)
    return [
        audio[start:start + chunk_duration_ms]
        for start in range(0, len(audio), chunk_duration_ms)
    ]
32
+
33
def transcribe_in_batches(chunks, processor, model, device, progress_bar, batch_size=4):
    """Transcribe audio chunks in batches and report progress.

    Each chunk is exported to a temporary WAV file, reloaded at 16 kHz,
    turned into input features by ``processor`` and fed to ``model.generate``
    in groups of ``batch_size``.

    Args:
        chunks: Sequence of audio chunks exposing ``export(path, format=...)``.
        processor: Speech processor providing ``get_decoder_prompt_ids``,
            feature extraction (``__call__``) and ``batch_decode``.
        model: Speech-to-text model providing ``generate``.
        device: Device the input features are moved to.
        progress_bar: Object with a ``progress(fraction)`` method; it receives
            the fraction of chunks processed so far, never above 1.0.
        batch_size: Number of chunks per ``generate`` call; must be positive.

    Returns:
        A list with one decoded string per chunk, in chunk order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    transcriptions = []
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
    total_chunks = len(chunks)

    for start in range(0, total_chunks, batch_size):
        batch = chunks[start:start + batch_size]
        batch_features = []

        # A private temporary directory per batch avoids file-name collisions
        # between concurrent sessions and is removed even if a step fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for offset, chunk in enumerate(batch):
                temp_audio_path = os.path.join(tmp_dir, f"temp_chunk_{start + offset}.wav")
                chunk.export(temp_audio_path, format="wav")
                audio, _ = librosa.load(temp_audio_path, sr=16000)
                inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
                # Inputs are cast to half precision to match a half-precision model.
                batch_features.append(inputs.input_features.to(device).half())

        # Assumes every chunk yields features of the same shape so they can
        # be concatenated along the batch dimension.
        input_features = torch.cat(batch_features).to(device)
        with torch.no_grad():
            generated_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
        transcriptions.extend(processor.batch_decode(generated_ids, skip_special_tokens=True))

        # Count the chunks actually processed: the last batch may be partial,
        # and a fraction above 1.0 is not a valid progress value.
        progress_bar.progress((start + len(batch)) / total_chunks)

    return transcriptions
57
+
58
def combine_transcriptions(transcriptions):
    """Join per-chunk transcriptions into one newline-separated string.

    Surrounding whitespace is stripped from every piece, and pieces that are
    empty after stripping (for example from silent chunks) are dropped so the
    result contains no blank lines.

    Args:
        transcriptions: Iterable of transcription strings, in order.

    Returns:
        The cleaned pieces joined with ``"\\n"``; ``""`` when nothing remains.
    """
    stripped = (piece.strip() for piece in transcriptions)
    return "\n".join(piece for piece in stripped if piece)
60
+
61
def remove_timecodes(text):
    """Return *text* with every square-bracketed span deleted.

    Each match runs from a ``[`` to the nearest following ``]`` on the same
    line; the brackets and everything between them are removed, and the
    surrounding characters are left untouched.
    """
    bracketed_span = re.compile(r'\[.*?\]')
    return bracketed_span.sub('', text)
63
+
64
def summarize_text(text, tokenizer, model, device):
    """Summarize *text* with a sequence-to-sequence model.

    Args:
        text: Text to summarize. Characters that cannot be encoded as UTF-8
            are dropped first.
        tokenizer: Tokenizer used to encode the text and decode the summary.
        model: Model providing ``generate``.
        device: Device the encoded inputs are moved to.

    Returns:
        The decoded summary, or ``""`` when *text* is empty or only
        whitespace (the model is not called in that case).

    Note:
        The input is truncated to 1024 tokens, so only the beginning of a
        longer text is summarized.
    """
    text = text.encode('utf-8', 'ignore').decode()

    # With nothing to summarize, ``min_length=50`` would still force the
    # model to emit 50 tokens of made-up text; return early instead.
    if not text.strip():
        return ""

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,
    ).to(device)
    with torch.no_grad():
        summary_ids = model.generate(
            inputs['input_ids'],
            # Pass the mask explicitly so padded positions are never attended to.
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            min_length=50,
            max_length=200,
            early_stopping=True,
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
70
+
71
# --- Streamlit page: upload a video, transcribe it, then summarize it. ---
st.title("Video Transcription and Summarization")
st.write("Upload a video file to generate transcription and summary")
uploaded_file = st.file_uploader("Choose a video file", type=['mp4', 'avi', 'mov'])

if uploaded_file is not None:
    # Paths are tracked from the start so the ``finally`` block can clean up
    # whatever was created, even if a later step fails.
    video_path = None
    audio_path = None

    try:
        # Keep the upload's own extension (avi/mov are accepted too) instead
        # of always labelling the temporary file as .mp4.
        video_suffix = os.path.splitext(uploaded_file.name)[1] or '.mp4'
        with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp_video:
            video_path = tmp_video.name
            # getvalue() returns the full contents regardless of the current
            # read position of the uploaded buffer.
            tmp_video.write(uploaded_file.getvalue())

        # Reserve a path for the extracted audio; FFmpeg overwrites the file.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_audio:
            audio_path = tmp_audio.name

        with st.spinner("Loading models..."):
            processor, model, summarizer_tokenizer, summarizer_model, device = load_model()

        with st.spinner("Extracting audio..."):
            extract_audio(video_path, audio_path)

        chunks = split_audio(audio_path)
        progress_bar = st.progress(0)
        st.write("Transcribing audio...")
        transcriptions = transcribe_in_batches(chunks, processor, model, device, progress_bar)
        full_transcription = combine_transcriptions(transcriptions)

        st.subheader("Transcription")
        st.text_area("Full transcription", full_transcription, height=200)

        clean_transcription = remove_timecodes(full_transcription)
        if clean_transcription.strip():
            with st.spinner("Generating summary..."):
                summary = summarize_text(clean_transcription, summarizer_tokenizer, summarizer_model, device)
        else:
            # Nothing was transcribed; do not ask the summarizer to invent text.
            st.warning("No speech was transcribed, so there is nothing to summarize.")
            summary = ""

        st.subheader("Summary")
        st.text_area("Text summary", summary, height=100)

        col1, col2 = st.columns(2)
        with col1:
            st.download_button("Download Transcription", full_transcription, "transcription.txt")
        with col2:
            st.download_button("Download Summary", summary, "summary.txt")

    except Exception as e:
        # Top-level boundary of the page: surface any failure to the user.
        st.error(f"An error occurred: {e}")
    finally:
        # Best-effort cleanup: a file that is already gone must not mask the
        # result or the original error.
        for temp_path in (video_path, audio_path):
            if temp_path is None:
                continue
            try:
                os.unlink(temp_path)
            except OSError:
                pass
input.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba3d6bd0287fcfcdf98fd007226ddb006f67c5e8f44197964322aab49a089eb
3
+ size 749762
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pydub
2
+ moviepy
3
+ transformers
4
+ librosa
5
+ ffmpeg-python
6
+ langdetect