"""VideoGPT: extract audio from an uploaded video or YouTube link, transcribe
it with Wav2Vec 2.0, summarize the transcript, and serve it via Gradio."""

import os
import wave

import ffmpeg
import gradio as gr
import numpy as np
import pytube
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load the Wav2Vec 2.0 processor and model once at startup.
# NOTE: facebook/wav2vec2-base-960h expects 16 kHz mono float audio.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Summarization pipeline (Hugging Face default summarization model).
summarizer = pipeline("summarization")


def extract_audio(video):
    """Return the path of a 16 kHz mono WAV extracted from *video*.

    *video* is either a YouTube URL or a local file path. Gradio's Video
    component passes the uploaded file's temp path as a plain string, so
    there is nothing to re-save — the original ``video.save(...)`` call
    would have raised AttributeError.
    """
    if isinstance(video, str) and "youtube" in video:
        yt = pytube.YouTube(video)
        stream = yt.streams.filter(progressive=True, file_extension="mp4").first()
        stream.download(filename="temp_video.mp4")
        video_path = "temp_video.mp4"
    else:
        # Accept either a path string (modern Gradio) or a file-like object.
        video_path = video if isinstance(video, str) else video.name

    audio_path = "temp_audio.wav"
    # Force 16 kHz mono to match the Wav2Vec 2.0 training data;
    # overwrite_output() prevents ffmpeg stalling on an existing file.
    (
        ffmpeg.input(video_path)
        .output(audio_path, ar=16000, ac=1, format="wav")
        .overwrite_output()
        .run(quiet=True)
    )
    return audio_path


def transcribe_audio(audio_path):
    """Transcribe a 16 kHz mono 16-bit PCM WAV file with Wav2Vec 2.0."""
    with wave.open(audio_path, "rb") as f:
        frames = f.readframes(f.getnframes())
        sample_rate = f.getframerate()
    # Convert 16-bit PCM to float32 in [-1, 1]; the processor expects
    # normalized float samples, not raw int16 values.
    waveform = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
    inputs = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True
    )
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


def summarize_text(text):
    """Summarize *text* to roughly 50-200 tokens."""
    summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
    return summary[0]["summary_text"]


def gradio_interface(video, question=None):
    """Gradio callback: return (summary, answer) for the two output boxes.

    BUG FIX: the original ``return summary, answer if question else summary``
    parsed as ``(summary, summary)`` when no question was asked, echoing the
    summary into the Answer box. Always return exactly two values.
    """
    audio_path = extract_audio(video)
    try:
        transcript = transcribe_audio(audio_path)
    finally:
        # Best-effort cleanup of the temporary audio file.
        if os.path.exists(audio_path):
            os.remove(audio_path)
    summary = summarize_text(transcript)
    # Placeholder for a future Q&A feature.
    answer = "This is where a Q&A feature would go." if question else ""
    return summary, answer


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Video(label="Upload Your Video or YouTube Link"),
        gr.Textbox(label="Ask a Question (Optional)", lines=2),
    ],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Answer (If Question Asked)"),
    ],
    title="VideoGPT: AI Video Summarizer + Q&A",
    description=(
        "Upload a video or paste a YouTube link to extract the audio, "
        "get a summary, and ask questions about the video content."
    ),
)

if __name__ == "__main__":
    iface.launch()