# Hugging Face Space: VideoGPT — AI video summarizer + Q&A
| import gradio as gr | |
| import ffmpeg | |
| import os | |
| import pytube | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
| import torch | |
| import numpy as np | |
| import wave | |
# Load the Wav2Vec 2.0 processor (feature extraction + CTC tokenizer) and the
# acoustic model for speech-to-text. Downloads weights on first run.
# NOTE(review): wav2vec2-base-960h expects 16 kHz mono float input — the audio
# extraction/transcription steps must provide audio in that format.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Function to extract audio from video
def extract_audio(video):
    """Extract the audio track of *video* into a 16 kHz mono WAV file.

    Parameters
    ----------
    video : str or file-like
        A YouTube URL, a local video file path, or an uploaded file
        object exposing a ``.name`` path attribute (as Gradio provides).

    Returns
    -------
    str
        Path of the extracted audio file (``temp_audio.wav``).

    Raises
    ------
    ValueError
        If a YouTube URL yields no downloadable progressive MP4 stream.
    """
    # Guard with isinstance: the original `"youtube" in video` raised
    # TypeError whenever a file object (not a URL string) was passed.
    if isinstance(video, str) and "youtube" in video:
        yt = pytube.YouTube(video)
        stream = yt.streams.filter(progressive=True, file_extension="mp4").first()
        if stream is None:
            raise ValueError("No downloadable MP4 stream found for this URL.")
        stream.download(filename="temp_video.mp4")
        video_path = "temp_video.mp4"
    elif isinstance(video, str):
        # Plain local path.
        video_path = video
    else:
        # Gradio upload objects are temp files already on disk; the original
        # `video.save(video_path)` is not a Gradio file API and would have
        # rewritten the file onto its own name.
        video_path = video.name

    # Wav2Vec2 expects 16 kHz mono audio, so downmix (ac=1) and resample
    # (ar=16000) here; overwrite_output() lets repeated runs reuse the
    # temp file instead of failing because it already exists.
    audio_path = "temp_audio.wav"
    (
        ffmpeg.input(video_path)
        .output(audio_path, ac=1, ar=16000)
        .overwrite_output()
        .run(quiet=True)
    )
    return audio_path
# Function to transcribe audio using Wav2Vec 2.0
def transcribe_audio(audio_path):
    """Transcribe a 16 kHz WAV file with the module-level Wav2Vec 2.0 model.

    Parameters
    ----------
    audio_path : str
        Path to a 16-bit PCM WAV file (ideally 16 kHz mono; stereo input
        is downmixed here).

    Returns
    -------
    str
        The decoded transcription.
    """
    with wave.open(audio_path, 'rb') as f:
        n_channels = f.getnchannels()
        frames = f.readframes(f.getnframes())
    samples = np.frombuffer(frames, dtype=np.int16)
    if n_channels > 1:
        # Downmix interleaved multi-channel audio to mono.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    # Wav2Vec2 expects float waveforms in [-1, 1]; feeding raw int16
    # values (the original behavior) produces garbage transcriptions.
    waveform = samples.astype(np.float32) / 32768.0
    # Pass sampling_rate explicitly so the processor validates/normalizes
    # correctly instead of silently assuming a default.
    inputs = processor(waveform, sampling_rate=16000,
                       return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
# Summarize the transcript using a Hugging Face summarization model
from transformers import pipeline

# Default summarization pipeline (downloads a model on first use).
# NOTE(review): mid-file import works but convention puts it at the top.
summarizer = pipeline("summarization")
def summarize_text(text):
    """Summarize *text* with the module-level summarization pipeline.

    Parameters
    ----------
    text : str
        The transcript to summarize. Empty/whitespace-only input returns "".

    Returns
    -------
    str
        The generated summary text.
    """
    text = text.strip()
    if not text:
        # The pipeline raises on empty input; an empty transcript simply
        # has nothing to summarize.
        return ""
    # Default summarization models accept ~1024 tokens; truncate very long
    # transcripts by word count as a rough safeguard against overflow errors.
    words = text.split()
    if len(words) > 800:
        text = " ".join(words[:800])
    summary = summarizer(text, max_length=200, min_length=50, do_sample=False)
    return summary[0]['summary_text']
# Gradio interface function
def gradio_interface(video, question=None):
    """Gradio entry point: extract audio, transcribe, summarize, and answer.

    Parameters
    ----------
    video : str or file-like
        Video upload or YouTube URL, forwarded to ``extract_audio``.
    question : str, optional
        Optional user question about the video content.

    Returns
    -------
    tuple[str, str]
        ``(summary, answer)``; ``answer`` is an empty string when no
        question was asked.
    """
    audio_path = extract_audio(video)
    transcript = transcribe_audio(audio_path)
    summary = summarize_text(transcript)
    # Q&A is not implemented yet; show a placeholder only when asked.
    answer = "This is where a Q&A feature would go." if question else ""
    # Always return a 2-tuple matching the interface's two outputs. The
    # original `return summary, answer if question else summary` duplicated
    # the summary into the Answer box whenever no question was given.
    return summary, answer
# Gradio Interface: one video input, one optional question, two text outputs.
# NOTE(review): gr.Video handles file uploads/paths — it does not accept a
# pasted YouTube URL string despite the label; a gr.Textbox input would be
# needed for the YouTube branch of extract_audio to be reachable. Confirm
# against the targeted Gradio version.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Video(label="Upload Your Video or YouTube Link"),
        gr.Textbox(label="Ask a Question (Optional)", lines=2)
    ],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Answer (If Question Asked)")
    ],
    title="VideoGPT: AI Video Summarizer + Q&A",
    description="Upload a video or paste a YouTube link to extract the audio, get a summary, and ask questions about the video content."
)
# Launch the interface
iface.launch()