Videostood / app.py
Nick021402's picture
Update app.py
c10f19e verified
import gradio as gr
import ffmpeg
import os
import pytube
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import numpy as np
import wave
# Shared Wav2Vec 2.0 speech-recognition components. Both are created once at
# import time from the same checkpoint and reused by transcribe_audio().
_WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(_WAV2VEC2_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_WAV2VEC2_CHECKPOINT)
# Function to extract audio from video
def extract_audio(video):
    """Extract the audio track of a video into ``temp_audio.wav``.

    Args:
        video: One of
            * a YouTube URL (a string containing ``"youtube"`` or
              ``"youtu.be"`` that is not an existing local file),
            * a path to a local video file (``str`` or ``os.PathLike``), or
            * an object exposing the file path through a ``name`` attribute.

    Returns:
        The path of the extracted audio file (``"temp_audio.wav"``), written
        as 16 kHz mono 16-bit PCM.

    Raises:
        ValueError: If no video was supplied, or if no progressive MP4 stream
            is available for the given YouTube URL.
    """
    if video is None or (isinstance(video, str) and not video.strip()):
        raise ValueError("No video was provided.")

    # Accept plain paths as well as file-like wrappers that carry the path
    # in ``.name``.
    if isinstance(video, (str, os.PathLike)):
        source = os.fspath(video)
    else:
        source = video.name

    # An existing local file is never treated as a URL, even if its name
    # happens to contain "youtube".
    looks_like_youtube = "youtube" in source or "youtu.be" in source
    if looks_like_youtube and not os.path.isfile(source):
        stream = (
            pytube.YouTube(source)
            .streams.filter(progressive=True, file_extension="mp4")
            .first()
        )
        if stream is None:
            raise ValueError(
                "No downloadable MP4 stream was found for the given YouTube URL."
            )
        video_path = "temp_video.mp4"
        stream.download(filename=video_path)
    else:
        # The file is already on disk; nothing needs to be saved again.
        video_path = source

    # Extract audio using ffmpeg. The output is forced to 16 kHz mono 16-bit
    # PCM (the format the Wav2Vec 2.0 checkpoint is fed with), and an existing
    # temp file from a previous request is overwritten instead of making
    # ffmpeg stop on the "file exists" prompt.
    audio_path = "temp_audio.wav"
    (
        ffmpeg
        .input(video_path)
        .output(audio_path, ac=1, ar=16000, acodec="pcm_s16le")
        .run(overwrite_output=True)
    )
    return audio_path
# Sampling rate (Hz) passed to the Wav2Vec 2.0 processor.
_WAV2VEC2_SAMPLE_RATE = 16000


def _load_wav_mono_16k(audio_path):
    """Read a 16-bit PCM WAV file as a mono float32 waveform at 16 kHz."""
    with wave.open(audio_path, "rb") as wav_file:
        n_channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())

    if sample_width != 2:
        raise ValueError(
            f"Expected 16-bit PCM audio, got {8 * sample_width}-bit samples."
        )

    # WAV PCM data is little-endian; scale int16 to floats in [-1.0, 1.0).
    samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0

    # Multi-channel frames are interleaved: down-mix by averaging channels.
    if n_channels > 1:
        samples = samples.reshape(-1, n_channels).mean(axis=1)

    # Bring the signal to the expected rate. Linear interpolation is a simple
    # dependency-free resampler; it does no anti-alias filtering, so audio that
    # is already 16 kHz (skipped here) gives the best results.
    if frame_rate != _WAV2VEC2_SAMPLE_RATE and samples.size:
        target_len = max(
            1, int(round(samples.size * _WAV2VEC2_SAMPLE_RATE / frame_rate))
        )
        source_times = np.arange(samples.size) / frame_rate
        target_times = np.arange(target_len) / _WAV2VEC2_SAMPLE_RATE
        samples = np.interp(target_times, source_times, samples)

    return samples.astype(np.float32)


# Function to transcribe audio using Wav2Vec 2.0
def transcribe_audio(audio_path):
    """Transcribe a WAV file with the module-level Wav2Vec 2.0 model.

    Args:
        audio_path: Path to a 16-bit PCM WAV file. Any channel count and
            frame rate are accepted; the audio is down-mixed to mono and
            resampled to 16 kHz before inference.

    Returns:
        The decoded transcription string, or ``""`` when the file contains
        no audio frames.

    Raises:
        ValueError: If the WAV file does not use 16-bit samples.
    """
    waveform = _load_wav_mono_16k(audio_path)
    if waveform.size == 0:
        return ""

    # NOTE: the whole recording goes through the model in a single forward
    # pass, so memory use grows with the length of the audio.
    input_values = processor(
        waveform,
        sampling_rate=_WAV2VEC2_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    ).input_values

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
# Shared Hugging Face summarization pipeline used by summarize_text(). It is
# built once at import time; no model is named, so the library default applies.
from transformers import pipeline

summarizer = pipeline(task="summarization")
def summarize_text(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The transcript to summarize.

    Returns:
        The generated summary string, or ``""`` when ``text`` is empty or
        contains only whitespace.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ""

    # truncation=True clips inputs that exceed the model's maximum input
    # length instead of letting the pipeline fail on long transcripts.
    results = summarizer(
        text,
        max_length=200,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return results[0]["summary_text"]
# Gradio interface function
def gradio_interface(video, question=None):
    """Run the extract -> transcribe -> summarize pipeline for one request.

    Args:
        video: The video input, forwarded unchanged to ``extract_audio``.
        question: Optional question about the video. Q&A is not implemented
            yet; a non-blank question yields a placeholder answer.

    Returns:
        A ``(summary, answer)`` tuple, one value per output textbox.
        ``answer`` is ``""`` when no question was asked.
    """
    audio_path = extract_audio(video)
    transcript = transcribe_audio(audio_path)
    summary = summarize_text(transcript)

    if question and question.strip():
        answer = "This is where a Q&A feature would go."  # Placeholder for Q&A
    else:
        # Previously the summary was echoed into the answer slot here because
        # the conditional expression bound only to the second tuple element.
        answer = ""

    return summary, answer
# Gradio UI wiring: a video plus an optional question go in, and the summary
# and answer textboxes come out of gradio_interface().
# NOTE(review): the label and description mention a YouTube link, but the only
# video input is a gr.Video component - confirm how a link is meant to be
# supplied.
_video_input = gr.Video(label="Upload Your Video or YouTube Link")
_question_input = gr.Textbox(label="Ask a Question (Optional)", lines=2)
_summary_output = gr.Textbox(label="Summary")
_answer_output = gr.Textbox(label="Answer (If Question Asked)")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[_video_input, _question_input],
    outputs=[_summary_output, _answer_output],
    title="VideoGPT: AI Video Summarizer + Q&A",
    description="Upload a video or paste a YouTube link to extract the audio, get a summary, and ask questions about the video content.",
)

# Start serving the app.
iface.launch()