# YouTube video transcript summarizer — Gradio app.
import pytube
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr
import os
import uuid
import joblib
import json
from huggingface_hub import CommitScheduler
from pathlib import Path
# Prepare the logging functionality
# Each process writes to its own uniquely-named JSON-lines file so concurrent
# app replicas never clobber each other's logs.
log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
log_folder = log_file.parent
# Background scheduler that periodically commits the local log folder to a
# Hugging Face dataset repo (the folder is created by the scheduler if absent).
scheduler = CommitScheduler(
repo_id="YouTubeSummarizer-log",  # dataset repo that receives the logs
repo_type="dataset",
folder_path=log_folder,  # local folder mirrored to the repo
path_in_repo="data",  # remote subdirectory for uploaded files
every=2  # commit interval (minutes, per the CommitScheduler API)
)
# Load the Hugging Face model and tokenizer
# DistilBART distilled from BART, fine-tuned on CNN/DailyMail summarization.
model_name = "sshleifer/distilbart-cnn-12-6"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def get_transcript(youtube_url):
    """Fetch a YouTube video's transcript and return a model-generated summary.

    Parameters
    ----------
    youtube_url : str
        Full YouTube video URL entered by the user.

    Returns
    -------
    str
        The generated summary, or a human-readable error message when the
        URL cannot be parsed or the transcript is unavailable/empty.
    """
    # Extract the video ID from the YouTube URL.
    # pytube raises on malformed URLs, so guard it instead of letting the
    # Gradio handler crash with a traceback.
    try:
        video_id = pytube.extract.video_id(youtube_url)
    except Exception as e:
        return f"Error parsing YouTube URL: {str(e)}"
    # Get the transcript using the YouTube Transcript API
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        return f"Error retrieving transcript: {str(e)}"
    # Join the transcript segments into a single string
    transcript_text = " ".join(segment["text"] for segment in transcript_list)
    if not transcript_text.strip():
        # Nothing to summarize (e.g. transcript exists but has no text).
        return "Error retrieving transcript: transcript is empty."
    # Summarize the transcript text using the Hugging Face model.
    # truncation=True keeps the input within the model's max sequence length.
    inputs = tokenizer(transcript_text, return_tensors="pt", truncation=True, padding="longest")
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Append one JSON object per line to the log file; the scheduler lock
    # serializes writes against the background commit thread.
    with scheduler.lock:
        with log_file.open("a") as f:
            f.write(json.dumps(
                {
                    'YouTube URL': youtube_url,
                    'Summary': summary
                }
            ))
            f.write("\n")
    return summary
# Create a Gradio interface wired to the summarizer function.
# (Removed stray trailing "|" residue that made the launch line a syntax error.)
iface = gr.Interface(
    fn=get_transcript,
    inputs="text",
    outputs="text",
    title="@IT AI Enthusiast (Mayank Chugh) (https://www.youtube.com/@itaienthusiast/) - Project 2: YouTube Video Transcript Generator",
    description="Enter a YouTube URL to generate and summarize the video transcript.",
    examples=['https://www.youtube.com/watch?v=0vK7AwUpRvY'],
    theme=gr.themes.Glass(),
    concurrency_limit=8  # cap simultaneous summarization requests
)
# Launch the Gradio interface (local only; no public share link)
iface.launch(share=False)