File size: 2,384 Bytes
4f3e3b2
0d79624
4f3e3b2
0d79624
 
d3fc250
 
 
 
 
 
 
 
 
72b8a24
d3fc250
 
 
 
 
 
 
 
 
 
 
 
 
4f3e3b2
 
 
 
0d79624
4f3e3b2
 
 
0d79624
4f3e3b2
 
 
 
 
0d79624
4f3e3b2
 
0d79624
4f3e3b2
 
 
 
0d79624
d3fc250
 
 
 
 
 
 
 
 
 
 
4f3e3b2
0d79624
4f3e3b2
 
 
 
 
 
9b16097
943e714
b3af828
6b0ed86
4f3e3b2
0d79624
d3fc250
 
4f3e3b2
28ae61c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pytube
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import gradio as gr

import os
import uuid
import joblib
import json

from huggingface_hub import CommitScheduler
from pathlib import Path


# --- Logging setup -----------------------------------------------------------
# Each app run appends JSON-lines records to a uniquely named file under
# logs/; a CommitScheduler periodically (every 2 minutes) pushes that folder
# to a Hub dataset repo.

log_folder = Path("logs/")
log_file = log_folder / f"data_{uuid.uuid4()}.json"

scheduler = CommitScheduler(
    repo_id="YouTubeSummarizer-log",
    folder_path=log_folder,
    repo_type="dataset",
    path_in_repo="data",
    every=2,
)


# --- Summarization model -----------------------------------------------------
# DistilBART distilled from BART-large-CNN; loaded once at startup so each
# request only pays for inference.
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def get_transcript(youtube_url):
    """Fetch a YouTube video's transcript and return a model-generated summary.

    Args:
        youtube_url: Full YouTube watch URL (any form pytube can parse).

    Returns:
        The summary string on success, or a human-readable error message if
        the URL cannot be parsed or no transcript is available (Gradio shows
        either one in the output textbox).
    """
    # Extract the video ID from the YouTube URL. A malformed URL raises inside
    # pytube, so report it the same way as a transcript failure instead of
    # letting the exception propagate to the UI as a stack trace.
    try:
        video_id = pytube.extract.video_id(youtube_url)
    except Exception as e:
        return f"Error parsing YouTube URL: {str(e)}"

    # Get the transcript using the YouTube Transcript API
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        return f"Error retrieving transcript: {str(e)}"

    # Join the transcript segments into a single string
    transcript_text = " ".join([segment["text"] for segment in transcript_list])

    # Summarize the transcript with the seq2seq model. Pass the attention mask
    # explicitly so generate() does not have to infer it (silences the HF
    # warning and keeps attention correct if padding were ever present).
    inputs = tokenizer(transcript_text, return_tensors="pt", truncation=True, padding="longest")
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        max_length=100,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Append a JSON-lines record under the scheduler's lock so a concurrent
    # push to the Hub never sees a half-written line.
    with scheduler.lock:
        with log_file.open("a") as f:
            f.write(json.dumps(
                {
                    'YouTube URL': youtube_url,
                    'Summary': summary
                }
            ))
            f.write("\n")

    return summary

# --- Gradio UI ---------------------------------------------------------------
# Single textbox in, single textbox out: paste a YouTube URL, get a summary.
iface = gr.Interface(
    fn=get_transcript,
    inputs="text",
    outputs="text",
    examples=['https://www.youtube.com/watch?v=0vK7AwUpRvY'],
    title="@IT AI Enthusiast (Mayank Chugh) (https://www.youtube.com/@itaienthusiast/) - Project 2: YouTube Video Transcript Generator",
    description="Enter a YouTube URL to generate and summarize the video transcript.",
    theme=gr.themes.Glass(),
    concurrency_limit=8,
)

# Start the app; share=False keeps it local (no public gradio.live tunnel).
iface.launch(share=False)