mayankchugh-learning committed on
Commit
4f3e3b2
·
verified ·
1 Parent(s): 24e28da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -41
app.py CHANGED
@@ -1,51 +1,41 @@
1
- import re
2
  from youtube_transcript_api import YouTubeTranscriptApi
3
- from youtube_transcript_api.formatters import TextFormatter
4
- import torch
5
  import gradio as gr
6
- from transformers import pipeline
7
 
8
- text_summary = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", torch_dtype=torch.bfloat16)
 
 
 
9
 
10
- def summary (input):
11
- output = text_summary(input)
12
- return output[0]['summary_text']
13
 
14
- def extract_video_id(url):
15
- # Regex to extract the video ID from various YouTube URL formats
16
- regex = r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
17
- match = re.search(regex, url)
18
- if match:
19
- return match.group(1)
20
- return None
21
 
 
 
22
 
23
- def get_youtube_transcript(video_url):
24
- video_id = extract_video_id(video_url)
25
- if not video_id:
26
- return "Video ID could not be extracted."
27
 
28
- try:
29
- # Fetch the transcript
30
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
31
 
32
- # Format the transcript into plain text
33
- formatter = TextFormatter()
34
- text_transcript = formatter.format_transcript(transcript)
35
- summary_text = summary(text_transcript)
 
 
 
 
36
 
37
- return summary_text
38
- except Exception as e:
39
- return f"An error occurred: {e}"
40
-
41
- gr.close_all()
42
-
43
- # demo = gr.Interface(fn=summary, inputs="text",outputs="text")
44
- demo = gr.Interface(fn=get_youtube_transcript,
45
- inputs=[gr.Textbox(label="Input YouTube Url to summarize",lines=1)],
46
- outputs=[gr.Textbox(label="Summarized text",lines=4)],
47
- title="@IT AI Enthusiast (https://www.youtube.com/@itaienthusiast/) - Project 2: YouTube Script Summarizer",
48
- description="THIS APPLICATION WILL BE USED TO SUMMARIZE THE YOUTUBE VIDEO SCRIPT.",
49
- examples=['https://www.youtube.com/watch?v=tQb7bumjkIM'],
50
- concurrency_limit=8)
51
- demo.launch()
 
1
+ import pytube
2
  from youtube_transcript_api import YouTubeTranscriptApi
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
4
  import gradio as gr
 
5
 
6
# Summarization checkpoint. The seq2seq model and its matching tokenizer are
# loaded once at import time and reused by get_transcript() on every call.
model_name = "sshleifer/distilbart-cnn-12-6"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
11
def get_transcript(youtube_url):
    """Fetch a YouTube video's transcript and return a summary of it.

    Despite the name, the value returned on success is the model-generated
    summary of the transcript, not the raw transcript text.

    Args:
        youtube_url: URL of the YouTube video to summarize.

    Returns:
        The summary string on success. On failure (unrecognised URL,
        transcript unavailable, or empty transcript) a human-readable
        error message is returned instead of raising, so the Gradio
        text output always has something to show.
    """
    # Extracting the ID can raise on malformed or non-YouTube input; report
    # it as text, the same way transcript failures are reported below.
    try:
        video_id = pytube.extract.video_id(youtube_url)
    except Exception as e:
        return f"Error extracting video ID: {str(e)}"

    # Get the transcript using the YouTube Transcript API
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        return f"Error retrieving transcript: {str(e)}"

    # Join the transcript segments into a single string
    transcript_text = " ".join(segment["text"] for segment in transcript_list)

    # Nothing to summarize: do not ask the model to generate from empty input.
    if not transcript_text.strip():
        return "Error retrieving transcript: the transcript is empty."

    # truncation=True cuts the input to the tokenizer's maximum length, so
    # only the leading part of a long transcript is summarized.
    inputs = tokenizer(
        transcript_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
    )

    # Pass the attention mask explicitly so generation does not have to
    # infer which positions are padding.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        max_length=100,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
 
30
 
31
# Text shown at the top of the web UI.
_TITLE = (
    "@IT AI Enthusiast (Mayank Chugh) (https://www.youtube.com/@itaienthusiast/)"
    " - Project 2: YouTube Video Transcript Generator"
)
_DESCRIPTION = "Enter a YouTube URL to generate and summarize the video transcript."

# One text box in (the video URL), one text box out (the summary or an
# error message), both wired to get_transcript().
iface = gr.Interface(
    fn=get_transcript,
    inputs="text",
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
)

# Start serving the app.
iface.launch()