Spaces:
Build error
Build error
| import itertools | |
| import random | |
| import requests | |
| import pandas as pd | |
| import gradio as gr | |
| from pytube import YouTube | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from youtube_transcript_api.formatters import TextFormatter | |
def is_youtube_video_available(url):
    """Return whether pytube can resolve the title of the video at ``url``.

    Args:
        url: YouTube watch URL to probe.

    Returns:
        ``True`` when building the ``YouTube`` object and reading its
        ``title`` both succeed, ``False`` when either step raises.
    """
    try:
        # The constructor is inside the ``try`` on purpose: it parses the URL
        # and raises for a malformed one, which must count as "not available"
        # rather than crash the caller.
        YouTube(url).title
    except Exception:
        # ``Exception`` (not a bare ``except``) so that Ctrl-C and
        # ``SystemExit`` still propagate.
        return False
    return True
def get_example_videos(rr_examples_url, num_rr_examples, request_timeout=10):
    """Build the example video pairs offered by the demo.

    Args:
        rr_examples_url: URL fetched with ``requests``; the response is
            expected to be a JSON list of objects carrying
            ``rejected_video_id`` and ``recommendation_id`` keys.
        num_rr_examples: Maximum number of fetched pairs to keep. When more
            pairs are available a random sample of this size is returned.
        request_timeout: Seconds to wait for ``rr_examples_url`` before
            giving up and treating the fetched list as empty.

    Returns:
        A tuple ``(example_videos, example_videos_rr)``. Both are lists of
        ``[first_url, second_url]`` pairs in which both videos passed
        ``is_youtube_video_available``. The first list comes from the
        hard-coded pairs below, the second from ``rr_examples_url``.
    """
    availability = {}

    def both_available(pair):
        # Several pairs share a video, so remember each lookup instead of
        # probing the same URL again.
        for video_url in pair:
            if video_url not in availability:
                availability[video_url] = is_youtube_video_available(video_url)
            if not availability[video_url]:
                return False
        return True

    example_videos = [
        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
    ]
    example_videos = [pair for pair in example_videos if both_available(pair)]

    try:
        response = requests.get(rr_examples_url, timeout=request_timeout)
        response.raise_for_status()
        rr_records = response.json()
    except (requests.RequestException, ValueError):
        # Best effort: network failures, HTTP errors and invalid JSON all
        # mean "no fetched examples" rather than a startup crash.
        rr_records = []

    # A set removes duplicate video pairs (there seems to be one duplicate);
    # sorting keeps the order deterministic before sampling.
    unique_pairs = {
        (f'https://www.youtube.com/watch?v={record["rejected_video_id"]}',
         f'https://www.youtube.com/watch?v={record["recommendation_id"]}')
        for record in rr_records
    }
    example_videos_rr = [list(pair) for pair in sorted(unique_pairs)]
    example_videos_rr = [
        pair for pair in example_videos_rr if both_available(pair)]

    if len(example_videos_rr) > num_rr_examples:
        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)
    return example_videos, example_videos_rr
def get_youtube_embedded_html(embed_url, video_position):
    """Return the HTML snippet that embeds one video player.

    Args:
        embed_url: URL placed in the ``src`` attribute of the ``<iframe>``.
        video_position: Label shown after "Video" in the caption above the
            player.

    Returns:
        A string holding a ``<p>`` caption followed by the ``<iframe>``.
    """
    import html

    # Escape both values so a quote or angle bracket in either one cannot
    # break out of the attribute or inject markup.
    safe_url = html.escape(str(embed_url), quote=True)
    safe_position = html.escape(str(video_position))
    return f'''
    <p>Video {safe_position}</p>
    <iframe width="100%" height="360px" src="{safe_url}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; fullscreen" allowfullscreen></iframe>
    '''
def update_youtube_embedded_html(video_url, video_position):
    """Return the embed HTML for ``video_url``, or an error paragraph.

    Args:
        video_url: YouTube URL of the video to embed.
        video_position: Label passed through to
            ``get_youtube_embedded_html`` for the caption.

    Returns:
        The player HTML when pytube can derive an embed URL from
        ``video_url``; otherwise a ``<p>`` element reporting the failure.
    """
    import html

    try:
        embed_url = YouTube(video_url).embed_url
    except Exception:
        # ``video_url`` is echoed back into HTML, so escape it to keep
        # markup typed into the URL field from being rendered.
        return f'''
    <p>There was error in fetching details for video with the URL: {html.escape(str(video_url))}</p>
    '''
    return get_youtube_embedded_html(embed_url, video_position)
def get_youtube_video_data(url):
    """Collect channel id, title, description and transcript for a video.

    Args:
        url: YouTube URL of the video.

    Returns:
        A tuple ``(channel_id, video_title, video_description,
        video_transcript)``. ``video_transcript`` is the transcript text with
        newlines replaced by spaces, or ``None`` when no transcript could be
        retrieved.

    Raises:
        gr.Error: If pytube cannot build the video object or read its
            metadata.
    """
    try:
        video = YouTube(url)
        # These attribute reads are kept inside the ``try`` so that a failure
        # while reading metadata is reported the same way as a bad URL.
        channel_id = video.channel_id
        video_title = video.title
        video_description = video.description
    except Exception as err:
        raise gr.Error(
            f'Could not find YouTube video with the URL {url}') from err

    preferred_langs = ['en', 'en-US', 'es', 'de']
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
        # Preferred languages first, then whatever else the video offers, so
        # any available transcript is accepted as a fallback.
        other_langs = [tr.language_code for tr in transcript_list
                       if tr.language_code not in preferred_langs]
        # Reuse the listing already fetched instead of calling
        # ``get_transcript``, which would request the list a second time.
        raw_transcript = transcript_list.find_transcript(
            preferred_langs + other_langs).fetch()
    except Exception:
        # A missing transcript is not fatal: callers receive ``None``.
        return channel_id, video_title, video_description, None

    video_transcript = TextFormatter().format_transcript(
        raw_transcript).replace('\n', ' ')
    return channel_id, video_title, video_description, video_transcript
def get_input_data_df(video1_url, video2_url):
    """Assemble a one-row DataFrame describing a pair of videos.

    Args:
        video1_url: URL of the video stored in the ``regret_*`` columns.
        video2_url: URL of the video stored in the ``recommendation_*``
            columns.

    Returns:
        A ``pandas.DataFrame`` with a single row holding title, description
        and transcript for each video, plus ``channel_sim``, which is 1 when
        both videos report the same channel id and 0 otherwise.
    """
    (regret_channel, regret_title,
     regret_description, regret_transcript) = get_youtube_video_data(video1_url)
    (rec_channel, rec_title,
     rec_description, rec_transcript) = get_youtube_video_data(video2_url)

    same_channel = int(regret_channel == rec_channel)

    column_names = [
        'regret_title',
        'regret_description',
        'regret_transcript',
        'recommendation_title',
        'recommendation_description',
        'recommendation_transcript',
        'channel_sim',
    ]
    row = [
        regret_title,
        regret_description,
        regret_transcript,
        rec_title,
        rec_description,
        rec_transcript,
        same_channel,
    ]
    return pd.DataFrame([row], columns=column_names)