Spaces:
Running
Running
Yannael_LB
committed on
Commit
·
ea1af87
1
Parent(s):
eade312
Update
Browse files- app.py +131 -5
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -1,9 +1,135 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
return "Hello " + name + "!"
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
|
| 5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
| 6 |
|
| 7 |
+
from openai import OpenAI
|
| 8 |
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
+
|
| 13 |
+
def gradio_video_id_to_transcript(video_id, preview_entries=10):
    """Fetch the English transcript of a YouTube video and preview it.

    Parameters
    ----------
    video_id : str
        YouTube video id (the ``v=`` URL query parameter).
    preview_entries : int, optional
        Number of transcript entries to show in the preview textbox
        (default 10, matching the original hard-coded slice).

    Returns a Gradio update dict: the truncated JSON preview goes to the
    ``output_transcript`` textbox, the full raw transcript to the
    ``gv_transcript`` state so later pipeline stages can use it.
    """
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # Keep only the fields the preview cares about (start time and text).
    preview = [{'start': entry['start'], 'text': entry['text']}
               for entry in transcript[:preview_entries]]

    # Only append an ellipsis when the preview is actually truncated;
    # the original appended '...' unconditionally, which misleadingly
    # suggested more content for short transcripts.
    preview_str = json.dumps(preview, indent=2)
    if len(transcript) > preview_entries:
        preview_str += '...'

    return {output_transcript: preview_str,
            gv_transcript: transcript}
|
| 21 |
+
|
| 22 |
+
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Split the raw transcript into paragraphs via the LLM helper.

    Routes a 4-entry JSON preview to the ``output_paragraphs`` textbox and
    stores the full paragraph list in the ``gv_paragraphs`` state.
    """
    # transcript_to_paragraphs also reports token usage and price;
    # only the paragraphs themselves are needed here.
    result = transcript_to_paragraphs(
        gv_transcript_value, openai_client, openai_model, chunk_size=5000
    )
    paragraphs = result[0]

    preview = json.dumps(paragraphs[:4], indent=2) + '...'

    return {output_paragraphs: preview, gv_paragraphs: paragraphs}
|
| 31 |
+
|
| 32 |
+
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Derive a table of contents from the paragraph list.

    Shows a 4-entry JSON preview in the ``output_toc`` textbox and stores
    the full TOC in the ``gv_toc`` state.
    """
    # paragraphs_to_toc also returns token counts and price; discard them.
    toc, _, _, _ = paragraphs_to_toc(
        gv_paragraphs_value, openai_client, openai_model, chunk_size=100
    )

    toc_preview = json.dumps(toc[:4], indent=2) + '...'

    return {output_toc: toc_preview, gv_toc: toc}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Attach start timestamps to each paragraph (TF-IDF matching helper).

    Shows a 4-entry JSON preview and writes the timestamped paragraphs back
    into the ``gv_paragraphs`` state, replacing the untimestamped version.
    """
    stamped = add_timestamps_to_paragraphs(
        gv_transcript_value, gv_paragraphs_value, num_words=50
    )

    preview = json.dumps(stamped[:4], indent=2) + '...'

    return {output_paragraphs_timestamps: preview, gv_paragraphs: stamped}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Merge paragraphs and TOC titles into chapter records.

    Shows a 4-entry JSON preview in ``output_chapters`` and stores the full
    chapter list in the ``gv_chapters`` state.
    """
    chapters = get_chapters(gv_paragraphs_value, gv_toc_value)

    preview = json.dumps(chapters[:4], indent=2) + '...'

    return {output_chapters: preview, gv_chapters: chapters}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def gradio_get_markdown(gv_chapters_value):
    """Render the chapter list as a Markdown document for display."""
    return chapters_to_markdown(gv_chapters_value)
|
| 70 |
+
|
| 71 |
+
# Step-by-step demo pipeline: video id -> transcript -> paragraphs -> TOC
# -> timestamps -> chapters -> Markdown. Each stage keeps its full result
# in a gr.State and shows a truncated JSON preview in a textbox.
with gr.Blocks() as app:

    # --- Stage 1: fetch the raw transcript -------------------------------
    gr.Markdown("## Get transcript")

    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")

    get_transcript_button.click(
        gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # --- Stage 2: transcript -> paragraphs -------------------------------
    gr.Markdown("## Transcript to paragraphs")

    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(label="Paragraphs (JSON format - paragraph_number, paragraph_text)")

    get_paragraphs_button.click(
        gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # --- Stage 3: paragraphs -> table of contents ------------------------
    gr.Markdown("## Get table of content")

    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(label="Table of content (JSON format - paragraph_number, title)")

    get_toc_button.click(
        gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # --- Stage 4: infer paragraph timestamps -----------------------------
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")

    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)")

    # Note: this stage overwrites gv_paragraphs with the timestamped list.
    get_timestamps_button.click(
        gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # --- Stage 5: merge paragraphs + TOC into chapters -------------------
    gr.Markdown("## Get chapters")

    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")

    get_chapters_button.click(
        gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # --- Stage 6: render chapters as Markdown ----------------------------
    gr.Markdown("## Markdown formatting")

    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")

    get_markdown_button.click(
        gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )

app.launch(debug=True)
|
requirements.txt
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
youtube-transcript-api
|
| 2 |
+
openai
|
| 3 |
+
gradio
|