File size: 2,892 Bytes
22e8001
09131b8
45a8b80
4ee8f27
3ae41e8
b16f66b
4ee8f27
22e8001
 
 
 
 
 
 
 
2d161b2
 
45a8b80
481f3ca
45a8b80
 
b16f66b
 
3ae41e8
20d0c84
22e8001
 
 
 
1cb9b51
22e8001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa531c0
22e8001
bbd21ee
033d900
bbd21ee
09131b8
2d161b2
 
22e8001
09131b8
22e8001
ee10cd9
22e8001
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import gradio as gr
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter
from fastapi import FastAPI

# Number of chunk output textboxes created for the interface;
# process_youtube_url pads its chunk outputs up to this count.
max_textboxes = 5

def _fit_chunks_to_textboxes(chunks, box_count):
    """Return exactly ``box_count`` strings built from ``chunks``.

    Missing slots are padded with empty strings. When there are more chunks
    than slots, the overflow is concatenated into the last slot so that no
    text is dropped and the number of returned values stays fixed.
    """
    if box_count <= 0:
        return []
    fitted = list(chunks)
    if len(fitted) > box_count:
        # Chunks are split with no overlap, so joining the tail back together
        # keeps the remaining text contiguous.
        fitted = fitted[: box_count - 1] + ["".join(fitted[box_count - 1:])]
    return fitted + [""] * (box_count - len(fitted))


def process_youtube_url(url, language):
    """Load a YouTube transcript and split it into chunks for the UI.

    Args:
        url: YouTube video URL passed to ``YoutubeLoader.from_youtube_url``.
        language: Transcript language code; passed to the loader as a
            single-element list.

    Returns:
        A tuple of ``max_textboxes + 3`` values, in output-component order:

        * On success: the chunk strings (padded with ``""``, or with overflow
          merged into the last one), an empty list, the full text, and the
          character count of the full text.
        * On any exception: the error message repeated ``max_textboxes``
          times, the list from ``extract_available_languages``, the string
          from ``extract_recommended_language``, and ``0``.
    """
    try:
        loader = YoutubeLoader.from_youtube_url(
            youtube_url=url,
            add_video_info=True,
            language=[language],
        )
        docs = loader.load()
        # str() of the whole document list, so the text is the list's string
        # representation rather than only the raw transcript content.
        text = str(docs)
        char_count = len(text)

        text_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=0)
        chunks = text_splitter.split_text(text)

        # Always hand back exactly max_textboxes chunk values; returning more
        # values than there are output components breaks the interface.
        output_textboxes = _fit_chunks_to_textboxes(chunks, max_textboxes)

        return *output_textboxes, [], text, char_count
    except Exception as e:
        # UI boundary: surface the failure in the textboxes instead of raising.
        error_msg = str(e)
        available_languages = extract_available_languages(error_msg)
        recommended_language = extract_recommended_language(error_msg)
        error_boxes = [error_msg] * max_textboxes
        return *error_boxes, available_languages, recommended_language, 0

_TRANSLATABLE_TAG = "[TRANSLATABLE]"


def _parse_language_line(line):
    """Turn one `` - code (Name)`` listing line into ``"Name (code)"``.

    Accepts the plain form `` - en (English)`` as well as the quoted/tagged
    form `` - en ("English")[TRANSLATABLE]``. Returns ``None`` when the line
    does not have the `` - code (name`` shape, so callers can skip it instead
    of failing on tuple unpacking.
    """
    if not line.startswith(" - "):
        return None
    lang_code, separator, lang_name = line[3:].partition(" (")
    lang_code = lang_code.strip()
    if not separator or not lang_code:
        return None

    lang_name = lang_name.strip()
    # Strip decorations from the outside in: tag, closing paren, then quotes.
    if lang_name.endswith(_TRANSLATABLE_TAG):
        lang_name = lang_name[: -len(_TRANSLATABLE_TAG)].rstrip()
    if lang_name.endswith(")"):
        lang_name = lang_name[:-1]
    if len(lang_name) >= 2 and lang_name[0] == '"' and lang_name[-1] == '"':
        lang_name = lang_name[1:-1]
    if not lang_name:
        return None
    return f"{lang_name} ({lang_code})"


def _lines_after_generated_header(error_msg):
    """Yield the lines of ``error_msg`` that follow a ``(GENERATED)`` header.

    NOTE(review): the flag is never cleared, so lines under any later section
    header are yielded too. This matches the previous behavior; confirm
    whether later sections should be excluded.
    """
    generated_section = False
    for line in (error_msg or "").splitlines():
        if line.startswith("(GENERATED)"):
            generated_section = True
        elif generated_section:
            yield line


def extract_available_languages(error_msg):
    """Collect language entries listed after the ``(GENERATED)`` header.

    Args:
        error_msg: Error text containing a language listing (presumably the
            "no transcript found" message raised by the loader).

    Returns:
        A list of ``"Name (code)"`` strings, one per well-formed `` - ``
        line; malformed lines are skipped. Empty if there is no
        ``(GENERATED)`` header or no entries.
    """
    parsed = (
        _parse_language_line(line)
        for line in _lines_after_generated_header(error_msg)
    )
    return [entry for entry in parsed if entry is not None]


def extract_recommended_language(error_msg):
    """Return the first ``[TRANSLATABLE]`` entry after ``(GENERATED)``.

    Args:
        error_msg: Error text containing a language listing.

    Returns:
        The entry formatted as ``"Name (code)"``, or ``""`` when no
        well-formed translatable entry is found.
    """
    for line in _lines_after_generated_header(error_msg):
        if _TRANSLATABLE_TAG not in line:
            continue
        entry = _parse_language_line(line)
        if entry is not None:
            return entry
    return ""

# Gradio UI: a URL box and a language dropdown feed process_youtube_url, whose
# return values fill the chunk textboxes followed by the language dropdown,
# the recommended-language textbox and the character-count field.
iface = gr.Interface(
    fn=process_youtube_url,
    inputs=[
        gr.Textbox(
            label="YouTube URL",
            placeholder="https://youtu.be/example",
        ),
        gr.Dropdown(
            label="Language",
            value="ja",
            choices=["en", "en-US", "ja", "fr", "de", "it"],
            allow_custom_value=True,
        ),
    ],
    outputs=[
        # One copyable textbox per chunk slot.
        *[
            gr.Textbox(label=f"chunk{box_index}", show_copy_button=True)
            for box_index in range(max_textboxes)
        ],
        gr.Dropdown(label="Available Languages", allow_custom_value=True),
        gr.Textbox(label="Recommended Language"),
        gr.Number(label="Character Count"),
    ],
    examples=[
        ["https://youtu.be/6Af6b_wyiwI?si=zqD9-kjw24lpRJw3", "ja"],
        ["https://youtu.be/9kxL9Cf46VM?si=ADgUmDXb6riA-lgb", "ja"],
    ],
    title="YouTube Transcript Loader",
    description="Enter a YouTube URL and select the language to load the transcript using LangChain's YoutubeLoader.[buy me a coffee](https://www.buymeacoffee.com/regulusle04)",
)

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()