File size: 4,139 Bytes
22e8001
09131b8
45a8b80
31f6311
8b1e0fa
3ae41e8
31f6311
 
 
b16f66b
4ee8f27
22e8001
 
 
 
 
 
 
 
2d161b2
a821a32
363b20b
 
a821a32
d9c827a
45a8b80
a821a32
b16f66b
 
31f6311
 
 
 
 
 
a5c5bd4
31f6311
 
 
a5c5bd4
31f6311
f193ea1
 
 
 
31f6311
 
 
 
 
 
 
 
 
 
 
3ae41e8
31f6311
a821a32
22e8001
 
 
 
9b7e8f2
a821a32
22e8001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa531c0
22e8001
bbd21ee
775c750
bbd21ee
09131b8
775c750
e76d38b
 
22e8001
9d28597
09131b8
22e8001
ee10cd9
22e8001
 
 
31f6311
06c8ee9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter
from groq import Groq
import os

# Shared Groq API client. The key is read from the "api_key" environment
# variable; None is passed through when the variable is not set.
client = Groq(api_key=os.getenv("api_key"))

# Number of chunk textboxes; handler outputs are sized to this count.
max_textboxes = 5

def process_youtube_url(url, language):
    """Load a YouTube transcript, chunk it, and stream a Japanese explanation.

    Generator used as the Gradio handler. Every yielded tuple holds
    ``max_textboxes + 4`` values, in this order: the chunk textboxes, the
    "Available Languages" slot, the "Recommended Language" slot, the
    character count and the summary text.

    On success one tuple is yielded per streamed completion chunk, each
    carrying the summary accumulated so far; the languages slot receives an
    empty list and the recommended-language slot receives the full transcript
    text. On any exception a single tuple is yielded instead: the error
    message in every chunk textbox, the languages parsed out of that message,
    a count of 0 and an empty summary.

    Args:
        url: YouTube video URL handed to ``YoutubeLoader.from_youtube_url``.
        language: Transcript language code; passed on as a one-element list.

    Yields:
        tuple: ``max_textboxes + 4`` output values as described above.
    """
    try:
        loader = YoutubeLoader.from_youtube_url(
            youtube_url=url,
            add_video_info=True,
            language=[language],
        )
        docs = loader.load()
        # String form of the loaded documents object, used both for display
        # and as the prompt payload.
        text = str(docs)

        # This is a character count (len of a str), not a token count.
        char_count = len(text)

        text_splitter = TokenTextSplitter(chunk_size=30_000, chunk_overlap=0)
        chunks = text_splitter.split_text(text)

        # Always produce exactly max_textboxes values: surplus chunks are
        # dropped and missing ones are padded with "", so the yielded tuple
        # never has more values than there are output components.
        output_textboxes = list(chunks[:max_textboxes])
        output_textboxes += [""] * (max_textboxes - len(output_textboxes))

        # NOTE(review): the whole transcript is sent in a single request; a
        # long transcript may exceed the model's context window -- confirm.
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {
                    "role": "system",
                    "content": "lang:日本語 あなたは日本語話者の解説系Youtuberです。"
                },
                {
                    "role": "user",
                    "content": f"lang:日本語 日本語で次のtranscriptを解説して\n\n## trascript \n{text}"
                },
                {
                    # Prefilled start of the answer. The role must be spelled
                    # "assistant"; the previous misspelling is not a valid role.
                    "role": "assistant",
                    "content": "この動画は、"
                },
            ],
            temperature=0.7,
            max_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )

        summarized_text = ""
        for chunk in completion:
            # delta.content may be None on some stream events.
            summarized_text += chunk.choices[0].delta.content or ""
            yield (*output_textboxes, [], text, char_count, summarized_text)

    except Exception as e:
        # UI boundary: surface the failure in the outputs instead of raising.
        error_msg = str(e)
        available_languages = extract_available_languages(error_msg)
        recommended_language = extract_recommended_language(error_msg)
        error_boxes = [error_msg] * max_textboxes
        yield (*error_boxes, available_languages, recommended_language, 0, "")


def extract_available_languages(error_msg):
    """Collect the languages listed after the "(GENERATED)" marker.

    The message is scanned line by line. Once a line starting with
    "(GENERATED)" has been seen, every later line shaped like
    `` - <code> (<name>)`` contributes the string ``"<name> (<code>)"``.
    Anything after the closing parenthesis of the name (for example a
    ``[TRANSLATABLE]`` marker) is dropped, and bullet lines that lack the
    " (" separator are skipped instead of raising.

    NOTE(review): the marker flag is never cleared, so bullet lines of any
    section that follows "(GENERATED)" are collected as well -- confirm this
    is intended.

    Args:
        error_msg: Error text to parse (presumably from a failed transcript
            lookup; verify against the caller).

    Returns:
        list[str]: Formatted ``"<name> (<code>)"`` entries, possibly empty.
    """
    languages = []
    in_generated = False
    for line in error_msg.split("\n"):
        if line.startswith("(GENERATED)"):
            in_generated = True
            continue
        if not (in_generated and line.startswith(" - ")):
            continue

        lang_code, separator, remainder = line[3:].partition(" (")
        if not separator:
            # Malformed bullet: nothing to split into code and name.
            continue

        # Cut at the LAST ")" so names that contain parentheses stay intact
        # and any trailing marker after the name is discarded.
        lang_name, closing, _ = remainder.rpartition(")")
        if not closing:
            lang_name = remainder
        languages.append(f"{lang_name} ({lang_code})")
    return languages

def extract_recommended_language(error_msg):
    """Return the first "[TRANSLATABLE]" language listed after "(GENERATED)".

    The message is scanned line by line. After a line starting with
    "(GENERATED)" has been seen, the first later line shaped like
    `` - <code> (<name>)`` that also contains "[TRANSLATABLE]" is returned as
    ``"<name> (<code>)"``. Text after the closing parenthesis of the name
    (the ``[TRANSLATABLE]`` marker itself) is not included in the result, and
    bullet lines without the " (" separator are skipped instead of raising.

    Args:
        error_msg: Error text to parse (presumably from a failed transcript
            lookup; verify against the caller).

    Returns:
        str: The formatted language, or "" when no matching line exists.
    """
    in_generated = False
    for line in error_msg.split("\n"):
        if line.startswith("(GENERATED)"):
            in_generated = True
            continue
        if not (in_generated and line.startswith(" - ")):
            continue
        if "[TRANSLATABLE]" not in line:
            continue

        lang_code, separator, remainder = line[3:].partition(" (")
        if not separator:
            # Malformed bullet: keep looking at the following lines.
            continue

        # Cut at the LAST ")" so the trailing marker is dropped while names
        # containing parentheses stay intact.
        lang_name, closing, _ = remainder.rpartition(")")
        if not closing:
            lang_name = remainder
        return f"{lang_name} ({lang_code})"
    return ""

# Gradio UI wiring for process_youtube_url. The outputs list is ordered as:
# max_textboxes chunk textboxes, the available-languages dropdown, the
# recommended-language textbox, the character count and the summary markdown.
iface = gr.Interface(
    fn=process_youtube_url,
    inputs=[
        gr.Textbox(label="YouTube URL", placeholder="https://youtu.be/example"),
        gr.Dropdown(
            label="Language",
            value="ja",
            choices=["en", "en-US", "ja", "fr", "de", "it"],
            allow_custom_value=True,
        ),
    ],
    outputs=[
        # One textbox per transcript chunk, labelled chunk0, chunk1, ...
        *(
            gr.Textbox(label=f"chunk{ind}", show_copy_button=True, max_lines=5)
            for ind in range(max_textboxes)
        ),
        gr.Dropdown(label="Available Languages", allow_custom_value=True),
        gr.Textbox(label="Recommended Language", show_copy_button=True),
        gr.Number(label="Character Count"),
        gr.Markdown(label='summirized output'),
    ],
    # NOTE(review): with live=True Gradio re-runs the handler on input
    # changes, which here means a transcript load plus an LLM call -- confirm
    # this is wanted.
    live=True,
    examples=[
        ["https://youtu.be/6Af6b_wyiwI?si=zqD9-kjw24lpRJw3", "ja"],
        ["https://youtu.be/9kxL9Cf46VM?si=ADgUmDXb6riA-lgb", "ja"],
    ],
    title="YouTube Transcript Loader",
    description="Enter a YouTube URL and select the language to load the transcript using LangChain's YoutubeLoader.[buy me a coffee](https://www.buymeacoffee.com/regulusle04)",
)

# Script entry point: only start the web app when run directly, not on import.
if __name__ == "__main__":
    # Turn on the request queue before launching; presumably needed so the
    # generator handler can stream partial results -- confirm for the
    # installed Gradio version.
    iface.queue()
    # share=True is passed to launch(); per Gradio's launch options this
    # requests a public share link in addition to the local server.
    iface.launch(share=True)