File size: 4,852 Bytes
22e8001
09131b8
45a8b80
93f0416
8b1e0fa
3ae41e8
93f0416
31f6311
 
b16f66b
4ee8f27
049b4e5
22e8001
049b4e5
 
 
22e8001
 
 
 
 
 
2d161b2
a821a32
363b20b
 
a821a32
d9c827a
45a8b80
a821a32
b16f66b
 
17d83d8
 
93f0416
31f6311
9d8529b
 
 
 
 
 
 
 
 
7f17b94
 
93f0416
 
 
 
 
 
7f17b94
31f6311
 
93f0416
 
 
606db80
93f0416
 
 
31f6311
93f0416
 
 
31f6311
93f0416
 
 
 
 
3ae41e8
93f0416
a821a32
22e8001
 
 
 
17d83d8
a821a32
22e8001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa531c0
22e8001
bbd21ee
775c750
bbd21ee
09131b8
775c750
e76d38b
 
22e8001
9d28597
09131b8
22e8001
ee10cd9
22e8001
 
 
31f6311
06c8ee9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter
import anthropic
import os

# Shared Anthropic client; the key is read from the "api_key" environment variable.
client = anthropic.Anthropic(api_key=os.getenv("api_key"))

# Number of chunk textboxes the UI shows.
max_textboxes = 5

def process_youtube_url(url="", language="en"):
    """Load a YouTube transcript, show it in chunks, and stream an explanation.

    This is a generator. Every yielded tuple is one complete set of output
    values, always ``max_textboxes + 4`` long, in this order:

    1. ``max_textboxes`` chunk strings (padded with ``""``),
    2. a list of available languages (only filled on error),
    3. the full transcript text (or the recommended language on error),
    4. the transcript's character count,
    5. the explanation text accumulated so far.

    Args:
        url: YouTube video URL. An empty string yields a placeholder row only.
        language: Transcript language code handed to ``YoutubeLoader``.

    Yields:
        Tuples as described above. Any exception raised while loading or
        streaming is converted into a final row carrying the error message.
    """
    if url == "":
        # A ``return <value>`` inside a generator is discarded by whoever
        # iterates it, so the placeholder row must be yielded.
        waiting = ["I'm waiting..."] * max_textboxes
        yield *waiting, [], "", 0, ""
        return

    try:
        loader = YoutubeLoader.from_youtube_url(
            youtube_url=url,
            add_video_info=True,
            language=[language],
        )
        transcript = str(loader.load())
        char_count = len(transcript)

        splitter = TokenTextSplitter(chunk_size=30_000, chunk_overlap=0)
        chunks = splitter.split_text(transcript)

        # Always emit exactly ``max_textboxes`` chunk values: extra chunks are
        # not displayed (the model still receives the whole transcript) and
        # missing ones are padded with empty strings.
        output_textboxes = chunks[:max_textboxes]
        output_textboxes += [""] * (max_textboxes - len(output_textboxes))
        yield *output_textboxes, [], transcript, char_count, ""

        with client.messages.stream(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "あなたはだれ?"
                        }
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {
                            "type": "text",
                            "text": "わたしは日本語話者の解説系Youtuberです。"
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"lang:日本語 日本語で次のtranscriptを解説して。長くなってもいいよ\n\n## trascript \n```{transcript}```"
                        }
                    ]
                }
            ],
            system="lang:日本語 あなたは日本語話者の解説系Youtuberです。",
            model="claude-3-haiku-20240307",
            max_tokens=4096,
            temperature=0.7,
        ) as stream:
            summarized_text = ""
            # The loop variable is deliberately not called ``text`` so the
            # transcript stays intact in every yielded row.
            for delta in stream.text_stream:
                summarized_text += delta
                yield *output_textboxes, [], transcript, char_count, summarized_text

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request; the
        # message is also mined for language hints.
        error_msg = str(e)
        available_languages = extract_available_languages(error_msg)
        recommended_language = extract_recommended_language(error_msg)
        error_row = [error_msg] * max_textboxes
        yield *error_row, available_languages, recommended_language, 0, ""


def extract_available_languages(error_msg):
    """Collect the language entries listed after a "(GENERATED)" header.

    Scans ``error_msg`` line by line. Once a line starting with
    ``"(GENERATED)"`` has been seen, every following line of the form
    ``" - <code> (<name>)"`` is turned into ``"<name> (<code>)"``.
    NOTE(review): the input is presumably the transcript library's
    "no transcript found" message -- confirm the exact format upstream.

    Args:
        error_msg: Error text to scan.

    Returns:
        A list of ``"<name> (<code>)"`` strings, empty when nothing matches.
    """
    languages = []
    generated_section = False
    for line in error_msg.split("\n"):
        if line.startswith("(GENERATED)"):
            generated_section = True
            continue
        if not (generated_section and line.startswith(" - ")):
            continue

        lang_code, separator, lang_name = line[3:].partition(" (")
        if not separator:
            # Entry without a " (" part: skip it instead of raising, since this
            # runs while another error is already being reported.
            continue

        # Drop a "[TRANSLATABLE]" tag before removing the closing parenthesis;
        # otherwise the wrong trailing character would be stripped.
        lang_name = lang_name.replace("[TRANSLATABLE]", "").rstrip()
        if lang_name.endswith(")"):
            lang_name = lang_name[:-1]
        languages.append(f"{lang_name} ({lang_code})")
    return languages

def extract_recommended_language(error_msg):
    """Return the first "[TRANSLATABLE]" entry after a "(GENERATED)" header.

    Scans ``error_msg`` line by line. After a line starting with
    ``"(GENERATED)"``, the first line of the form ``" - <code> (<name>)"``
    that also contains ``"[TRANSLATABLE]"`` is returned as
    ``"<name> (<code>)"``.

    Args:
        error_msg: Error text to scan.

    Returns:
        The formatted entry, or ``""`` when no such line exists.
    """
    generated_section = False
    for line in error_msg.split("\n"):
        if line.startswith("(GENERATED)"):
            generated_section = True
            continue
        if not (generated_section and line.startswith(" - ")):
            continue
        if "[TRANSLATABLE]" not in line:
            continue

        lang_code, separator, lang_name = line[3:].partition(" (")
        if not separator:
            # Entry without a " (" part: skip it instead of raising.
            continue

        # Remove the tag first so the character stripped afterwards is the
        # closing parenthesis rather than the tag's "]".
        lang_name = lang_name.replace("[TRANSLATABLE]", "").rstrip()
        if lang_name.endswith(")"):
            lang_name = lang_name[:-1]
        return f"{lang_name} ({lang_code})"
    return ""

# UI definition. The output components are listed in the same order as the
# values produced by process_youtube_url: the chunk textboxes first, then the
# language dropdown, the text field, the character count and the summary.
iface = gr.Interface(
    fn=process_youtube_url,
    inputs=[
        gr.Textbox(label="YouTube URL", placeholder="https://youtu.be/example"),
        gr.Dropdown(
            label="Language",
            value="ja",
            choices=["en", "en-US", "ja", "fr", "de", "it"],
            allow_custom_value=True,
        ),
    ],
    outputs=[
        *(
            gr.Textbox(label=f"chunk{ind}", show_copy_button=True, max_lines=5)
            for ind in range(max_textboxes)
        ),
        gr.Dropdown(label="Available Languages", allow_custom_value=True),
        gr.Textbox(label="Recommended Language", show_copy_button=True),
        gr.Number(label="Character Count"),
        gr.Markdown(label="summirized output"),
    ],
    live=True,
    examples=[
        ["https://youtu.be/6Af6b_wyiwI?si=zqD9-kjw24lpRJw3", "ja"],
        ["https://youtu.be/9kxL9Cf46VM?si=ADgUmDXb6riA-lgb", "ja"],
    ],
    title="YouTube Transcript Loader",
    description=(
        "Enter a YouTube URL and select the language to load the transcript "
        "using LangChain's YoutubeLoader."
        "[buy me a coffee](https://www.buymeacoffee.com/regulusle04)"
    ),
)

if __name__ == "__main__":
    # Enable the queue before launching; the handler is a generator function.
    iface.queue()
    iface.launch(share=True)