Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,212 +1,71 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from gradio_client import Client
|
| 3 |
-
import json
|
| 4 |
-
import logging
|
| 5 |
-
import openai
|
| 6 |
import os
|
|
|
|
|
|
|
|
|
|
| 7 |
import re
|
| 8 |
-
import html
|
| 9 |
-
|
| 10 |
-
# Logging configuration: DEBUG-and-above records go to a local log file.
logging.basicConfig(
    filename='youtube_script_extractor.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# The OpenAI API key is taken from the OPENAI_API_KEY environment variable.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 15 |
-
|
| 16 |
-
def parse_api_response(response):
    """Normalize a raw API response into a single dict.

    A ``str`` is decoded as JSON first; a non-empty ``list`` is reduced to its
    first element; the result must then be a ``dict``.

    Args:
        response: JSON text, a list, or a dict.

    Returns:
        The response as a ``dict``.

    Raises:
        ValueError: If the JSON cannot be decoded or the final value is not a
            dict. The original exception is attached as ``__cause__``.
    """
    try:
        if isinstance(response, str):
            response = json.loads(response)
        if isinstance(response, list) and len(response) > 0:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
        return response
    except Exception as e:
        logging.error(f"API 응답 파싱 실패: {str(e)}")
        # Chain the cause so the underlying decode/shape error stays visible.
        raise ValueError(f"API 응답 파싱 실패: {str(e)}") from e
|
| 28 |
-
|
| 29 |
-
def get_youtube_script(url):
    """Fetch title, description, transcript text and thumbnails for a video.

    Calls the ``/predict`` endpoint of the ``whispersound/YT_Ts_R`` Space via
    ``gradio_client.Client``, normalizes the result with
    ``parse_api_response`` and reads the first entry of its ``data`` list.

    Args:
        url: YouTube URL, forwarded as the ``youtube_url`` argument.

    Returns:
        Tuple ``(title, description, transcription_text, thumbnails)``.

    Raises:
        ValueError: If the response has no usable ``data`` entry or the
            transcript text is empty.
        Exception: Any other failure is logged with a traceback and re-raised.
    """
    logging.info(f"스크립트 추출 시작: URL = {url}")
    try:
        # Created inside the try so connection failures are logged as well.
        client = Client("whispersound/YT_Ts_R")
        result = client.predict(youtube_url=url, api_name="/predict")
        parsed_result = parse_api_response(result)

        if 'data' not in parsed_result or not parsed_result['data']:
            raise ValueError("API 응답에 유효한 데이터가 없습니다.")

        data = parsed_result["data"][0]
        title = data.get("title", "제목 없음")
        description = data.get("description", "설명 없음")
        transcription_text = data.get("transcriptionAsText", "")
        thumbnails = data.get("thumbnails", [])

        if not transcription_text:
            raise ValueError("추출된 스크립트가 없습니다.")

        logging.info("스크립트 추출 완료")
        return title, description, transcription_text, thumbnails
    except Exception:
        logging.exception("스크립트 추출 중 오류 발생")
        raise
|
| 53 |
-
|
| 54 |
-
def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single-turn prompt to the chat model and return its reply text.

    Uses ``openai.ChatCompletion.create`` with model ``gpt-4o-mini`` and one
    ``user`` message.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Exception: Any failure is logged with a traceback and re-raised.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return response['choices'][0]['message']['content']
    except Exception:
        logging.exception("LLM API 호출 중 오류 발생")
        raise
|
| 67 |
-
|
| 68 |
-
def summarize_text(title, description, text):
|
| 69 |
-
prompt = f"""
|
| 70 |
-
[์ ํ๋ธ ์์ฝ ๊ท์น]
|
| 71 |
-
1. ๋๋ ์ ํ๋ธ ์์ ์ ๋ฌธ ํด์ค๊ฐ๋ก์ ์ง์นจ์ ๋ง๊ฒ ์ด ๊ธ์ ์์ฑํ๋ผ
|
| 72 |
-
2. ์๋์ ์ ๋ชฉ๊ณผ ์ค๋ช
์ ์ด ์ ํ๋ธ ์์์ ์๋ณธ ๋ฉํ๋ฐ์ดํฐ์ด๋ค.
|
| 73 |
-
3. ๋ฐ๋์ ์ ๋ชฉ๊ณผ ์ค๋ช
์ผ๋ก ์ฃผ์ ์ ๋ฌธ๋งฅ, ์ฒ ์(Spelling)์ ๋จผ์ ํ์
ํ๊ณ , ์๋์ ๋๋ณธ์ ๋ฐ๋์ ์ง์นจ์ ๋ง๊ฒ ์์ธํ๊ฒ ์์ฝํ๋ผ
|
| 74 |
-
- ๋ฐ๋์ ์ฃผ์ด์ง ์ ๋ชฉ, ์ค๋ช
์ ์๋ ์ฒ ์(Spelling)๋ฅผ ์์ฝ์ ๋ฐ์ํ๋ผ(์๋ฌธ ๋๋ณธ์๋ ์คํ์๊ฐ ์์ ์ ์๋ค)
|
| 75 |
-
4. ๋ฐ๋์ ํ๊ธ๋ก ์์ฑํ๋ผ
|
| 76 |
-
5. ๋ฐ๋์ '์ด ์ ํ๋ธ ๋๋ณธ์', '์ด ์์์', '์ด ์ ํ๋ธ๋'๋ฑ์ ์๊ฐ์ ํํ์ ์ ์ธํ๋ผ
|
| 77 |
-
6. ์์ฝ๋ฌธ๋ง์ผ๋ก๋ ์์์ ์ง์ ์์ฒญํ ๊ฒ๊ณผ ๋์ผํ ์์ค์ผ๋ก ๋ด์ฉ์ ์ดํดํ ์ ์๋๋ก ์์ธํ ์์ฑ
|
| 78 |
-
7. ๊ธ์ ๋๋ฌด ์์ถํ๊ฑฐ๋ ํจ์ถํ์ง ๋ง๊ณ , ์ค์ํ ๋ด์ฉ๊ณผ ์ธ๋ถ์ฌํญ์ ๋ชจ๋ ํฌํจ
|
| 79 |
-
8. ๋ฐ๋์ ๋๋ณธ์ ํ๋ฆ๊ณผ ๋
ผ๋ฆฌ ๊ตฌ์กฐ๋ฅผ ์ ์ง
|
| 80 |
-
9. ๋๋ณธ์ ๋ชฉ์ ์ด๋ ์๋๋ฅผ ํ์
ํ๊ณ , ์ด๋ฅผ ์์ฝ์ ๋ฐ๋์ ๋ฐ์
|
| 81 |
-
10. ๋ฐ๋์ ์๊ฐ ์์๋ ์ฌ๊ฑด์ ์ ๊ฐ ๊ณผ์ ์ ๋ช
ํํ๊ฒ ๋ฐ์
|
| 82 |
-
11. ๋ฑ์ฅ์ธ๋ฌผ, ์ฅ์, ์ฌ๊ฑด ๋ฑ ์ค์ํ ์์๋ฅผ ์ ํํ๊ฒ ์์ฑ
|
| 83 |
-
12. ๋๋ณธ์์ ์ ๋ฌํ๋ ๊ฐ์ ์ด๋ ๋ถ์๊ธฐ๋ ํฌํจ
|
| 84 |
-
13. ๋ฐ๋์ ๊ธฐ์ ์ ์ฉ์ด๋ ์ ๋ฌธ ์ฉ์ด๊ฐ ์์ ๊ฒฝ์ฐ, ์ด๋ฅผ ์ ํํ๊ฒ ์ฌ์ฉ
|
| 85 |
-
14. ๋ฐ๋์ ํต์ฌ ์น์
(์์ฃผ์ )๋ฅผ ํ์
ํ์ฌ ์น์
์ ๋ง๊ฒ ๊ธ์ ์์ฝํ๋ผ(๊ธ์ ์์ ๊ณ ๋ คํ์ฌ ์น์
์ ๊ฐ์๋ฅผ ํ๋ ฅ์ ์ผ๋ก ์ค์ )
|
| 86 |
-
15. ๊ฐ ์น์
์ ์ ๋ชฉ(์์ฃผ์ )์๋ ๋ด์ฉ๊ณผ ์ด์ธ๋ฆฌ๋ ์ ์ ํ ์ด๋ชจ์ง๋ก ์์ฃผ์ ๋ฅผ ์์ํ๋ผ
|
| 87 |
-
16. ๊ฐ ์น์
์ ๋ด์ฉ์ Bullet Point๋ฅผ ์ฌ์ฉํ์ฌ ๊ฐ๋
์ฑ์ ๋์ฌ๋ผ(๋ฌธ์ฅ ๋จ์๋ก ๊ตฌ๋ถ)
|
| 88 |
-
[์์]
|
| 89 |
-
(๋ณ๊ฒฝ์ )
|
| 90 |
-
- ์ ํ๋ธ๋ฅผ ์ฒ์ ์์ํ๋ ์ฌ๋๋ค์ ๊ตฌ๋
์ ์์ ์กฐํ์์ ํฐ ๊ด์ฌ์ ๋๊ณ ๋งค์ผ ์ ํ๋ธ ์คํ๋์ค๋ฅผ ํ์ธํ๊ฒ ๋๋ค. ๊ทธ๋ฌ๋ ๊ตฌ๋
์๊ฐ 100๋ช
, 1,000๋ช
์ ๋๋ฌํ๋ ๊ฒ๋ง์ผ๋ก๋ ์ง์์ ์ธ ์ฑ์ฅ์ ๋์์ด ๋์ง ์๋๋ค. ๊ตฌ๋
์ ์๊ฐ ๋์ด๋ ํ์๋ ์ ํ๋ธ ์ฑ๋ ์ด์์ ๋ํ ๊ฐ์ ์ก์ง ๋ชปํด ํฌ๊ธฐํ๋ ๊ฒฝ์ฐ๊ฐ ๋ง๋ค.
|
| 91 |
-
(๋ณ๊ฒฝํ)
|
| 92 |
-
- ์ ํ๋ธ๋ฅผ ์ฒ์ ์์ํ๋ ์ฌ๋๋ค์ ๊ตฌ๋
์ ์์ ์กฐํ์์ ํฐ ๊ด์ฌ์ ๋๊ณ ๋งค์ผ ์ ํ๋ธ ์คํ๋์ค๋ฅผ ํ์ธํ๊ฒ ๋๋ค.
|
| 93 |
-
- ๊ทธ๋ฌ๋ ๊ตฌ๋
์๊ฐ 100๋ช
, 1,000๋ช
์ ๋๋ฌํ๋ ๊ฒ๋ง์ผ๋ก๋ ์ง์์ ์ธ ์ฑ์ฅ์ ๋์์ด ๋์ง ์๋๋ค.
|
| 94 |
-
- ๊ตฌ๋
์ ์๊ฐ ๋์ด๋ ๏ฟฝ๏ฟฝ์๋ ์ ํ๋ธ ์ฑ๋ ์ด์์ ๋ํ ๊ฐ์ ์ก์ง ๋ชปํด ํฌ๊ธฐํ๋ ๊ฒฝ์ฐ๊ฐ ๋ง๋ค.
|
| 95 |
-
17. ๊ฐ ์น์
์ ๋ด์ฉ์ ๋ฐ๋์ ์ถฉ์คํ๊ฒ ์์ฑ
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
current_sentence = sentence.strip()
|
| 117 |
-
else:
|
| 118 |
-
current_sentence += sentence
|
| 119 |
-
if sentence.endswith(('.', '?', '!')):
|
| 120 |
-
combined_sentences.append(current_sentence.strip())
|
| 121 |
-
current_sentence = ""
|
| 122 |
-
if current_sentence:
|
| 123 |
-
combined_sentences.append(current_sentence.strip())
|
| 124 |
-
return combined_sentences
|
| 125 |
-
|
| 126 |
-
def display_script(title, script):
    """Render the transcript as a collapsible HTML block.

    The script is split with ``split_sentences`` and the pieces are joined
    with blank lines; the ``pre-wrap`` paragraph keeps those line breaks.

    Args:
        title: Video title shown above the transcript.
        script: Raw transcript text.

    Returns:
        An HTML string wrapped in ``<div class="script-box">``.
    """
    import html

    script_sentences = split_sentences(script)
    # Title and transcript are external text: escape them so markup
    # characters display literally instead of being parsed as HTML.
    formatted_script = html.escape("\n\n".join(script_sentences))
    safe_title = html.escape(str(title))
    return f"""<div class="script-box">
    <details>
        <summary>클릭하여 펼치기</summary>
        <div class="output-title">{safe_title}</div>
        <p style="white-space: pre-wrap;">{formatted_script}</p>
    </details>
</div>"""
|
| 136 |
-
|
| 137 |
-
def display_summary(title, summary):
    """Render the summary inside the shared ``script-box`` HTML wrapper.

    Args:
        title: Video title shown above the summary; HTML-escaped here.
        summary: Summary body, inserted verbatim.
            NOTE(review): presumably already-formatted HTML — confirm
            against the caller before escaping it too.

    Returns:
        An HTML string wrapped in ``<div class="script-box">``.
    """
    import html

    # The title is external text, so markup characters must not be parsed.
    safe_title = html.escape(str(title))
    return f"""<div class="script-box">
    <div class="output-title">{safe_title}</div>
    {summary}
</div>"""
|
| 142 |
-
|
| 143 |
-
def get_thumbnail_url(thumbnails):
    """Return the URL of the 640x480 thumbnail.

    Args:
        thumbnails: Iterable of dicts with ``width``, ``height`` and ``url``
            keys; ``None`` is treated as empty.

    Returns:
        The ``url`` of the first 640x480 entry, or a Korean "not found"
        message when no entry has that size.
    """
    # `or ()` tolerates a missing/None thumbnails field.
    for thumbnail in thumbnails or ():
        if thumbnail.get("width") == 640 and thumbnail.get("height") == 480:
            return thumbnail.get("url")
    return "640x480 크기의 썸네일을 찾을 수 없습니다."
|
| 148 |
-
|
| 149 |
-
def analyze(url):
|
| 150 |
-
# ์คํฌ๋ฆฝํธ ์ถ์ถ
|
| 151 |
-
yield "์คํฌ๋ฆฝํธ ์ถ์ถ ์ค...", "์คํฌ๋ฆฝํธ ์ถ์ถ ์ค...", ""
|
| 152 |
-
title, description, script, thumbnails = get_youtube_script(url)
|
| 153 |
-
script_content = display_script(title, script)
|
| 154 |
-
thumbnail_url = get_thumbnail_url(thumbnails)
|
| 155 |
-
|
| 156 |
-
# ์๋ฌธ ์คํฌ๋ฆฝํธ ํ์ ๋ฐ ์์ฝ ์์
|
| 157 |
-
yield script_content, "์์ฝ ์์ฑ ์ค...", thumbnail_url
|
| 158 |
|
| 159 |
-
|
| 160 |
-
summary = summarize_text(title, description, script)
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
{formatted_summary}
|
| 191 |
-
</div>"""
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
# Gradio ์ธํฐํ์ด์ค
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
inputs=[youtube_url_input],
|
| 208 |
-
outputs=[script_output, summary_output, thumbnail_output] # thumbnail_output ์ถ๊ฐ
|
| 209 |
-
)
|
| 210 |
-
|
| 211 |
-
if __name__ == "__main__":
|
| 212 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import gradio as gr
|
| 5 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# RapidAPI credentials: the key comes from the AA_KEY environment variable
# (set as a Hugging Face secret); the host is the fixed RapidAPI endpoint.
AA_KEY = os.environ.get("AA_KEY")
AA_HOST = "youtube-transcriptor.p.rapidapi.com"
|
| 10 |
+
|
| 11 |
+
# Patterns tried in order; each captures the video id in group 1.
_VIDEO_ID_PATTERNS = (
    # `v` query parameter; anchored on ?/& so `tv=` and similar do not match.
    re.compile(r"(?:^|[?&])v=([^#&?]+)"),
    # youtu.be short links (dot escaped so only a literal "." matches).
    re.compile(r"youtu\.be/([^#&?/]+)"),
    # Path-style URLs: /shorts/<id>, /embed/<id>, /live/<id>.
    re.compile(r"youtube\.com/(?:shorts|embed|live)/([^#&?/]+)"),
)


def get_video_id(youtube_url):
    """Extract the video id from a YouTube URL.

    Args:
        youtube_url: A ``watch?v=`` URL, a ``youtu.be`` short link, or a
            ``/shorts/``, ``/embed/`` or ``/live/`` URL.

    Returns:
        The video id, or ``None`` when the input is empty or no non-empty id
        can be found.
    """
    if not youtube_url:
        return None
    candidate = youtube_url.strip()
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)
    return None
|
| 16 |
+
|
| 17 |
+
# Subtitle languages, listed from most to least preferred.
LANGUAGE_PRIORITY = ["ko", "en", "ja", "zh"]
|
| 19 |
+
|
| 20 |
+
# ์ ํ๋ธ ์๋ง์ ์์ฒญํ๋ ํจ์ (์ธ์ด ์ฐ์ ์์๋ฅผ ์ ์ฉํ์ฌ ์๋)
|
| 21 |
+
def get_youtube_transcript(youtube_url):
|
| 22 |
+
# ๋น๋์ค ID ์ถ์ถ
|
| 23 |
+
video_id = get_video_id(youtube_url)
|
| 24 |
+
if video_id is None:
|
| 25 |
+
return {"error": "์๋ชป๋ ์ ํ๋ธ URL์
๋๋ค. ๋น๋์ค ID๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
|
|
|
|
| 28 |
|
| 29 |
+
headers = {
|
| 30 |
+
"x-rapidapi-key": AA_KEY,
|
| 31 |
+
"x-rapidapi-host": AA_HOST
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
# ์ธ์ด ์ฐ์ ์์์ ๋ฐ๋ผ ์๏ฟฝ๏ฟฝ์ ์ผ๋ก ์์ฒญ์ ์๋
|
| 35 |
+
for lang in LANGUAGE_PRIORITY:
|
| 36 |
+
querystring = {"video_id": video_id, "lang": lang}
|
| 37 |
+
response = requests.get(url, headers=headers, params=querystring)
|
| 38 |
+
|
| 39 |
+
# ์ํ ์ฝ๋ ํ์ธ ๋ฐ ์ ์ฒด ์๋ต ๋ฐํ
|
| 40 |
+
if response.status_code == 200:
|
| 41 |
+
try:
|
| 42 |
+
data = response.json()
|
| 43 |
+
|
| 44 |
+
# ์ ์ฒด ์๋ต ๋ฐ์ดํฐ๋ฅผ ๊ทธ๋๋ก ๋ฐํ
|
| 45 |
+
return {"language": lang, "data": data}
|
| 46 |
+
|
| 47 |
+
except json.JSONDecodeError as e:
|
| 48 |
+
return {"error": f"JSON ๋์ฝ๋ฉ ์ค๋ฅ ๋ฐ์: {str(e)}"}
|
| 49 |
+
|
| 50 |
+
# ๋ชจ๋ ์ธ์ด์์ ์๋ง์ ์ฐพ์ง ๋ชปํ ๊ฒฝ์ฐ
|
| 51 |
+
return {"error": "์ฐ์ ์์ ์ธ์ด๋ก ์๋ง์ ์ฐพ์ ์ ์์ต๋๋ค."}
|
| 52 |
+
|
| 53 |
+
def youtube_transcript_interface(youtube_url):
    """Gradio handler: return the transcript lookup result as JSON text.

    Args:
        youtube_url: URL typed into the text input.

    Returns:
        The dict from ``get_youtube_transcript`` serialized with two-space
        indentation and non-ASCII characters left unescaped.
    """
    return json.dumps(
        get_youtube_transcript(youtube_url),
        ensure_ascii=False,
        indent=2,
    )
|
| 60 |
+
|
| 61 |
+
# Gradio ์ธํฐํ์ด์ค ์์ฑ
|
| 62 |
+
interface = gr.Interface(
|
| 63 |
+
fn=youtube_transcript_interface,
|
| 64 |
+
inputs="text",
|
| 65 |
+
outputs="text",
|
| 66 |
+
title="YouTube ์๋ง ์ถ์ถ๊ธฐ",
|
| 67 |
+
description="์ ํ๋ธ URL์ ์
๋ ฅํ์ธ์."
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Gradio ์ธํฐํ์ด์ค ์คํ
|
| 71 |
+
interface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|