# unnastyle's picture
# Create app.py
# 80b36c6 verified
print("### APP.PY LOADED ###")
import gradio as gr
from bs4 import BeautifulSoup
import re
import logging
import requests
import os
import tempfile
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
# ---------- [Module 1: original base code] start ----------
# Maps an English month name (as it appears in section headers) to the
# localized month label used when rendering date ranges.
month_mapping = dict(
    January="1์›”",
    February="2์›”",
    March="3์›”",
    April="4์›”",
    May="5์›”",
    June="6์›”",
    July="7์›”",
    August="8์›”",
    September="9์›”",
    October="10์›”",
    November="11์›”",
    December="12์›”",
)
def convert_date_range(date_range_str):
    """Convert a "<day> <Month> - <day> <Month>" range to the localized form.

    The input is split on "-" into exactly two halves; each half must contain
    at least a day token followed by a month token. English month names are
    looked up in ``month_mapping``; unknown month names are kept unchanged.

    Args:
        date_range_str: Raw date range text, e.g. "5 January - 11 January".

    Returns:
        The converted range string, or ``date_range_str`` unchanged when it
        does not have the expected two-part / two-token structure.
    """
    logging.debug("์›๋ณธ ๋‚ ์งœ ๋ฒ”์œ„: %s", date_range_str)

    halves = date_range_str.split('-')
    if len(halves) != 2:
        logging.debug("๋‚ ์งœ ๋ฒ”์œ„ ํ˜•์‹์ด ์˜ฌ๋ฐ”๋ฅด์ง€ ์•Š์Œ: %s", date_range_str)
        return date_range_str

    first, second = (half.strip() for half in halves)
    first_tokens = first.split()
    second_tokens = second.split()
    if min(len(first_tokens), len(second_tokens)) < 2:
        logging.debug("๋‚ ์งœ ๊ตฌ์„ฑ์š”์†Œ ๋ถ€์กฑ: %s, %s", first, second)
        return date_range_str

    # Only the first two tokens of each half are used: day, then month name.
    start_day, start_month_en = first_tokens[:2]
    end_day, end_month_en = second_tokens[:2]
    start_month = month_mapping.get(start_month_en, start_month_en)
    end_month = month_mapping.get(end_month_en, end_month_en)

    converted = f"{start_month} {start_day}์ผ ~ {end_month} {end_day}์ผ"
    logging.debug("๋ณ€ํ™˜๋œ ๋‚ ์งœ ๋ฒ”์œ„: %s", converted)
    return converted
def process_html(html_text):
    """Parse a course page and render every section as one text report.

    The subject name is taken from the first ``<h1>``. Sections are the
    ``<li>`` elements whose id starts with ``section-<n>``: section 0 is
    skipped, section 1 is labelled "Introduction", and section n (n >= 2) is
    labelled as week n-1. For non-introduction sections, a date range found in
    the ``h3.sectionname > a`` header is appended to the heading. Every
    ``<iframe>`` with a non-empty ``src`` is listed under its section.

    Args:
        html_text: Full HTML source of the course page.

    Returns:
        A ``(subject_name, sections_output)`` tuple of strings.
    """
    logging.debug("์ž…๋ ฅ HTML ์ฒ˜๋ฆฌ ์‹œ์ž‘")
    soup = BeautifulSoup(html_text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        subject_name = title_tag.get_text(strip=True)
        logging.debug("์ถ”์ถœ๋œ ๊ณผ๋ชฉ๋ช…: %s", subject_name)
    else:
        subject_name = ""
        logging.debug("h1 ํƒœ๊ทธ๋ฅผ ์ฐพ์ง€ ๋ชปํ•จ")

    # Output is collected piecewise and joined once at the end.
    chunks = []
    section_items = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์€ ์„น์…˜ ๊ฐœ์ˆ˜: %d", len(section_items))

    for item in section_items:
        section_id = item.get("id")
        logging.debug("์ฒ˜๋ฆฌ ์ค‘์ธ ์„น์…˜ ID: %s", section_id)

        id_match = re.search(r"section-(\d+)", section_id)
        if id_match is None:
            continue
        sec_num = int(id_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 ์€ ์Šคํ‚ต")
            continue

        is_intro = sec_num == 1
        if is_intro:
            section_label = "Introduction"
        else:
            week_num = sec_num - 1
            section_label = f"{week_num}์ฃผ์ฐจ"

        # Look for a "<day> <Month> - <day> <Month>" range in the header link.
        date_range_text = ""
        heading_tag = item.find("h3", class_="sectionname")
        link = heading_tag.find("a") if heading_tag else None
        if link:
            header_text = link.get_text(strip=True)
            logging.debug("ํ—ค๋” ํ…์ŠคํŠธ: %s", header_text)
            found = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if found:
                date_range_text = convert_date_range(found.group(1))
            else:
                logging.debug("๋‚ ์งœ ๋ฒ”์œ„ ํŒจํ„ด ๋งค์นญ ์‹คํŒจ: %s", header_text)
        else:
            logging.debug("h3 ๋˜๋Š” h3 ๋‚ด a ํƒœ๊ทธ๋ฅผ ์ฐพ์ง€ ๋ชปํ•จ for section: %s", section_id)

        # The introduction heading never carries a date range.
        if is_intro or not date_range_text:
            section_heading = f"์„น์…˜ : {section_label}"
        else:
            section_heading = f"์„น์…˜ : {section_label} ({date_range_text})"
        chunks.append(section_heading + "\n")

        frames = item.find_all("iframe")
        logging.debug("์„น์…˜ %s ๋‚ด ์ฐพ์€ iframe ๊ฐœ์ˆ˜: %d", section_id, len(frames))
        # idx counts every iframe, including those skipped for an empty src.
        for idx, frame in enumerate(frames, start=1):
            video_url = frame.get("src", "").strip()
            if not video_url:
                continue
            chunks.append(f"๊ฐ•์˜{idx} : {video_url}\n")
            logging.debug("์ถ”์ถœ๋œ ๋™์˜์ƒ ๊ฐ•์˜ URL: %s", video_url)

        chunks.append("\n")

    logging.debug("HTML ์ฒ˜๋ฆฌ ์™„๋ฃŒ")
    return subject_name, "".join(chunks)
def process_html_sections(html_text):
    """Parse a course page into per-section lecture lists for the UI.

    Section discovery and labelling follow the same rules as ``process_html``
    (section 0 skipped, section 1 is the introduction, later sections are
    numbered weeks with an optional date range), but the result is kept per
    section instead of being flattened into one string.

    Args:
        html_text: Full HTML source of the course page.

    Returns:
        A 3-tuple ``(subject_name, dropdown_update, sections_dict)`` where
        ``dropdown_update`` is a ``gr.update`` carrying the section labels as
        choices (first label pre-selected, or ``None`` when there are none)
        and ``sections_dict`` maps each label to ``(lectures_text, url_list)``.
    """
    logging.debug("์ž…๋ ฅ HTML ์ฒ˜๋ฆฌ ์‹œ์ž‘ (์„น์…˜๋ณ„ ๋ถ„๋ฆฌ) len=%d", len(html_text) if html_text else 0)
    soup = BeautifulSoup(html_text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        subject_name = title_tag.get_text(strip=True)
        logging.debug("์ถ”์ถœ๋œ ๊ณผ๋ชฉ๋ช…: %s", subject_name)
    else:
        subject_name = ""
        logging.debug("h1 ํƒœ๊ทธ๋ฅผ ์ฐพ์ง€ ๋ชปํ•จ")

    sections_dict = {}
    section_items = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์€ ์„น์…˜ ๊ฐœ์ˆ˜: %d", len(section_items))

    for item in section_items:
        section_id = item.get("id")
        logging.debug("์ฒ˜๋ฆฌ ์ค‘์ธ ์„น์…˜ ID: %s", section_id)

        id_match = re.search(r"section-(\d+)", section_id)
        if id_match is None:
            continue
        sec_num = int(id_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 ์€ ์Šคํ‚ต")
            continue

        if sec_num == 1:
            section_label = "์„น์…˜ : Introduction"
        else:
            week_num = sec_num - 1
            section_label = f"์„น์…˜ : {week_num}์ฃผ์ฐจ"

        # Extract an optional date range from the section header link.
        date_range_text = ""
        heading_tag = item.find("h3", class_="sectionname")
        link = heading_tag.find("a") if heading_tag else None
        if link:
            header_text = link.get_text(strip=True)
            found = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if found:
                date_range_text = convert_date_range(found.group(1))
        if date_range_text and sec_num != 1:
            section_label += f" ({date_range_text})"

        frames = item.find_all("iframe")
        logging.debug("์„น์…˜ %s ๋‚ด ์ฐพ์€ iframe ๊ฐœ์ˆ˜: %d", section_id, len(frames))

        lecture_lines = []
        url_list = []
        # idx counts every iframe, including those skipped for an empty src.
        for idx, frame in enumerate(frames, start=1):
            video_url = frame.get("src", "").strip()
            if not video_url:
                continue
            lecture_lines.append(f"๊ฐ•์˜{idx} : {video_url}")
            url_list.append(video_url)

        if url_list:
            lectures_str = "\n".join(lecture_lines)
        else:
            lectures_str = "๊ฐ•์˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค"
        # A repeated label overwrites the earlier entry for that label.
        sections_dict[section_label] = (lectures_str.strip(), url_list)

    sections_list = list(sections_dict)
    default_val = next(iter(sections_list), None)
    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict
def update_lecture_text_only(selected_section, sections_dict):
    """Return the lecture list text stored for the selected section.

    Args:
        selected_section: Section label chosen in the dropdown.
        sections_dict: Mapping of label -> ``(lectures_text, url_list)``.

    Returns:
        ``""`` when either argument is empty/None; otherwise the stored
        lecture text, or the "no lectures" placeholder when the label is
        unknown or its stored text is empty.
    """
    if not (selected_section and sections_dict):
        return ""

    stored_text, _urls = sections_dict.get(selected_section, ("", []))
    lectures_text = stored_text if stored_text else "๊ฐ•์˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค"
    logging.debug("update_lecture_text_only - ์„ ํƒ๋œ ์„น์…˜: %s", selected_section)
    return lectures_text
# ---------- [Module 1: original base code] end ----------
# ---------- [Module 2: additional code] start ----------
def fetch_page_source(url):
    """Download a lecture page and return its HTML text.

    Args:
        url: Address of the lecture page.

    Returns:
        The response body on success. On any exception the error is logged
        and a string starting with the error marker followed by the exception
        text is returned instead of raising.
    """
    try:
        logging.debug("๊ฐ•์˜ ํŽ˜์ด์ง€๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘: %s", url)
        resp = requests.get(url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("ํŽ˜์ด์ง€ ์†Œ์Šค๋ฅผ ์„ฑ๊ณต์ ์œผ๋กœ ๊ฐ€์ ธ์˜ด len=%d", len(body))
        return body
    except Exception as exc:
        # Callers detect failure by checking for this marker prefix.
        logging.error("ํŽ˜์ด์ง€ ์†Œ์Šค ๊ฐ€์ ธ์˜ค๊ธฐ ์˜ค๋ฅ˜: %s", exc, exc_info=True)
        return "์˜ค๋ฅ˜ ๋ฐœ์ƒ: " + str(exc)
def _normalize_vimeo_url(url_str: str) -> str:
if not url_str:
return ""
return url_str.replace("\\u0026", "&").replace("\\/", "/")
def create_script_url(lecture_url):
    """Find the first text-track (subtitle) URL in a lecture page.

    The page is fetched with ``fetch_page_source`` and searched for the first
    ``"url"`` value inside the ``"text_tracks"`` array.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        An absolute script URL, or ``""`` when the page could not be fetched
        or no text track was found.
    """
    page_source = fetch_page_source(lecture_url)
    if page_source.startswith("์˜ค๋ฅ˜ ๋ฐœ์ƒ:"):
        return ""

    track_match = re.search(
        r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"',
        page_source,
    )
    if track_match is None:
        logging.debug("ํŽ˜์ด์ง€ ์†Œ์Šค์—์„œ ์Šคํฌ๋ฆฝํŠธ URL์„ ์ฐพ์ง€ ๋ชปํ•จ")
        return ""

    raw_url = _normalize_vimeo_url(track_match.group(1))
    # Absolute URLs are used as-is; prefixing them with the player host would
    # produce a broken "player.vimeo.comhttps..." address.
    if raw_url.startswith(("https://", "http://")):
        script_url = raw_url
    elif raw_url.startswith("//"):
        script_url = "https:" + raw_url
    else:
        script_url = "https://player.vimeo.com" + raw_url
    logging.debug("์Šคํฌ๋ฆฝํŠธ URL ์™„์„ฑ: %s", script_url)
    return script_url
def fetch_script(script_url):
    """Download the subtitle/script file and return its text.

    Args:
        script_url: Absolute URL of the text track.

    Returns:
        The response body on success. On any exception the error is logged
        and a string starting with the error marker followed by the exception
        text is returned instead of raising.
    """
    try:
        logging.debug("์Šคํฌ๋ฆฝํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘: %s", script_url)
        resp = requests.get(script_url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("์Šคํฌ๋ฆฝํŠธ๋ฅผ ์„ฑ๊ณต์ ์œผ๋กœ ๊ฐ€์ ธ์˜ด len=%d", len(body))
        return body
    except Exception as exc:
        # Callers detect failure by checking for this marker.
        logging.error("์Šคํฌ๋ฆฝํŠธ ๊ฐ€์ ธ์˜ค๊ธฐ ์˜ค๋ฅ˜: %s", exc, exc_info=True)
        return "์˜ค๋ฅ˜ ๋ฐœ์ƒ: " + str(exc)
# One cue timestamp: optional hours, then mm:ss, then an optional fraction.
# Accepts both "." (WebVTT) and "," (SRT) as the fraction separator.
_CUE_TIMESTAMP = r'(?:\d+:)?\d{1,2}:\d{2}(?:[.,]\d{1,3})?'
# A full cue timing line: "<start> --> <end>" optionally followed by cue
# settings such as "align:start position:0%".
_CUE_TIMING_RE = re.compile(
    r'^' + _CUE_TIMESTAMP + r'\s*-->\s*' + _CUE_TIMESTAMP + r'(?:\s.*)?$'
)
# A line consisting only of digits (a cue sequence number).
_CUE_INDEX_RE = re.compile(r'^\d+$')


def remove_timeline(script_text, lecture_number):
    """Strip subtitle bookkeeping from a caption file, leaving plain text.

    Blank lines, numeric cue indices and cue timing lines are dropped. The
    remaining caption lines are joined with single spaces so that words on
    consecutive lines do not run together, a space is inserted after any
    period that is directly followed by a non-space character, and a leading
    "WEBVTT" header is removed.

    Timing lines are recognised with or without an hours component, with
    "." or "," before the fraction, and with trailing cue settings.

    Args:
        script_text: Raw caption file contents; ``None`` is treated as empty.
        lecture_number: Unused; kept so existing callers keep working.

    Returns:
        The cleaned transcript as a single line of text.
    """
    valid_lines = []
    for line in (script_text or "").splitlines():
        # A byte-order mark in front of "WEBVTT" would otherwise defeat the
        # header removal below.
        stripped_line = line.strip().lstrip("\ufeff")
        if not stripped_line:
            continue
        if _CUE_INDEX_RE.match(stripped_line):
            continue
        if _CUE_TIMING_RE.match(stripped_line):
            continue
        valid_lines.append(stripped_line)

    cleaned_text = " ".join(valid_lines)
    cleaned_text = re.sub(r'\.(\S)', r'. \1', cleaned_text)
    cleaned_text = re.sub(r'^WEBVTT\s*', '', cleaned_text)
    return cleaned_text
def process_full(lecture_url):
    """Fetch a lecture's caption script and return it as cleaned plain text.

    Steps: resolve the script URL with ``create_script_url``, download it with
    ``fetch_script``, then strip timing data with ``remove_timeline``.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        The cleaned transcript; a fixed failure message when no script URL
        could be built; or the error string produced by ``fetch_script`` when
        the download failed.
    """
    script_url = create_script_url(lecture_url)
    if not script_url:
        return "์Šคํฌ๋ฆฝํŠธ URL ์ƒ์„ฑ ์‹คํŒจ"

    script_text = fetch_script(script_url)
    # fetch_script signals failure with a string that *starts with* the error
    # marker. Testing the prefix (as create_script_url does) avoids treating a
    # real transcript that merely contains the phrase as a failed download.
    if script_text.startswith("์˜ค๋ฅ˜ ๋ฐœ์ƒ:"):
        return script_text

    return remove_timeline(script_text, 1)
# ---------- [Module 2: additional code] end ----------
# ---------- [Module 3: merge/save] ----------
def merge_contents_global(l1, l2, l3):
    """Combine up to three lecture transcripts into one labelled text.

    Each non-blank transcript is stripped and prefixed with its numbered
    label on a separate line; the resulting blocks are separated by one blank
    line. Blank transcripts are omitted entirely.

    Args:
        l1: Transcript of lecture 1.
        l2: Transcript of lecture 2.
        l3: Transcript of lecture 3.

    Returns:
        The merged text, or ``""`` when all three transcripts are blank.
    """
    labelled = (
        ("[๊ฐ•์˜1]\n", l1),
        ("[๊ฐ•์˜2]\n", l2),
        ("[๊ฐ•์˜3]\n", l3),
    )
    blocks = [label + text.strip() for label, text in labelled if text.strip()]
    return "\n\n".join(blocks)
def handle_fetch_all(lecture_list_text):
    """Fetch transcripts for the first three lectures listed in the text.

    Each line of ``lecture_list_text`` that matches the
    "<lecture label><number> : <url>" pattern contributes one URL. Only the
    first three URLs are used; missing slots are filled with ``""``.

    Args:
        lecture_list_text: Multi-line lecture list; ``None`` counts as empty.

    Returns:
        A 7-tuple: the three URLs, the three transcripts (``""`` for an
        empty URL slot), and the merged transcript text.
    """
    logging.debug("๊ฐ•์˜ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ ๋ฒ„ํŠผ ํด๋ฆญ๋จ. len=%d", len(lecture_list_text) if lecture_list_text else 0)

    line_pattern = re.compile(r"๊ฐ•์˜\d+\s*:\s*(.+)")
    found_urls = []
    for raw_line in (lecture_list_text or "").splitlines():
        hit = line_pattern.match(raw_line.strip())
        if hit:
            found_urls.append(hit.group(1).strip())

    # Exactly three slots: truncate extras, pad the remainder with "".
    urls = (found_urls + ["", "", ""])[:3]
    lec_contents = [process_full(url) if url else "" for url in urls]
    merged = merge_contents_global(*lec_contents)
    return (*urls, *lec_contents, merged)
def _sanitize_filename(name: str) -> str:
if not name:
return "output"
return re.sub(r'[\\/:*?"<>|]+', "_", name).strip()
def _extract_section_token(section_label: str) -> str:
if not section_label:
return "์„น์…˜"
s = section_label.strip().replace("์„น์…˜ : ", "").strip()
s = re.sub(r"\s*\(.*\)\s*$", "", s).strip()
return s if s else "์„น์…˜"
def save_merged_to_txt(subject_name, selected_section, merged_text):
    """Write the merged transcript to a UTF-8 text file in the temp directory.

    The file name is built from the subject name (or a placeholder when it is
    empty) and the short section token, then passed through
    ``_sanitize_filename``.

    Args:
        subject_name: Course subject name; may be empty.
        selected_section: Section label selected in the dropdown.
        merged_text: Text to write; ``None`` is written as an empty file.

    Returns:
        The path of the written file, or ``None`` if any step failed (the
        exception is logged, not raised).
    """
    try:
        content = "" if merged_text is None else merged_text
        subj = subject_name or "๊ณผ๋ชฉ๋ช…"
        sec_token = _extract_section_token(selected_section)
        filename = _sanitize_filename(f"{subj}(์Šคํฌ๋ฆฝํŠธ)_{sec_token}.txt")
        path = os.path.join(tempfile.gettempdir(), filename)
        with open(path, "w", encoding="utf-8") as out_file:
            # The content is stored exactly as received, with no edits.
            out_file.write(content)
        logging.debug("TXT ์ €์žฅ ์™„๋ฃŒ: %s", path)
    except Exception as exc:
        logging.exception("save_merged_to_txt ์˜ค๋ฅ˜: %s", exc)
        return None
    return path
def load_html_from_file(file_obj):
    """Read an uploaded HTML/text file and return its contents with a status.

    Args:
        file_obj: Either an object exposing the file path as ``.name`` or a
            value whose ``str()`` is the path; ``None`` means nothing was
            uploaded.

    Returns:
        A ``(data, status)`` tuple. ``data`` is the file text (read as UTF-8)
        and ``status`` an "OK" message including its length; on a missing
        upload or any exception ``data`` is ``""`` and ``status`` describes
        the problem.
    """
    try:
        logging.debug("load_html_from_file ์ง„์ž…")
        if file_obj is None:
            return "", "์˜ค๋ฅ˜: ์—…๋กœ๋“œ๋œ ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค."

        # Prefer the .name attribute; otherwise treat the value as the path.
        try:
            file_path = file_obj.name
        except AttributeError:
            file_path = str(file_obj)

        logging.debug("ํŒŒ์ผ์—์„œ HTML ๋กœ๋“œ: %s", file_path)
        with open(file_path, "r", encoding="utf-8") as src:
            data = src.read()
        return data, f"OK: ํŒŒ์ผ ๋กœ๋“œ ์„ฑ๊ณต (len={len(data)})"
    except Exception as e:
        logging.exception("load_html_from_file ์˜ค๋ฅ˜: %s", e)
        return "", f"์˜ค๋ฅ˜: {repr(e)}"
# Frontend-backend connection test: status_out must change when the button is clicked.
def test_click(_):
logging.debug("### BUTTON CLICKED (test_click) ###")
return "clicked"
# ---------- Gradio app ----------
# Builds the Gradio UI and wires each button/dropdown to its handler.
# NOTE(review): the original indentation was lost in this copy of the file, so
# the nesting of the `with` blocks below (which components sit inside the Tab,
# Rows and Columns) is a reconstruction -- confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown("# ์บ๋กค๋ผ์ธ๋Œ€ํ•™ ๊ฐ•์˜ ์ถ”์ถœ๊ธฐ Ver.2.4")
    with gr.Tab("HTML ํŒŒ์‹ฑ ๋ฐ ์„น์…˜ ์„ ํƒ"):
        with gr.Row():
            # Input widgets: pasted HTML, file upload, action buttons, status box.
            with gr.Column():
                html_input = gr.Textbox(label="์ „์ฒด ํŽ˜์ด์ง€ HTML ์ž…๋ ฅ", lines=20, placeholder="HTML ์ฝ”๋“œ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...")
                html_file = gr.File(label="HTML ํŒŒ์ผ ์—…๋กœ๋“œ(.txt/.html)", file_types=[".txt", ".html", ".htm"])
                load_btn = gr.Button("์—…๋กœ๋“œ ํŒŒ์ผ์„ ์ž…๋ ฅ์ฐฝ์— ๋ถˆ๋Ÿฌ์˜ค๊ธฐ")
                parse_btn = gr.Button("Submit")
                # Connection test button
                test_btn = gr.Button("์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ(ํด๋ฆญ ํ™•์ธ)")
                status_out = gr.Textbox(label="์ƒํƒœ/๋””๋ฒ„๊ทธ", interactive=False)
            # Output widgets: subject name, section dropdown, lecture list.
            with gr.Column():
                subject_out = gr.Textbox(label="๊ณผ๋ชฉ๋ช…", interactive=False)
                section_dropdown = gr.Dropdown(label="์„น์…˜ ์„ ํƒ", choices=[], interactive=True)
                lecture_out = gr.Textbox(label="์„ ํƒํ•œ ์„น์…˜ ๊ฐ•์˜ ๋ชฉ๋ก", lines=10, interactive=False)
                fetch_all_btn = gr.Button("๊ฐ•์˜ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ", elem_id="fetch_all_btn")
        # Holds the sections_dict returned by process_html_sections.
        sections_state = gr.State()
        # Load the uploaded file's text into the HTML input box.
        load_btn.click(
            fn=load_html_from_file,
            inputs=[html_file],
            outputs=[html_input, status_out]
        )
        # Connection test: status_out should change to "clicked"
        test_btn.click(
            fn=test_click,
            inputs=[html_input],
            outputs=[status_out]
        )
        # Parse the HTML and fill the subject, the dropdown and the state.
        parse_btn.click(
            fn=process_html_sections,
            inputs=html_input,
            outputs=[subject_out, section_dropdown, sections_state]
        )
        # Show the lecture list of the section chosen in the dropdown.
        section_dropdown.change(
            fn=update_lecture_text_only,
            inputs=[section_dropdown, sections_state],
            outputs=lecture_out
        )
        gr.Markdown("## ๊ฐ•์˜ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ")
        with gr.Row():
            url1 = gr.Textbox(label="๊ฐ•์˜1 URL", elem_id="url1")
            url2 = gr.Textbox(label="๊ฐ•์˜2 URL", elem_id="url2")
            url3 = gr.Textbox(label="๊ฐ•์˜3 URL", elem_id="url3")
        with gr.Row():
            lecture_content1 = gr.Textbox(label="๊ฐ•์˜ ๋‚ด์šฉ", lines=10, elem_id="lecture_content1")
            lecture_content2 = gr.Textbox(label="๊ฐ•์˜ ๋‚ด์šฉ", lines=10, elem_id="lecture_content2")
            lecture_content3 = gr.Textbox(label="๊ฐ•์˜ ๋‚ด์šฉ", lines=10, elem_id="lecture_content3")
        gr.Markdown("## ๊ฐ•์˜ ๋‚ด์šฉ ํ•ฉ์น˜๊ธฐ")
        merged_content = gr.Textbox(label="์ „์ฒด ๊ฐ•์˜ ๋‚ด์šฉ", lines=10, elem_id="merged_content")
        # handle_fetch_all returns seven values, one per output component below.
        fetch_all_btn.click(
            fn=handle_fetch_all,
            inputs=lecture_out,
            outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content]
        )
        with gr.Row():
            save_btn = gr.Button("ํŒŒ์ผ๋กœ ์ €์žฅ", elem_id="save_btn")
            download_file = gr.File(label="์ €์žฅ๋œ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", interactive=False, elem_id="download_file")
        # Write the merged text to a file and pass its path to the download widget.
        save_btn.click(
            fn=save_merged_to_txt,
            inputs=[subject_out, section_dropdown, merged_content],
            outputs=[download_file]
        )
app.queue()
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    logging.debug("ํ†ตํ•ฉ Gradio ์•ฑ ์‹คํ–‰ ์ค‘")
    app.launch(debug=True)