Spaces:
Sleeping
Sleeping
# Startup marker: written to stdout as soon as the module starts executing.
print("### APP.PY LOADED ###")
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| import re | |
| import logging | |
| import requests | |
| import os | |
| import tempfile | |
# Root logger configuration: DEBUG threshold, "<time> - <level> - <message>" lines.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)
# ---------- [Module 1: existing base code] start ----------
# English month name -> month label used when rewriting section date ranges.
# Consumed by convert_date_range(); names that are not listed here are passed
# through unchanged by that function.
month_mapping = dict(
    January="1์",
    February="2์",
    March="3์",
    April="4์",
    May="5์",
    June="6์",
    July="7์",
    August="8์",
    September="9์",
    October="10์",
    November="11์",
    December="12์",
)
def convert_date_range(date_range_str):
    """Rewrite a "D Month - D Month" range using the labels in ``month_mapping``.

    The text is split on "-" into a start side and an end side.  On each side
    the first whitespace-separated token is taken as the day and the second as
    the English month name.  Month names missing from ``month_mapping`` are
    kept as they are.

    Args:
        date_range_str: Text such as "3 March - 9 March".

    Returns:
        The rewritten range, or ``date_range_str`` unchanged when it does not
        split into exactly two sides or either side has fewer than two tokens.
    """
    logging.debug("์๋ณธ ๋ ์ง ๋ฒ์: %s", date_range_str)

    halves = date_range_str.split('-')
    if len(halves) != 2:
        logging.debug("๋ ์ง ๋ฒ์ ํ์์ด ์ฌ๋ฐ๋ฅด์ง ์์: %s", date_range_str)
        return date_range_str

    start, end = (half.strip() for half in halves)
    start_tokens = start.split()
    end_tokens = end.split()
    if min(len(start_tokens), len(end_tokens)) < 2:
        logging.debug("๋ ์ง ๊ตฌ์ฑ์์ ๋ถ์กฑ: %s, %s", start, end)
        return date_range_str

    # Only the first two tokens of each side are used; anything after is ignored.
    start_day, start_month_en = start_tokens[:2]
    end_day, end_month_en = end_tokens[:2]
    start_month = month_mapping.get(start_month_en, start_month_en)
    end_month = month_mapping.get(end_month_en, end_month_en)

    converted = f"{start_month} {start_day}์ผ ~ {end_month} {end_day}์ผ"
    logging.debug("๋ณํ๋ ๋ ์ง ๋ฒ์: %s", converted)
    return converted
def process_html(html_text):
    """Parse a course page and list each section's embedded lecture URLs as text.

    The course title is the text of the first ``<h1>``.  Sections are the
    ``<li>`` elements whose id starts with ``section-<n>``: ``section-0`` is
    skipped, ``section-1`` is labelled "Introduction", and every later section
    is labelled with ``n - 1`` followed by a fixed suffix.  Outside the
    introduction, a "D Month - D Month" range found in the section's
    ``h3.sectionname`` link is converted with ``convert_date_range`` and
    appended to the heading in parentheses.

    Args:
        html_text: HTML source of the course page.

    Returns:
        ``(subject_name, sections_output)``.  ``sections_output`` contains, for
        each section, a heading line, one numbered line per ``<iframe>`` with a
        non-empty ``src``, and a blank separator line.
    """
    logging.debug("์ ๋ ฅ HTML ์ฒ๋ฆฌ ์์")
    soup = BeautifulSoup(html_text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        subject_name = title_tag.get_text(strip=True)
        logging.debug("์ถ์ถ๋ ๊ณผ๋ชฉ๋ช : %s", subject_name)
    else:
        subject_name = ""
        logging.debug("h1 ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ")

    # Output is collected piecewise and joined once at the end.
    pieces = []
    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์ ์น์ ๊ฐ์: %d", len(section_elements))

    for section in section_elements:
        section_id = section.get("id")
        logging.debug("์ฒ๋ฆฌ ์ค์ธ ์น์ ID: %s", section_id)

        id_match = re.search(r"section-(\d+)", section_id)
        if not id_match:
            continue
        sec_num = int(id_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 ์ ์คํต")
            continue

        section_label = "Introduction" if sec_num == 1 else f"{sec_num - 1}์ฃผ์ฐจ"

        # Look for a date range in the link text of the section header.
        date_range_text = ""
        h3_elem = section.find("h3", class_="sectionname")
        header_link = h3_elem.find("a") if h3_elem else None
        if header_link:
            header_text = header_link.get_text(strip=True)
            logging.debug("ํค๋ ํ ์คํธ: %s", header_text)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                date_range_text = convert_date_range(date_match.group(1))
            else:
                logging.debug("๋ ์ง ๋ฒ์ ํจํด ๋งค์นญ ์คํจ: %s", header_text)
        else:
            logging.debug("h3 ๋๋ h3 ๋ด a ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ for section: %s", section_id)

        # The introduction never shows a date range, even when one was found.
        if sec_num != 1 and date_range_text:
            section_heading = f"์น์ : {section_label} ({date_range_text})"
        else:
            section_heading = f"์น์ : {section_label}"
        pieces.append(section_heading + "\n")

        iframes = section.find_all("iframe")
        logging.debug("์น์ %s ๋ด ์ฐพ์ iframe ๊ฐ์: %d", section_id, len(iframes))
        # Numbering follows the iframe position, so an iframe without a src
        # leaves a gap in the lecture numbers.
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if not video_url:
                continue
            pieces.append(f"๊ฐ์{idx} : {video_url}\n")
            logging.debug("์ถ์ถ๋ ๋์์ ๊ฐ์ URL: %s", video_url)
        pieces.append("\n")

    logging.debug("HTML ์ฒ๋ฆฌ ์๋ฃ")
    return subject_name, "".join(pieces)
def process_html_sections(html_text):
    """Parse a course page into per-section lecture lists for the dropdown.

    Sections are the ``<li>`` elements whose id starts with ``section-<n>``;
    ``section-0`` is skipped.  Each remaining section gets a label (the
    introduction for ``section-1``, otherwise ``n - 1`` plus a fixed suffix,
    with a converted date range in parentheses when the header link has one).

    Args:
        html_text: HTML source of the course page.

    Returns:
        A 3-tuple of:
        * the text of the first ``<h1>`` (empty string when absent),
        * ``gr.update(...)`` carrying the section labels as choices and the
          first label (or ``None``) as the value,
        * a dict mapping each label to ``(lecture_text, url_list)``.
    """
    logging.debug("์ ๋ ฅ HTML ์ฒ๋ฆฌ ์์ (์น์ ๋ณ ๋ถ๋ฆฌ) len=%d", len(html_text) if html_text else 0)
    soup = BeautifulSoup(html_text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        subject_name = title_tag.get_text(strip=True)
        logging.debug("์ถ์ถ๋ ๊ณผ๋ชฉ๋ช : %s", subject_name)
    else:
        subject_name = ""
        logging.debug("h1 ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ")

    sections_dict = {}
    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์ ์น์ ๊ฐ์: %d", len(section_elements))

    for section in section_elements:
        section_id = section.get("id")
        logging.debug("์ฒ๋ฆฌ ์ค์ธ ์น์ ID: %s", section_id)

        id_match = re.search(r"section-(\d+)", section_id)
        if not id_match:
            continue
        sec_num = int(id_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 ์ ์คํต")
            continue

        if sec_num == 1:
            section_label = "์น์ : Introduction"
        else:
            section_label = f"์น์ : {sec_num - 1}์ฃผ์ฐจ"

        # Optional date range taken from the section header link.
        date_range_text = ""
        h3_elem = section.find("h3", class_="sectionname")
        header_link = h3_elem.find("a") if h3_elem else None
        if header_link:
            header_text = header_link.get_text(strip=True)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                date_range_text = convert_date_range(date_match.group(1))
        if sec_num != 1 and date_range_text:
            section_label += f" ({date_range_text})"

        iframes = section.find_all("iframe")
        logging.debug("์น์ %s ๋ด ์ฐพ์ iframe ๊ฐ์: %d", section_id, len(iframes))

        lecture_lines = []
        url_list = []
        # Lecture numbers follow iframe position; iframes without src are skipped.
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if video_url:
                lecture_lines.append(f"๊ฐ์{idx} : {video_url}\n")
                url_list.append(video_url)

        # Placeholder text when the section has no usable iframe.
        lectures_str = "".join(lecture_lines) if url_list else "๊ฐ์๊ฐ ์์ต๋๋ค"
        # A later section with the same label replaces the earlier entry.
        sections_dict[section_label] = (lectures_str.strip(), url_list)

    sections_list = list(sections_dict.keys())
    default_val = sections_list[0] if sections_list else None
    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict
def update_lecture_text_only(selected_section, sections_dict):
    """Return the lecture list text stored for the selected section.

    Args:
        selected_section: Label chosen in the dropdown.
        sections_dict: Mapping of label -> ``(lecture_text, url_list)``.

    Returns:
        An empty string when either argument is empty or ``None``; otherwise
        the stored lecture text, or a fixed placeholder message when the label
        is unknown or its text is empty.
    """
    if not (selected_section and sections_dict):
        return ""

    lectures_text, _ = sections_dict.get(selected_section, ("", []))
    lectures_text = lectures_text or "๊ฐ์๊ฐ ์์ต๋๋ค"
    logging.debug("update_lecture_text_only - ์ ํ๋ ์น์ : %s", selected_section)
    return lectures_text
# ---------- [Module 1: existing base code] end ----------
# ---------- [Module 2: additional code] start ----------
def fetch_page_source(url):
    """Download a lecture page and return its body text.

    Args:
        url: Address requested with ``requests.get`` (20 second timeout).

    Returns:
        The response text on success.  On any exception (including a non-2xx
        status raised by ``raise_for_status``) the error is logged and a string
        made of a fixed error prefix plus the exception text is returned
        instead; callers detect failure by that prefix.
    """
    try:
        logging.debug("๊ฐ์ ํ์ด์ง๋ฅผ ๊ฐ์ ธ์ค๋ ์ค: %s", url)
        resp = requests.get(url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("ํ์ด์ง ์์ค๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ๊ฐ์ ธ์ด len=%d", len(body))
        return body
    except Exception as exc:
        logging.error("ํ์ด์ง ์์ค ๊ฐ์ ธ์ค๊ธฐ ์ค๋ฅ: %s", exc, exc_info=True)
        return "์ค๋ฅ ๋ฐ์: " + str(exc)
| def _normalize_vimeo_url(url_str: str) -> str: | |
| if not url_str: | |
| return "" | |
| return url_str.replace("\\u0026", "&").replace("\\/", "/") | |
def create_script_url(lecture_url):
    """Find the first text-track (caption) URL embedded in a lecture page.

    The page is downloaded with ``fetch_page_source`` and searched for the
    ``"url"`` value of the first object in a ``"text_tracks"`` array.  The
    value is unescaped with ``_normalize_vimeo_url`` and resolved against
    ``https://player.vimeo.com/``.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        An absolute caption URL, or an empty string when the page could not be
        fetched or holds no text-track URL.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import urljoin

    page_source = fetch_page_source(lecture_url)
    # fetch_page_source signals failure by returning a string with this prefix.
    if page_source.startswith("์ค๋ฅ ๋ฐ์:"):
        return ""

    pattern = r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"'
    match = re.search(pattern, page_source)
    if not match:
        logging.debug("ํ์ด์ง ์์ค์์ ์คํฌ๋ฆฝํธ URL์ ์ฐพ์ง ๋ชปํจ")
        return ""

    raw_url = _normalize_vimeo_url(match.group(1))
    # urljoin keeps absolute URLs intact, gives "//host/..." the https scheme,
    # and inserts the "/" that plain concatenation omitted for relative paths
    # without a leading slash (which produced "player.vimeo.comfoo").
    script_url = urljoin("https://player.vimeo.com/", raw_url)
    logging.debug("์คํฌ๋ฆฝํธ URL ์์ฑ: %s", script_url)
    return script_url
def fetch_script(script_url):
    """Download a caption/script file and return its text.

    Args:
        script_url: Address requested with ``requests.get`` (20 second timeout).

    Returns:
        The response text on success.  On any exception the error is logged
        and a string made of a fixed error prefix plus the exception text is
        returned instead.
    """
    try:
        logging.debug("์คํฌ๋ฆฝํธ๋ฅผ ๊ฐ์ ธ์ค๋ ์ค: %s", script_url)
        resp = requests.get(script_url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("์คํฌ๋ฆฝํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ๊ฐ์ ธ์ด len=%d", len(body))
        return body
    except Exception as exc:
        logging.error("์คํฌ๋ฆฝํธ ๊ฐ์ ธ์ค๊ธฐ ์ค๋ฅ: %s", exc, exc_info=True)
        return "์ค๋ฅ ๋ฐ์: " + str(exc)
def remove_timeline(script_text, lecture_number):
    """Strip subtitle bookkeeping from a caption file and return running text.

    Blank lines, lines made only of digits (cue counters) and cue timing lines
    ("start --> end") are dropped.  The remaining lines are joined with single
    spaces, a space is inserted after a period that is glued to the following
    text, and a leading "WEBVTT" marker is removed.

    Args:
        script_text: Raw caption text.
        lecture_number: Unused; kept so existing callers keep working.

    Returns:
        The cleaned transcript as one line of text.
    """
    # Each timestamp is mm:ss or hh:mm:ss with optional milliseconds written
    # with "." or ","; anything after the end timestamp (cue settings such as
    # "align:start") is tolerated so those lines are removed as well.
    timestamp = r'\d{1,2}:\d{2}(?::\d{2})?(?:[.,]\d{3})?'
    timing_line = re.compile(r'^' + timestamp + r'\s*-->\s*' + timestamp + r'(?:\s.*)?$')

    kept_lines = []
    for line in script_text.splitlines():
        text = line.strip()
        if not text:
            continue
        if re.fullmatch(r'\d+', text):
            continue
        if timing_line.match(text):
            continue
        kept_lines.append(text)

    # Join with a space: caption lines break between words, so joining with an
    # empty string fused the last word of one line with the first of the next.
    cleaned_text = " ".join(kept_lines)
    # Add a space after a period glued to following text, but leave decimals
    # ("3.14") and runs of periods ("...") alone.
    cleaned_text = re.sub(r'\.(?=[^\s\d.])', '. ', cleaned_text)
    cleaned_text = re.sub(r'^WEBVTT\s*', '', cleaned_text)
    return cleaned_text
def process_full(lecture_url):
    """Turn a lecture page URL into cleaned transcript text.

    Steps: locate the caption URL with ``create_script_url``, download it with
    ``fetch_script``, then clean it with ``remove_timeline``.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        The cleaned transcript; a fixed failure message when no caption URL
        was found; or the error string produced by ``fetch_script`` when the
        download failed.
    """
    script_url = create_script_url(lecture_url)
    if not script_url:
        return "์คํฌ๋ฆฝํธ URL ์์ฑ ์คํจ"

    script_text = fetch_script(script_url)
    # fetch_script reports failure with a string that *starts* with this
    # prefix.  Testing the prefix (as create_script_url does) instead of a
    # substring keeps a transcript that merely contains the phrase from being
    # mistaken for an error and returned uncleaned.
    if script_text.startswith("์ค๋ฅ ๋ฐ์:"):
        return script_text

    return remove_timeline(script_text, 1)
# ---------- [Module 2: additional code] end ----------
# ---------- [Module 3: merge / save] ----------
def merge_contents_global(l1, l2, l3):
    """Combine up to three lecture texts under numbered headers.

    Each text that is non-blank after stripping is emitted as its header line
    followed by the stripped text; emitted entries are separated by one blank
    line.  Blank texts are omitted altogether.

    Args:
        l1: Text of lecture 1.
        l2: Text of lecture 2.
        l3: Text of lecture 3.

    Returns:
        The merged text, or an empty string when all three are blank.
    """
    labelled = (
        ("[๊ฐ์1]\n", l1),
        ("[๊ฐ์2]\n", l2),
        ("[๊ฐ์3]\n", l3),
    )
    return "\n\n".join(
        header + body.strip() for header, body in labelled if body.strip()
    )
def handle_fetch_all(lecture_list_text):
    """Fetch transcripts for the first three lectures listed in the text.

    Lines of the form "<prefix><number> : <url>" are recognised; the first
    three URLs are kept and the list is padded with empty strings to exactly
    three entries.  Every non-empty URL is run through ``process_full``.

    Args:
        lecture_list_text: Multi-line lecture list; may be empty or ``None``.

    Returns:
        A 7-tuple: the three URLs, the three transcript texts, and the result
        of ``merge_contents_global`` over those transcripts.
    """
    logging.debug("๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ ๋ฒํผ ํด๋ฆญ๋จ. len=%d", len(lecture_list_text) if lecture_list_text else 0)

    line_pattern = re.compile(r"๊ฐ์\d+\s*:\s*(.+)")
    found = (
        line_pattern.match(raw_line.strip())
        for raw_line in (lecture_list_text or "").splitlines()
    )
    urls = [hit.group(1).strip() for hit in found if hit][:3]
    urls += [""] * (3 - len(urls))  # always exactly three slots

    lec_contents = [process_full(url) if url else "" for url in urls]
    merged = merge_contents_global(*lec_contents)
    return (*urls, *lec_contents, merged)
| def _sanitize_filename(name: str) -> str: | |
| if not name: | |
| return "output" | |
| return re.sub(r'[\\/:*?"<>|]+', "_", name).strip() | |
| def _extract_section_token(section_label: str) -> str: | |
| if not section_label: | |
| return "์น์ " | |
| s = section_label.strip().replace("์น์ : ", "").strip() | |
| s = re.sub(r"\s*\(.*\)\s*$", "", s).strip() | |
| return s if s else "์น์ " | |
def save_merged_to_txt(subject_name, selected_section, merged_text):
    """Write the merged transcript to a UTF-8 text file in the temp directory.

    The file name is built from the subject name (a fixed placeholder when
    empty) and the token from ``_extract_section_token``, then passed through
    ``_sanitize_filename``.

    Args:
        subject_name: Course title used in the file name.
        selected_section: Section label used in the file name.
        merged_text: Text to write; ``None`` is written as an empty file.

    Returns:
        The path of the written file, or ``None`` when anything failed (the
        exception is logged).
    """
    try:
        content = "" if merged_text is None else merged_text
        subj = subject_name or "๊ณผ๋ชฉ๋ช "
        sec_token = _extract_section_token(selected_section)
        filename = _sanitize_filename(f"{subj}(์คํฌ๋ฆฝํธ)_{sec_token}.txt")
        path = os.path.join(tempfile.gettempdir(), filename)
        # The content is written exactly as received, without modification.
        with open(path, "w", encoding="utf-8") as out_file:
            out_file.write(content)
        logging.debug("TXT ์ ์ฅ ์๋ฃ: %s", path)
        return path
    except Exception as exc:
        logging.exception("save_merged_to_txt ์ค๋ฅ: %s", exc)
        return None
def load_html_from_file(file_obj):
    """Read an uploaded file as UTF-8 text for the HTML input box.

    Args:
        file_obj: Either an object exposing the path as ``.name`` or a value
            whose ``str()`` is the path; ``None`` means nothing was uploaded.

    Returns:
        ``(data, status)``.  On success ``data`` is the file content and
        ``status`` starts with "OK: " and reports the length.  When
        ``file_obj`` is ``None`` or reading fails, ``data`` is an empty string
        and ``status`` carries an error message (failures are logged).
    """
    try:
        logging.debug("load_html_from_file ์ง์ ")
        if file_obj is None:
            return "", "์ค๋ฅ: ์ ๋ก๋๋ ํ์ผ์ด ์์ต๋๋ค."

        # Prefer the .name attribute; fall back to the string form of the object.
        try:
            file_path = file_obj.name
        except AttributeError:
            file_path = str(file_obj)

        logging.debug("ํ์ผ์์ HTML ๋ก๋: %s", file_path)
        with open(file_path, "r", encoding="utf-8") as src:
            data = src.read()
        return data, f"OK: ํ์ผ ๋ก๋ ์ฑ๊ณต (len={len(data)})"
    except Exception as exc:
        logging.exception("load_html_from_file ์ค๋ฅ: %s", exc)
        return "", f"์ค๋ฅ: {repr(exc)}"
# Front-end/back-end wiring check: when the connection works, a click must change status_out.
def test_click(_):
    """Connectivity probe for the UI: log the click and return a fixed marker.

    Args:
        _: Ignored; present only because the click handler is wired with one
            input component.

    Returns:
        The string ``"clicked"``.
    """
    marker = "clicked"
    logging.debug("### BUTTON CLICKED (test_click) ###")
    return marker
# ---------- Gradio app ----------
# UI definition: components first, then the event wiring that connects them to
# the handler functions defined earlier in this file.
# NOTE(review): the container nesting below was reconstructed from a copy of
# this file that had lost its indentation -- confirm which components belong
# inside the Tab / Row / Column blocks before relying on the exact layout.
with gr.Blocks() as app:
    gr.Markdown("# ์บ๋กค๋ผ์ธ๋ํ ๊ฐ์ ์ถ์ถ๊ธฐ Ver.2.4")

    with gr.Tab("HTML ํ์ฑ ๋ฐ ์น์ ์ ํ"):
        with gr.Row():
            # Left column: HTML input, file upload and action buttons.
            with gr.Column():
                html_input = gr.Textbox(
                    label="์ ์ฒด ํ์ด์ง HTML ์ ๋ ฅ",
                    lines=20,
                    placeholder="HTML ์ฝ๋๋ฅผ ์ ๋ ฅํ์ธ์...",
                )
                html_file = gr.File(
                    label="HTML ํ์ผ ์ ๋ก๋(.txt/.html)",
                    file_types=[".txt", ".html", ".htm"],
                )
                load_btn = gr.Button("์ ๋ก๋ ํ์ผ์ ์ ๋ ฅ์ฐฝ์ ๋ถ๋ฌ์ค๊ธฐ")
                parse_btn = gr.Button("Submit")
                # Connectivity test button and the status box it updates.
                test_btn = gr.Button("์ฐ๊ฒฐ ํ ์คํธ(ํด๋ฆญ ํ์ธ)")
                status_out = gr.Textbox(label="์ํ/๋๋ฒ๊ทธ", interactive=False)

            # Right column: parse results and the fetch trigger.
            with gr.Column():
                subject_out = gr.Textbox(label="๊ณผ๋ชฉ๋ช ", interactive=False)
                section_dropdown = gr.Dropdown(label="์น์ ์ ํ", choices=[], interactive=True)
                lecture_out = gr.Textbox(label="์ ํํ ์น์ ๊ฐ์ ๋ชฉ๋ก", lines=10, interactive=False)
                fetch_all_btn = gr.Button("๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ", elem_id="fetch_all_btn")

        # Holds the dict returned as the third output of process_html_sections.
        sections_state = gr.State()

        # Uploaded file -> HTML textbox plus a status message.
        load_btn.click(fn=load_html_from_file, inputs=[html_file], outputs=[html_input, status_out])

        # Connectivity test: status_out must change to "clicked".
        test_btn.click(fn=test_click, inputs=[html_input], outputs=[status_out])

        # Parse the HTML into subject name, dropdown choices and section data.
        parse_btn.click(
            fn=process_html_sections,
            inputs=html_input,
            outputs=[subject_out, section_dropdown, sections_state],
        )

        # Show the lecture list of whichever section is selected.
        section_dropdown.change(
            fn=update_lecture_text_only,
            inputs=[section_dropdown, sections_state],
            outputs=lecture_out,
        )

        gr.Markdown("## ๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ")
        with gr.Row():
            url1 = gr.Textbox(label="๊ฐ์1 URL", elem_id="url1")
            url2 = gr.Textbox(label="๊ฐ์2 URL", elem_id="url2")
            url3 = gr.Textbox(label="๊ฐ์3 URL", elem_id="url3")
        with gr.Row():
            lecture_content1 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content1")
            lecture_content2 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content2")
            lecture_content3 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content3")

        gr.Markdown("## ๊ฐ์ ๋ด์ฉ ํฉ์น๊ธฐ")
        merged_content = gr.Textbox(label="์ ์ฒด ๊ฐ์ ๋ด์ฉ", lines=10, elem_id="merged_content")

        # Fetch up to three transcripts and fill the URL, content and merged boxes.
        fetch_all_btn.click(
            fn=handle_fetch_all,
            inputs=lecture_out,
            outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content],
        )

        with gr.Row():
            save_btn = gr.Button("ํ์ผ๋ก ์ ์ฅ", elem_id="save_btn")
            download_file = gr.File(label="์ ์ฅ๋ ํ์ผ ๋ค์ด๋ก๋", interactive=False, elem_id="download_file")

        # Write the merged text to a file and offer it for download.
        save_btn.click(
            fn=save_merged_to_txt,
            inputs=[subject_out, section_dropdown, merged_content],
            outputs=[download_file],
        )

app.queue()
# Script entry point: launch the Gradio app only when run directly.
if __name__ == "__main__":
    logging.debug("ํตํฉ Gradio ์ฑ ์คํ ์ค")
    app.launch(debug=True)