"""Gradio app that extracts lecture video URLs and transcripts from course HTML.

The app parses a pasted/uploaded course page, groups ``<iframe>`` video URLs
by ``section-N`` list items, downloads the first text track referenced by each
video player page, strips cue numbers/timings from it, and lets the user save
the merged transcript as a ``.txt`` file.
"""

# Load marker; deliberately emitted before the imports so that it appears even
# when one of the imports below fails.
print("### APP.PY LOADED ###")

import logging
import os
import re
import tempfile

import gradio as gr
import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

# ---------- [Module 1: original base code] start ----------
month_mapping = {
    "January": "1월",
    "February": "2월",
    "March": "3월",
    "April": "4월",
    "May": "5월",
    "June": "6월",
    "July": "7월",
    "August": "8월",
    "September": "9월",
    "October": "10월",
    "November": "11월",
    "December": "12월",
}

# Patterns used by the HTML parsing helpers (compiled once at import time).
_SECTION_ID_RE = re.compile(r"^section-\d+")
_SECTION_NUM_RE = re.compile(r"section-(\d+)")
_DATE_RANGE_RE = re.compile(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)')


def convert_date_range(date_range_str):
    """Convert ``"<day> <Month> - <day> <Month>"`` into Korean date-range text.

    Args:
        date_range_str: Text such as ``"3 March - 9 March"``.

    Returns:
        ``"3월 3일 ~ 3월 9일"``-style text. The input is returned unchanged when
        it does not split into exactly two ``-``-separated parts or when either
        part has fewer than two whitespace-separated tokens. Month names not
        present in ``month_mapping`` are kept as-is.
    """
    logging.debug("원본 날짜 범위: %s", date_range_str)
    parts = date_range_str.split('-')
    if len(parts) != 2:
        logging.debug("날짜 범위 형식이 올바르지 않음: %s", date_range_str)
        return date_range_str

    start = parts[0].strip()
    end = parts[1].strip()
    start_parts = start.split()
    end_parts = end.split()
    if len(start_parts) < 2 or len(end_parts) < 2:
        logging.debug("날짜 구성요소 부족: %s, %s", start, end)
        return date_range_str

    start_day, start_month_en = start_parts[0], start_parts[1]
    end_day, end_month_en = end_parts[0], end_parts[1]
    start_month = month_mapping.get(start_month_en, start_month_en)
    end_month = month_mapping.get(end_month_en, end_month_en)

    converted = f"{start_month} {start_day}일 ~ {end_month} {end_day}일"
    logging.debug("변환된 날짜 범위: %s", converted)
    return converted


def _extract_subject_name(soup):
    """Return the stripped text of the first ``<h1>``, or ``""`` if there is none."""
    subject_elem = soup.find("h1")
    if subject_elem:
        subject_name = subject_elem.get_text(strip=True)
        logging.debug("추출된 과목명: %s", subject_name)
        return subject_name
    logging.debug("h1 태그를 찾지 못함")
    return ""


def _iter_course_sections(soup):
    """Yield ``(section_tag, section_id, sec_num)`` for each ``li#section-N`` with N >= 1."""
    section_elements = soup.find_all("li", id=_SECTION_ID_RE)
    logging.debug("찾은 섹션 개수: %d", len(section_elements))
    for section in section_elements:
        section_id = section.get("id")
        logging.debug("처리 중인 섹션 ID: %s", section_id)
        sec_match = _SECTION_NUM_RE.search(section_id)
        if not sec_match:
            continue
        sec_num = int(sec_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 은 스킵")
            continue
        yield section, section_id, sec_num


def _section_date_range(section, section_id):
    """Return the converted date range found in the section's ``h3.sectionname a``, or ``""``."""
    h3_elem = section.find("h3", class_="sectionname")
    link = h3_elem.find("a") if h3_elem else None
    if not link:
        logging.debug("h3 또는 h3 내 a 태그를 찾지 못함 for section: %s", section_id)
        return ""
    header_text = link.get_text(strip=True)
    logging.debug("헤더 텍스트: %s", header_text)
    date_match = _DATE_RANGE_RE.search(header_text)
    if not date_match:
        logging.debug("날짜 범위 패턴 매칭 실패: %s", header_text)
        return ""
    return convert_date_range(date_match.group(1))


def _section_heading(section, section_id, sec_num):
    """Build the display heading: section 1 is "Introduction", section N is week N-1."""
    if sec_num == 1:
        return "섹션 : Introduction"
    heading = f"섹션 : {sec_num - 1}주차"
    date_range_text = _section_date_range(section, section_id)
    return f"{heading} ({date_range_text})" if date_range_text else heading


def _section_video_urls(section, section_id):
    """Return ``[(iframe_position, src), ...]`` for iframes with a non-empty ``src``.

    The position is the 1-based index among *all* iframes of the section, so
    numbering keeps a gap where an iframe has no ``src``.
    """
    iframes = section.find_all("iframe")
    logging.debug("섹션 %s 내 찾은 iframe 개수: %d", section_id, len(iframes))
    videos = []
    for idx, iframe in enumerate(iframes, start=1):
        video_url = iframe.get("src", "").strip()
        if video_url:
            videos.append((idx, video_url))
            logging.debug("추출된 동영상 강의 URL: %s", video_url)
    return videos


def process_html(html_text):
    """Parse course HTML into a subject name and one flat text listing.

    Args:
        html_text: Full page HTML. ``None`` is treated as an empty document.

    Returns:
        ``(subject_name, sections_output)`` where ``sections_output`` contains,
        per section, a heading line, one ``강의N : <url>`` line per video iframe
        and a trailing blank line.
    """
    logging.debug("입력 HTML 처리 시작")
    soup = BeautifulSoup(html_text or "", "html.parser")
    subject_name = _extract_subject_name(soup)

    parts = []
    for section, section_id, sec_num in _iter_course_sections(soup):
        parts.append(_section_heading(section, section_id, sec_num) + "\n")
        parts.extend(
            f"강의{idx} : {video_url}\n"
            for idx, video_url in _section_video_urls(section, section_id)
        )
        parts.append("\n")

    logging.debug("HTML 처리 완료")
    return subject_name, "".join(parts)


def process_html_sections(html_text):
    """Parse course HTML into per-section lecture lists for the section dropdown.

    Args:
        html_text: Full page HTML. ``None`` is treated as an empty document.

    Returns:
        ``(subject_name, dropdown_update, sections_dict)``.
        ``dropdown_update`` is a ``gr.update`` carrying the section headings as
        choices with the first heading selected (``None`` when there are no
        sections). ``sections_dict`` maps each heading to
        ``(lecture_list_text, url_list)``.
    """
    logging.debug("입력 HTML 처리 시작 (섹션별 분리) len=%d", len(html_text) if html_text else 0)
    # The log line above tolerates None, so the parser call must as well.
    soup = BeautifulSoup(html_text or "", "html.parser")
    subject_name = _extract_subject_name(soup)

    sections_dict = {}
    for section, section_id, sec_num in _iter_course_sections(soup):
        section_label = _section_heading(section, section_id, sec_num)
        videos = _section_video_urls(section, section_id)
        if videos:
            lectures_str = "\n".join(f"강의{idx} : {video_url}" for idx, video_url in videos)
        else:
            lectures_str = "강의가 없습니다"
        sections_dict[section_label] = (lectures_str, [video_url for _, video_url in videos])

    sections_list = list(sections_dict)
    default_val = sections_list[0] if sections_list else None
    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict


def update_lecture_text_only(selected_section, sections_dict):
    """Return the lecture-list text stored for ``selected_section``.

    Returns ``""`` when nothing is selected or no sections have been parsed,
    and the "no lectures" message when the section is unknown or empty.
    """
    if not selected_section or not sections_dict:
        return ""
    lectures_text, _ = sections_dict.get(selected_section, ("", []))
    if not lectures_text:
        lectures_text = "강의가 없습니다"
    logging.debug("update_lecture_text_only - 선택된 섹션: %s", selected_section)
    return lectures_text
# ---------- [Module 1: original base code] end ----------


# ---------- [Module 2: additional code] start ----------
# Fetch helpers report failure by returning a string starting with this prefix
# instead of raising; callers test for it with ``str.startswith``.
_ERROR_PREFIX = "오류 발생:"

# Cue-number lines (SRT style) and cue-timing lines. A timestamp may be
# ``hh:mm:ss``, ``mm:ss`` or either with ``.ttt`` / ``,ttt`` milliseconds, and a
# timing line may carry trailing cue settings (e.g. ``align:start``).
_CUE_INDEX_RE = re.compile(r'^\d+$')
_TIMESTAMP = r'(?:\d+:)?\d{1,2}:\d{2}(?:[.,]\d{3})?'
_CUE_TIMING_RE = re.compile(r'^' + _TIMESTAMP + r'\s*-->\s*' + _TIMESTAMP + r'(?:\s.*)?$')


def _fetch_text(url, start_msg, ok_msg, fail_msg):
    """GET ``url`` and return the body text, or an ``_ERROR_PREFIX`` string on failure."""
    try:
        logging.debug(start_msg, url)
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        text = response.text  # decode once; reused for the log line and the result
        logging.debug(ok_msg, len(text))
        return text
    except Exception as e:
        # Deliberately broad: callers rely on the sentinel string, not exceptions.
        logging.error(fail_msg, e, exc_info=True)
        return _ERROR_PREFIX + " " + str(e)


def fetch_page_source(url):
    """Download the lecture player page; returns ``"오류 발생: ..."`` on failure."""
    return _fetch_text(
        url,
        "강의 페이지를 가져오는 중: %s",
        "페이지 소스를 성공적으로 가져옴 len=%d",
        "페이지 소스 가져오기 오류: %s",
    )


def _normalize_vimeo_url(url_str: str) -> str:
    """Undo JSON escaping (``\\u0026`` and ``\\/``) in a URL taken from page source."""
    if not url_str:
        return ""
    return url_str.replace("\\u0026", "&").replace("\\/", "/")


def create_script_url(lecture_url):
    """Return the URL of the first text track found in the lecture page, or ``""``.

    ``""`` is returned both when the page cannot be fetched and when no
    ``"text_tracks": [{... "url": "..."}]`` entry is present in its source.
    """
    page_source = fetch_page_source(lecture_url)
    if page_source.startswith(_ERROR_PREFIX):
        return ""

    pattern = r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"'
    match = re.search(pattern, page_source)
    if not match:
        logging.debug("페이지 소스에서 스크립트 URL을 찾지 못함")
        return ""

    raw_url = _normalize_vimeo_url(match.group(1))
    # Use absolute URLs as-is (avoids producing "player.vimeo.comhttps...").
    if raw_url.startswith(("https://", "http://")):
        script_url = raw_url
    elif raw_url.startswith("//"):
        script_url = "https:" + raw_url
    else:
        script_url = "https://player.vimeo.com" + raw_url
    logging.debug("스크립트 URL 완성: %s", script_url)
    return script_url


def fetch_script(script_url):
    """Download the text-track file; returns ``"오류 발생: ..."`` on failure."""
    return _fetch_text(
        script_url,
        "스크립트를 가져오는 중: %s",
        "스크립트를 성공적으로 가져옴 len=%d",
        "스크립트 가져오기 오류: %s",
    )


def remove_timeline(script_text, lecture_number):
    """Strip cue numbers, cue timings and the ``WEBVTT`` header from a subtitle file.

    Args:
        script_text: Raw subtitle text.
        lecture_number: Unused; kept so existing callers keep working.

    Returns:
        The remaining caption lines joined with single spaces, with a space
        inserted after any ``.`` that is directly followed by a non-space
        character.
    """
    valid_lines = []
    for line in script_text.splitlines():
        stripped_line = line.strip()
        if not stripped_line:
            continue
        if _CUE_INDEX_RE.match(stripped_line) or _CUE_TIMING_RE.match(stripped_line):
            continue
        valid_lines.append(stripped_line)

    # Join with a space so the last word of one cue is not glued to the first
    # word of the next.
    cleaned_text = " ".join(valid_lines)
    cleaned_text = re.sub(r'\.(\S)', r'. \1', cleaned_text)
    cleaned_text = re.sub(r'^WEBVTT\s*', '', cleaned_text)
    return cleaned_text


def process_full(lecture_url):
    """Fetch and clean the transcript for one lecture URL.

    Returns:
        The cleaned transcript, ``"스크립트 URL 생성 실패"`` when no text-track URL
        could be determined, or the ``"오류 발생: ..."`` message when the
        download failed.
    """
    script_url = create_script_url(lecture_url)
    if not script_url:
        return "스크립트 URL 생성 실패"
    script_text = fetch_script(script_url)
    # Prefix check only: a transcript that merely contains the phrase is not an error.
    if script_text.startswith(_ERROR_PREFIX):
        return script_text
    return remove_timeline(script_text, 1)
# ---------- [Module 2: additional code] end ----------


# ---------- [Module 3: merge / save] ----------
_LECTURE_LINE_RE = re.compile(r"강의\d+\s*:\s*(.+)")


def merge_contents_global(l1, l2, l3):
    """Join the non-blank lecture texts under ``[강의N]`` headers, separated by blank lines."""
    blocks = []
    for number, text in enumerate((l1, l2, l3), start=1):
        body = (text or "").strip()
        if body:
            blocks.append(f"[강의{number}]\n{body}")
    return "\n\n".join(blocks)


def handle_fetch_all(lecture_list_text):
    """Fetch transcripts for up to the first three ``강의N : <url>`` lines.

    Returns:
        A 7-tuple ``(url1, url2, url3, content1, content2, content3, merged)``;
        missing lectures are represented by empty strings.
    """
    logging.debug("강의 내용 가져오기 버튼 클릭됨. len=%d", len(lecture_list_text) if lecture_list_text else 0)
    urls = []
    for line in (lecture_list_text or "").splitlines():
        m = _LECTURE_LINE_RE.match(line.strip())
        if m:
            urls.append(m.group(1).strip())
    urls = (urls + ["", "", ""])[:3]  # exactly three slots, padded with ""

    lec_contents = [process_full(url) if url else "" for url in urls]
    merged = merge_contents_global(lec_contents[0], lec_contents[1], lec_contents[2])
    return urls[0], urls[1], urls[2], lec_contents[0], lec_contents[1], lec_contents[2], merged


def _sanitize_filename(name: str) -> str:
    """Replace characters in ``\\ / : * ? " < > |`` with ``_``; never returns ``""``."""
    if not name:
        return "output"
    return re.sub(r'[\\/:*?"<>|]+', "_", name).strip() or "output"


def _extract_section_token(section_label: str) -> str:
    """Reduce a section heading to its short name (prefix and ``(...)`` suffix removed)."""
    if not section_label:
        return "섹션"
    s = section_label.strip().replace("섹션 : ", "").strip()
    s = re.sub(r"\s*\(.*\)\s*$", "", s).strip()
    return s if s else "섹션"


def save_merged_to_txt(subject_name, selected_section, merged_text):
    """Write ``merged_text`` to ``<subject>(스크립트)_<section>.txt`` in the temp dir.

    Returns:
        The path of the written file, or ``None`` if writing failed.
    """
    try:
        content = merged_text if merged_text is not None else ""
        subj = subject_name if subject_name else "과목명"
        sec_token = _extract_section_token(selected_section)
        filename = _sanitize_filename(f"{subj}(스크립트)_{sec_token}.txt")
        path = os.path.join(tempfile.gettempdir(), filename)
        with open(path, "w", encoding="utf-8") as f:
            # Save the content exactly as given, without modification.
            f.write(content)
        logging.debug("TXT 저장 완료: %s", path)
        return path
    except Exception as e:
        logging.exception("save_merged_to_txt 오류: %s", e)
        return None


def load_html_from_file(file_obj):
    """Read an uploaded file as UTF-8 text.

    Args:
        file_obj: Object with a ``name`` attribute holding the path, or a
            value whose ``str()`` is the path.

    Returns:
        ``(html_text, status_message)``; ``html_text`` is ``""`` on failure.
    """
    try:
        logging.debug("load_html_from_file 진입")
        if file_obj is None:
            return "", "오류: 업로드된 파일이 없습니다."
        file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        logging.debug("파일에서 HTML 로드: %s", file_path)
        with open(file_path, "r", encoding="utf-8") as f:
            data = f.read()
        return data, f"OK: 파일 로드 성공 (len={len(data)})"
    except Exception as e:
        logging.exception("load_html_from_file 오류: %s", e)
        return "", f"오류: {repr(e)}"


# Front-end/back-end wiring test: status_out must change on click if the
# connection works.
def test_click(_):
    """Log the click and return ``"clicked"``; the argument is ignored."""
    logging.debug("### BUTTON CLICKED (test_click) ###")
    return "clicked"


# ---------- Gradio app ----------
# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source — confirm the intended layout.
with gr.Blocks() as app:
    gr.Markdown("# 캐롤라인대학 강의 추출기 Ver.2.4")
    with gr.Tab("HTML 파싱 및 섹션 선택"):
        with gr.Row():
            with gr.Column():
                html_input = gr.Textbox(label="전체 페이지 HTML 입력", lines=20, placeholder="HTML 코드를 입력하세요...")
                html_file = gr.File(label="HTML 파일 업로드(.txt/.html)", file_types=[".txt", ".html", ".htm"])
                load_btn = gr.Button("업로드 파일을 입력창에 불러오기")
                parse_btn = gr.Button("Submit")
                # Connectivity test button.
                test_btn = gr.Button("연결 테스트(클릭 확인)")
                status_out = gr.Textbox(label="상태/디버그", interactive=False)
            with gr.Column():
                subject_out = gr.Textbox(label="과목명", interactive=False)
                section_dropdown = gr.Dropdown(label="섹션 선택", choices=[], interactive=True)
                lecture_out = gr.Textbox(label="선택한 섹션 강의 목록", lines=10, interactive=False)
                fetch_all_btn = gr.Button("강의 내용 가져오기", elem_id="fetch_all_btn")

        sections_state = gr.State()

        load_btn.click(
            fn=load_html_from_file,
            inputs=[html_file],
            outputs=[html_input, status_out]
        )

        # Connectivity test: status_out should change to "clicked".
        test_btn.click(
            fn=test_click,
            inputs=[html_input],
            outputs=[status_out]
        )

        parse_btn.click(
            fn=process_html_sections,
            inputs=html_input,
            outputs=[subject_out, section_dropdown, sections_state]
        )

        section_dropdown.change(
            fn=update_lecture_text_only,
            inputs=[section_dropdown, sections_state],
            outputs=lecture_out
        )

        gr.Markdown("## 강의 내용 가져오기")
        with gr.Row():
            url1 = gr.Textbox(label="강의1 URL", elem_id="url1")
            url2 = gr.Textbox(label="강의2 URL", elem_id="url2")
            url3 = gr.Textbox(label="강의3 URL", elem_id="url3")
        with gr.Row():
            lecture_content1 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content1")
            lecture_content2 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content2")
            lecture_content3 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content3")

        gr.Markdown("## 강의 내용 합치기")
        merged_content = gr.Textbox(label="전체 강의 내용", lines=10, elem_id="merged_content")

        fetch_all_btn.click(
            fn=handle_fetch_all,
            inputs=lecture_out,
            outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content]
        )

        with gr.Row():
            save_btn = gr.Button("파일로 저장", elem_id="save_btn")
            download_file = gr.File(label="저장된 파일 다운로드", interactive=False, elem_id="download_file")

        save_btn.click(
            fn=save_merged_to_txt,
            inputs=[subject_out, section_dropdown, merged_content],
            outputs=[download_file]
        )

app.queue()

if __name__ == "__main__":
    logging.debug("통합 Gradio 앱 실행 중")
    app.launch(debug=True)