# Spaces:

# unnastyle
# /

# caroline_script_Ver.3.2

# Sleeping

# caroline_script_Ver.3.2

# File size: 16,671 Bytes

# 80b36c6

# Write a marker to stdout as soon as this module starts executing
# (this runs before the imports below).
print("### APP.PY LOADED ###")

import gradio as gr
from bs4 import BeautifulSoup
import re
import logging
import requests
import os
import tempfile

# Configure the root logger: DEBUG level, each record rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

# ---------- [모듈1: 기존 기본코드] 시작 ----------
# Full English month name -> Korean month label ("<N>월").
# Looked up by convert_date_range(); keys are matched exactly (case-sensitive,
# no abbreviations), and names that are not listed are left untranslated there.
month_mapping = {
    "January": "1월",
    "February": "2월",
    "March": "3월",
    "April": "4월",
    "May": "5월",
    "June": "6월",
    "July": "7월",
    "August": "8월",
    "September": "9월",
    "October": "10월",
    "November": "11월",
    "December": "12월"
}

def convert_date_range(date_range_str):
    """Convert an English "D Month - D Month" range into Korean notation.

    The text is split on "-" into a start half and an end half.  The first
    token of each half is taken as the day and the second as the month name,
    which is translated through ``month_mapping`` (names missing from the
    mapping are kept as they are).

    Args:
        date_range_str: Text such as ``"3 March - 9 March"``.

    Returns:
        ``"<month> <day>일 ~ <month> <day>일"``, or ``date_range_str`` unchanged
        when it does not split into exactly two halves or when either half has
        fewer than two whitespace-separated tokens.
    """
    logging.debug("원본 날짜 범위: %s", date_range_str)

    halves = date_range_str.split('-')
    if len(halves) != 2:
        logging.debug("날짜 범위 형식이 올바르지 않음: %s", date_range_str)
        return date_range_str

    start, end = (half.strip() for half in halves)
    start_tokens, end_tokens = start.split(), end.split()
    if min(len(start_tokens), len(end_tokens)) < 2:
        logging.debug("날짜 구성요소 부족: %s, %s", start, end)
        return date_range_str

    # Only the first two tokens of each half are used: day, then month name.
    start_day, start_month_en = start_tokens[:2]
    end_day, end_month_en = end_tokens[:2]

    start_month = month_mapping.get(start_month_en, start_month_en)
    end_month = month_mapping.get(end_month_en, end_month_en)

    converted = f"{start_month} {start_day}일 ~ {end_month} {end_day}일"
    logging.debug("변환된 날짜 범위: %s", converted)
    return converted

def process_html(html_text):
    """Parse a course page and render all sections as one text listing.

    The subject name is the text of the first ``<h1>``.  Every
    ``<li id="section-N">`` with N >= 1 becomes a heading line
    ("섹션 : Introduction" for N == 1, "섹션 : <N-1>주차" otherwise, with the
    converted date range appended in parentheses when one is found in the
    section's ``h3.sectionname > a`` text), followed by one "강의<i> : <url>"
    line per ``<iframe>`` that has a non-empty ``src``, and a blank line.

    Args:
        html_text: HTML source of the course page.

    Returns:
        A tuple ``(subject_name, sections_output)``.
    """
    logging.debug("입력 HTML 처리 시작")
    soup = BeautifulSoup(html_text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        subject_name = title_tag.get_text(strip=True)
        logging.debug("추출된 과목명: %s", subject_name)
    else:
        subject_name = ""
        logging.debug("h1 태그를 찾지 못함")

    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("찾은 섹션 개수: %d", len(section_elements))

    # Collect output pieces and join once at the end.
    pieces = []
    for section in section_elements:
        section_id = section.get("id")
        logging.debug("처리 중인 섹션 ID: %s", section_id)

        id_match = re.search(r"section-(\d+)", section_id)
        if not id_match:
            continue
        sec_num = int(id_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 은 스킵")
            continue

        # section-1 is the introduction; section-N (N >= 2) is week N-1.
        section_label = "Introduction" if sec_num == 1 else f"{sec_num - 1}주차"

        date_range_text = ""
        h3_elem = section.find("h3", class_="sectionname")
        header_link = h3_elem.find("a") if h3_elem else None
        if header_link:
            header_text = header_link.get_text(strip=True)
            logging.debug("헤더 텍스트: %s", header_text)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                date_range_text = convert_date_range(date_match.group(1))
            else:
                logging.debug("날짜 범위 패턴 매칭 실패: %s", header_text)
        else:
            logging.debug("h3 또는 h3 내 a 태그를 찾지 못함 for section: %s", section_id)

        # The introduction never shows a date range, even if one was parsed.
        if sec_num != 1 and date_range_text:
            section_heading = f"섹션 : {section_label} ({date_range_text})"
        else:
            section_heading = f"섹션 : {section_label}"
        pieces.append(section_heading + "\n")

        iframes = section.find_all("iframe")
        logging.debug("섹션 %s 내 찾은 iframe 개수: %d", section_id, len(iframes))
        # Numbering follows the iframe position, so an iframe without a src
        # still consumes a number.
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if not video_url:
                continue
            pieces.append(f"강의{idx} : {video_url}\n")
            logging.debug("추출된 동영상 강의 URL: %s", video_url)
        pieces.append("\n")

    logging.debug("HTML 처리 완료")
    return subject_name, "".join(pieces)

def process_html_sections(html_text):
    """Parse a course page and split its lectures per section for the UI.

    The subject name is the text of the first ``<h1>``.  Every
    ``<li id="section-N">`` with N >= 1 is labelled "섹션 : Introduction"
    (N == 1) or "섹션 : <N-1>주차" (N >= 2); for N >= 2 the converted date
    range from the section's ``h3.sectionname > a`` text is appended in
    parentheses when present.  For each section the ``src`` of every
    ``<iframe>`` is collected.

    Args:
        html_text: HTML source of the course page.  ``None`` or an empty
            string is treated as an empty document.

    Returns:
        A 3-tuple ``(subject_name, dropdown_update, sections_dict)`` where
        ``dropdown_update`` is a ``gr.update`` carrying the section labels as
        choices (first label preselected, or ``None`` when there are none) and
        ``sections_dict`` maps each label to ``(lectures_text, url_list)``.
        ``lectures_text`` is "강의가 없습니다" when the section has no usable
        iframe.
    """
    logging.debug("입력 HTML 처리 시작 (섹션별 분리) len=%d", len(html_text) if html_text else 0)
    # The log line above already tolerates a missing value; give the parser
    # the same tolerance so that it always receives a string.
    soup = BeautifulSoup(html_text or "", "html.parser")

    subject_elem = soup.find("h1")
    subject_name = ""
    if subject_elem:
        subject_name = subject_elem.get_text(strip=True)
        logging.debug("추출된 과목명: %s", subject_name)
    else:
        logging.debug("h1 태그를 찾지 못함")

    sections_dict = {}
    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("찾은 섹션 개수: %d", len(section_elements))

    for section in section_elements:
        section_id = section.get("id")
        logging.debug("처리 중인 섹션 ID: %s", section_id)
        sec_match = re.search(r"section-(\d+)", section_id)
        if not sec_match:
            continue
        sec_num = int(sec_match.group(1))
        if sec_num == 0:
            logging.debug("section-0 은 스킵")
            continue

        # section-1 is the introduction; section-N (N >= 2) is week N-1.
        if sec_num == 1:
            section_label = "섹션 : Introduction"
        else:
            week_num = sec_num - 1
            section_label = f"섹션 : {week_num}주차"

        h3_elem = section.find("h3", class_="sectionname")
        date_range_text = ""
        if h3_elem and h3_elem.find("a"):
            header_text = h3_elem.find("a").get_text(strip=True)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                raw_date_range = date_match.group(1)
                date_range_text = convert_date_range(raw_date_range)

        # The introduction never shows a date range.
        if sec_num != 1 and date_range_text:
            section_label += f" ({date_range_text})"

        lecture_lines = []
        url_list = []
        iframes = section.find_all("iframe")
        logging.debug("섹션 %s 내 찾은 iframe 개수: %d", section_id, len(iframes))
        # Numbering follows the iframe position, so an iframe without a src
        # still consumes a number.
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if video_url:
                lecture_lines.append(f"강의{idx} : {video_url}")
                url_list.append(video_url)

        lectures_str = "\n".join(lecture_lines) if url_list else "강의가 없습니다"

        # Labels are the dictionary keys, so they double as dropdown choices.
        sections_dict[section_label] = (lectures_str.strip(), url_list)

    sections_list = list(sections_dict)
    default_val = sections_list[0] if sections_list else None
    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict

def update_lecture_text_only(selected_section, sections_dict):
    """Return the lecture listing stored for the selected section.

    Args:
        selected_section: Section label chosen in the dropdown.
        sections_dict: Mapping of label -> ``(lectures_text, url_list)``.

    Returns:
        ``""`` when either argument is empty/``None``; otherwise the stored
        lecture text, or "강의가 없습니다" when the label is unknown or its
        text is empty.
    """
    if not (selected_section and sections_dict):
        return ""

    lectures_text, _urls = sections_dict.get(selected_section, ("", []))
    lectures_text = lectures_text or "강의가 없습니다"

    logging.debug("update_lecture_text_only - 선택된 섹션: %s", selected_section)
    return lectures_text
# ---------- [모듈1: 기존 기본코드] 끝 ----------

# ---------- [모듈2: 추가코드] 시작 ----------
def fetch_page_source(url):
    """Download ``url`` with a 20-second timeout and return the response text.

    Failures are not raised: any exception (including a non-2xx status
    reported by ``raise_for_status``) is logged with its traceback and
    reported through the return value.

    Args:
        url: Address of the lecture page.

    Returns:
        The response body as text, or a string starting with "오류 발생: "
        followed by the exception message.
    """
    try:
        logging.debug("강의 페이지를 가져오는 중: %s", url)
        resp = requests.get(url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("페이지 소스를 성공적으로 가져옴 len=%d", len(body))
        return body
    except Exception as exc:
        logging.error("페이지 소스 가져오기 오류: %s", exc, exc_info=True)
        return f"오류 발생: {exc!s}"

def _normalize_vimeo_url(url_str: str) -> str:
    """Undo the two escapes handled here: ``\\u0026`` -> ``&`` and ``\\/`` -> ``/``.

    Args:
        url_str: URL text as it appears in the page source.

    Returns:
        The unescaped URL, or ``""`` when ``url_str`` is empty or ``None``.
    """
    if not url_str:
        return ""

    unescaped = url_str
    # Applied in this order: ampersand escape first, then escaped slashes.
    for escaped, plain in (("\\u0026", "&"), ("\\/", "/")):
        unescaped = unescaped.replace(escaped, plain)
    return unescaped

def create_script_url(lecture_url):
    """Find the transcript (text track) URL inside a lecture player page.

    The page is downloaded with ``fetch_page_source`` and searched for the
    ``"url"`` value of the first object in its ``"text_tracks"`` array.  The
    value is unescaped and turned into an absolute URL:

    * ``http://`` / ``https://`` values are used as they are,
    * protocol-relative values (``//host/...``) get an ``https:`` scheme,
    * anything else is treated as a path on ``https://player.vimeo.com``.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        The absolute transcript URL, or ``""`` when the page could not be
        fetched or no text track was found.
    """
    page_source = fetch_page_source(lecture_url)
    # fetch_page_source reports failures through this prefix.
    if page_source.startswith("오류 발생:"):
        return ""

    pattern = r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"'
    match = re.search(pattern, page_source)
    if not match:
        logging.debug("페이지 소스에서 스크립트 URL을 찾지 못함")
        return ""

    raw_url = _normalize_vimeo_url(match.group(1))

    # Keep absolute URLs untouched so the host is never prepended twice.
    if raw_url.startswith(("https://", "http://")):
        script_url = raw_url
    elif raw_url.startswith("//"):
        script_url = "https:" + raw_url
    else:
        # Insert exactly one "/" between host and path, so a relative path
        # without a leading slash is not glued onto the host name.
        script_url = "https://player.vimeo.com/" + raw_url.lstrip("/")

    logging.debug("스크립트 URL 완성: %s", script_url)
    return script_url

def fetch_script(script_url):
    """Download the transcript at ``script_url`` with a 20-second timeout.

    Failures are not raised: any exception (including a non-2xx status
    reported by ``raise_for_status``) is logged with its traceback and
    reported through the return value.

    Args:
        script_url: Absolute URL of the transcript file.

    Returns:
        The response body as text, or a string starting with "오류 발생: "
        followed by the exception message.
    """
    try:
        logging.debug("스크립트를 가져오는 중: %s", script_url)
        resp = requests.get(script_url, timeout=20)
        resp.raise_for_status()
        body = resp.text
        logging.debug("스크립트를 성공적으로 가져옴 len=%d", len(body))
        return body
    except Exception as exc:
        logging.error("스크립트 가져오기 오류: %s", exc, exc_info=True)
        return f"오류 발생: {exc!s}"

def remove_timeline(script_text, lecture_number):
    """Strip subtitle bookkeeping from a transcript and return running text.

    Dropped lines: blank lines, lines made only of digits (cue numbers) and
    cue timing lines.  A timing line is any line containing the ``-->``
    separator, which covers ``hh:mm:ss.ttt``, the short ``mm:ss.ttt`` form,
    comma-separated milliseconds and timings followed by cue settings.

    The remaining caption lines are joined with single spaces so that words
    at line boundaries are not fused together.  A space is then inserted
    after any "." that is directly followed by a non-space character, and a
    leading "WEBVTT" header is removed.

    Args:
        script_text: Raw subtitle text; ``None`` is treated as empty.
        lecture_number: Unused; kept so existing callers keep working.

    Returns:
        The cleaned transcript as a single line of text.
    """
    caption_lines = []
    for raw_line in (script_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Cue sequence numbers.
        if re.fullmatch(r'\d+', line):
            continue
        # Cue timing lines, whatever timestamp flavour or settings they carry.
        if "-->" in line:
            continue
        caption_lines.append(line)

    cleaned_text = " ".join(caption_lines)
    cleaned_text = re.sub(r'\.(\S)', r'. \1', cleaned_text)
    cleaned_text = re.sub(r'^WEBVTT\s*', '', cleaned_text)
    return cleaned_text

def process_full(lecture_url):
    """Run the whole pipeline for one lecture: locate, download, clean.

    Args:
        lecture_url: Address of the lecture player page.

    Returns:
        The cleaned transcript text.  On failure the return value is
        "스크립트 URL 생성 실패" when no transcript URL could be derived, or
        the "오류 발생: ..." message produced by ``fetch_script``.
    """
    script_url = create_script_url(lecture_url)
    if not script_url:
        return "스크립트 URL 생성 실패"

    script_text = fetch_script(script_url)
    # fetch_script reports failures through this prefix.  Test the prefix
    # only, so that a transcript which merely contains the phrase is not
    # mistaken for an error.
    if script_text.startswith("오류 발생:"):
        return script_text

    return remove_timeline(script_text, 1)
# ---------- [모듈2: 추가코드] 끝 ----------

# ---------- [모듈3: 병합/저장] ----------
def merge_contents_global(l1, l2, l3):
    """Combine up to three lecture texts into one labelled document.

    Each non-blank text is stripped and preceded by its "[강의N]" label on a
    separate line; the resulting blocks are separated by one empty line.
    Blank texts are skipped, and their label is not emitted.

    Args:
        l1: Text of lecture 1.
        l2: Text of lecture 2.
        l3: Text of lecture 3.

    Returns:
        The merged text, or ``""`` when all three inputs are blank.
    """
    labelled = (("[강의1]", l1), ("[강의2]", l2), ("[강의3]", l3))
    blocks = [
        f"{label}\n{text.strip()}"
        for label, text in labelled
        if text.strip()
    ]
    return "\n\n".join(blocks)

def handle_fetch_all(lecture_list_text):
    """Fetch the transcripts for the first three lectures of a listing.

    Lines of the form "강의<number> : <url>" are recognised; other lines are
    ignored.  At most three URLs are used, and missing slots are filled with
    empty strings.  Each non-empty URL is processed with ``process_full``.

    Args:
        lecture_list_text: Listing text shown for the selected section; may
            be ``None`` or empty.

    Returns:
        A 7-tuple: the three URLs, the three transcript texts, and the merged
        text produced by ``merge_contents_global``.
    """
    logging.debug("강의 내용 가져오기 버튼 클릭됨. len=%d", len(lecture_list_text) if lecture_list_text else 0)

    entry_pattern = re.compile(r"강의\d+\s*:\s*(.+)")
    candidates = (
        entry_pattern.match(line.strip())
        for line in (lecture_list_text or "").splitlines()
    )
    found = [hit.group(1).strip() for hit in candidates if hit]

    # Exactly three slots: truncate extras, pad the rest with "".
    urls = (found + [""] * 3)[:3]

    lec_contents = [process_full(url) if url else "" for url in urls]
    merged = merge_contents_global(*lec_contents)
    return (*urls, *lec_contents, merged)

def _sanitize_filename(name: str) -> str:
    """Make ``name`` usable as a file name.

    Every run of the characters ``\\ / : * ? " < > |`` is replaced by a single
    underscore, and surrounding whitespace is removed.

    Args:
        name: Proposed file name.

    Returns:
        The sanitised name, or "output" when ``name`` is empty or ``None``.
    """
    if name:
        replaced = re.sub(r'[\\/:*?"<>|]+', "_", name)
        return replaced.strip()
    return "output"

def _extract_section_token(section_label: str) -> str:
    """Reduce a dropdown label to its short section name.

    The "섹션 : " prefix text is removed, and a trailing parenthesised part
    (such as the date range) is cut off.

    Args:
        section_label: Label such as "섹션 : 3주차 (3월 1일 ~ 3월 7일)".

    Returns:
        The short name (e.g. "3주차"), or "섹션" when the label is empty,
        ``None`` or nothing is left after trimming.
    """
    if not section_label:
        return "섹션"

    without_prefix = section_label.strip().replace("섹션 : ", "")
    token = re.sub(r"\s*\(.*\)\s*$", "", without_prefix.strip())
    token = token.strip()
    return token or "섹션"

def save_merged_to_txt(subject_name, selected_section, merged_text):
    """Write the merged transcript to a UTF-8 text file in the temp directory.

    The file is named "<subject>(스크립트)_<section>.txt" after sanitising;
    "과목명" stands in for a missing subject name.  The text is written
    exactly as received (``None`` is written as an empty file).

    Args:
        subject_name: Course name used in the file name.
        selected_section: Dropdown label from which the section part of the
            file name is derived.
        merged_text: Content to store.

    Returns:
        The path of the written file, or ``None`` when anything fails (the
        exception is logged).
    """
    try:
        content = "" if merged_text is None else merged_text
        subj = subject_name or "과목명"
        sec_token = _extract_section_token(selected_section)
        filename = _sanitize_filename(f"{subj}(스크립트)_{sec_token}.txt")
        path = os.path.join(tempfile.gettempdir(), filename)

        # The content is stored verbatim, without any modification.
        with open(path, "w", encoding="utf-8") as out_file:
            out_file.write(content)

        logging.debug("TXT 저장 완료: %s", path)
        return path
    except Exception as exc:
        logging.exception("save_merged_to_txt 오류: %s", exc)
        return None

def load_html_from_file(file_obj):
    """Read an uploaded file as UTF-8 text for the HTML input box.

    Args:
        file_obj: Either an object exposing the file path as ``.name`` or a
            value whose ``str()`` is the path; ``None`` means nothing was
            uploaded.

    Returns:
        A tuple ``(text, status)``.  On success ``status`` starts with "OK:"
        and reports the text length; otherwise ``text`` is ``""`` and
        ``status`` starts with "오류:" (any exception is logged and its
        ``repr`` is included in the status).
    """
    try:
        logging.debug("load_html_from_file 진입")
        if file_obj is None:
            return "", "오류: 업로드된 파일이 없습니다."

        if hasattr(file_obj, "name"):
            file_path = file_obj.name
        else:
            file_path = str(file_obj)
        logging.debug("파일에서 HTML 로드: %s", file_path)

        with open(file_path, "r", encoding="utf-8") as src:
            data = src.read()
        return data, f"OK: 파일 로드 성공 (len={len(data)})"
    except Exception as exc:
        logging.exception("load_html_from_file 오류: %s", exc)
        return "", f"오류: {exc!r}"

# Front-end/back-end connectivity check: after a click, status_out must change;
# if it does not, the event wiring is not working.
def test_click(_):
    """Log that the test button was clicked and return the marker "clicked".

    The single argument is ignored; it only exists because the click event
    is wired with one input component.
    """
    logging.debug("### BUTTON CLICKED (test_click) ###")
    return "clicked"


# ---------- Gradio 앱 ----------
# Build the UI: one tab for parsing the course HTML and choosing a section,
# followed by the transcript fetch / merge / save area.
with gr.Blocks() as app:
    gr.Markdown("# 캐롤라인대학 강의 추출기 Ver.2.4")

    with gr.Tab("HTML 파싱 및 섹션 선택"):
        with gr.Row():
            # Left column: HTML input (pasted or loaded from a file) and actions.
            with gr.Column():
                html_input = gr.Textbox(label="전체 페이지 HTML 입력", lines=20, placeholder="HTML 코드를 입력하세요...")

                html_file = gr.File(label="HTML 파일 업로드(.txt/.html)", file_types=[".txt", ".html", ".htm"])
                load_btn = gr.Button("업로드 파일을 입력창에 불러오기")

                parse_btn = gr.Button("Submit")

                # Connectivity test button
                test_btn = gr.Button("연결 테스트(클릭 확인)")
                status_out = gr.Textbox(label="상태/디버그", interactive=False)

            # Right column: parse results and the trigger for fetching transcripts.
            with gr.Column():
                subject_out = gr.Textbox(label="과목명", interactive=False)
                section_dropdown = gr.Dropdown(label="섹션 선택", choices=[], interactive=True)
                lecture_out = gr.Textbox(label="선택한 섹션 강의 목록", lines=10, interactive=False)
                fetch_all_btn = gr.Button("강의 내용 가져오기", elem_id="fetch_all_btn")

        # Holds the sections_dict returned by process_html_sections between events.
        sections_state = gr.State()

        # Copy the uploaded file's text into the HTML input box and report the status.
        load_btn.click(
            fn=load_html_from_file,
            inputs=[html_file],
            outputs=[html_input, status_out]
        )

        # Connectivity test: status_out must change to "clicked"
        test_btn.click(
            fn=test_click,
            inputs=[html_input],
            outputs=[status_out]
        )

        # Parse the HTML: fills the subject name, the dropdown choices and the state.
        parse_btn.click(
            fn=process_html_sections,
            inputs=html_input,
            outputs=[subject_out, section_dropdown, sections_state]
        )

        # Show the lecture listing of whichever section is selected.
        section_dropdown.change(
            fn=update_lecture_text_only,
            inputs=[section_dropdown, sections_state],
            outputs=lecture_out
        )

    gr.Markdown("## 강의 내용 가져오기")
    with gr.Row():
        url1 = gr.Textbox(label="강의1 URL", elem_id="url1")
        url2 = gr.Textbox(label="강의2 URL", elem_id="url2")
        url3 = gr.Textbox(label="강의3 URL", elem_id="url3")

    with gr.Row():
        lecture_content1 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content1")
        lecture_content2 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content2")
        lecture_content3 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content3")

    gr.Markdown("## 강의 내용 합치기")
    merged_content = gr.Textbox(label="전체 강의 내용", lines=10, elem_id="merged_content")

    # The output order must match the 7-tuple returned by handle_fetch_all:
    # three URLs, three transcripts, merged text.
    fetch_all_btn.click(
        fn=handle_fetch_all,
        inputs=lecture_out,
        outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content]
    )

    with gr.Row():
        save_btn = gr.Button("파일로 저장", elem_id="save_btn")
        download_file = gr.File(label="저장된 파일 다운로드", interactive=False, elem_id="download_file")

    # Write the merged text to a file and offer it for download.
    save_btn.click(
        fn=save_merged_to_txt,
        inputs=[subject_out, section_dropdown, merged_content],
        outputs=[download_file]
    )

# Enable Gradio's request queue for the app.
app.queue()

# Start the server only when the file is executed directly.
if __name__ == "__main__":
    logging.debug("통합 Gradio 앱 실행 중")
    app.launch(debug=True)