Spaces:

unnastyle
/

caroline_script_Ver.3.2

Sleeping

App Files Files Community

caroline_script_Ver.3.2 / app.py

unnastyle

Create app.py

80b36c6 verified 5 months ago

raw

history blame contribute delete

16.7 kB

	# Printed as soon as the module starts executing, before the imports below run;
	# presumably a marker for the deployment logs -- TODO confirm.
	print("### APP.PY LOADED ###")

	import gradio as gr
	from bs4 import BeautifulSoup
	import re
	import logging
	import requests
	import os
	import tempfile

	# Configure the root logger at DEBUG level; each record shows time, level and message.
	logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

	# ---------- [모듈1: 기존 기본코드] 시작 ----------
	# English month name -> Korean month label ("1월" .. "12월"), in calendar order.
	month_mapping = {
	    english_name: f"{month_number}월"
	    for month_number, english_name in enumerate(
	        (
	            "January",
	            "February",
	            "March",
	            "April",
	            "May",
	            "June",
	            "July",
	            "August",
	            "September",
	            "October",
	            "November",
	            "December",
	        ),
	        start=1,
	    )
	}

	def convert_date_range(date_range_str):
	    """Turn "D Month - D Month" into "<month> D일 ~ <month> D일".

	    The input is split on "-"; it is returned unchanged when that does not
	    yield exactly two parts, or when either part has fewer than two
	    whitespace-separated tokens. Month names are translated through
	    ``month_mapping``; a name missing from the mapping is kept as-is.
	    """
	    logging.debug("원본 날짜 범위: %s", date_range_str)

	    halves = date_range_str.split('-')
	    if len(halves) != 2:
	        logging.debug("날짜 범위 형식이 올바르지 않음: %s", date_range_str)
	        return date_range_str

	    first, last = (half.strip() for half in halves)
	    first_tokens, last_tokens = first.split(), last.split()
	    if min(len(first_tokens), len(last_tokens)) < 2:
	        logging.debug("날짜 구성요소 부족: %s, %s", first, last)
	        return date_range_str

	    def to_korean(tokens):
	        # tokens[0] is the day, tokens[1] the English month name.
	        day, month_en = tokens[0], tokens[1]
	        return f"{month_mapping.get(month_en, month_en)} {day}일"

	    converted = f"{to_korean(first_tokens)} ~ {to_korean(last_tokens)}"
	    logging.debug("변환된 날짜 범위: %s", converted)
	    return converted

	def process_html(html_text):
	    """Extract the subject name and a plain-text outline of every course section.

	    Args:
	        html_text: HTML of the course page.

	    Returns:
	        ``(subject_name, sections_output)``. ``subject_name`` is the text of
	        the first ``<h1>`` ("" when there is none). ``sections_output``
	        contains, for each ``<li id="section-N">`` with N >= 1, a heading
	        line, one ``강의{n} : {url}`` line per ``<iframe>`` with a non-empty
	        ``src``, and a trailing blank line.
	    """
	    logging.debug("입력 HTML 처리 시작")
	    soup = BeautifulSoup(html_text, "html.parser")

	    subject_name = ""
	    subject_elem = soup.find("h1")
	    if subject_elem:
	        subject_name = subject_elem.get_text(strip=True)
	        logging.debug("추출된 과목명: %s", subject_name)
	    else:
	        logging.debug("h1 태그를 찾지 못함")

	    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
	    logging.debug("찾은 섹션 개수: %d", len(section_elements))

	    # Pieces are collected and joined once instead of growing a string with +=.
	    output_parts = []
	    for section in section_elements:
	        section_id = section.get("id")
	        logging.debug("처리 중인 섹션 ID: %s", section_id)
	        sec_match = re.search(r"section-(\d+)", section_id)
	        if not sec_match:
	            continue
	        sec_num = int(sec_match.group(1))
	        if sec_num == 0:
	            logging.debug("section-0 은 스킵")
	            continue

	        # section-1 is labelled as the introduction; section-N (N >= 2) is week N-1.
	        section_label = "Introduction" if sec_num == 1 else f"{sec_num - 1}주차"

	        date_range_text = ""
	        h3_elem = section.find("h3", class_="sectionname")
	        header_link = h3_elem.find("a") if h3_elem else None
	        if header_link:
	            header_text = header_link.get_text(strip=True)
	            logging.debug("헤더 텍스트: %s", header_text)
	            # Whitespace around the hyphen is optional, so "3 March-9 March"
	            # is recognised as well as "3 March - 9 March".
	            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
	            if date_match:
	                date_range_text = convert_date_range(date_match.group(1))
	            else:
	                logging.debug("날짜 범위 패턴 매칭 실패: %s", header_text)
	        else:
	            logging.debug("h3 또는 h3 내 a 태그를 찾지 못함 for section: %s", section_id)

	        # The introduction never shows a date range, even if one was found.
	        if sec_num != 1 and date_range_text:
	            section_heading = f"섹션 : {section_label} ({date_range_text})"
	        else:
	            section_heading = f"섹션 : {section_label}"
	        output_parts.append(section_heading + "\n")

	        iframes = section.find_all("iframe")
	        logging.debug("섹션 %s 내 찾은 iframe 개수: %d", section_id, len(iframes))
	        # idx counts every iframe, so numbering skips iframes without a src.
	        for idx, iframe in enumerate(iframes, start=1):
	            video_url = iframe.get("src", "").strip()
	            if video_url:
	                output_parts.append(f"강의{idx} : {video_url}\n")
	                logging.debug("추출된 동영상 강의 URL: %s", video_url)
	        output_parts.append("\n")

	    logging.debug("HTML 처리 완료")
	    return subject_name, "".join(output_parts)

	def process_html_sections(html_text):
	    """Parse the course page into per-section lecture lists for the UI.

	    Args:
	        html_text: HTML of the course page; ``None`` is treated as "".

	    Returns:
	        ``(subject_name, dropdown_update, sections_dict)`` where
	        ``subject_name`` is the text of the first ``<h1>`` ("" if absent),
	        ``dropdown_update`` is a ``gr.update`` carrying the section labels as
	        choices with the first label selected (``None`` when there are no
	        sections), and ``sections_dict`` maps each label to
	        ``(lecture_list_text, url_list)``.
	    """
	    logging.debug("입력 HTML 처리 시작 (섹션별 분리) len=%d", len(html_text) if html_text else 0)
	    # The log line above already tolerates None; give the parser the same
	    # tolerance instead of handing it None.
	    soup = BeautifulSoup(html_text or "", "html.parser")

	    subject_name = ""
	    subject_elem = soup.find("h1")
	    if subject_elem:
	        subject_name = subject_elem.get_text(strip=True)
	        logging.debug("추출된 과목명: %s", subject_name)
	    else:
	        logging.debug("h1 태그를 찾지 못함")

	    sections_dict = {}
	    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
	    logging.debug("찾은 섹션 개수: %d", len(section_elements))

	    for section in section_elements:
	        section_id = section.get("id")
	        logging.debug("처리 중인 섹션 ID: %s", section_id)
	        sec_match = re.search(r"section-(\d+)", section_id)
	        if not sec_match:
	            continue
	        sec_num = int(sec_match.group(1))
	        if sec_num == 0:
	            logging.debug("section-0 은 스킵")
	            continue

	        # section-1 is labelled as the introduction; section-N (N >= 2) is week N-1.
	        if sec_num == 1:
	            section_label = "섹션 : Introduction"
	        else:
	            section_label = f"섹션 : {sec_num - 1}주차"

	        date_range_text = ""
	        h3_elem = section.find("h3", class_="sectionname")
	        header_link = h3_elem.find("a") if h3_elem else None
	        if header_link:
	            header_text = header_link.get_text(strip=True)
	            # Whitespace around the hyphen is optional, so "3 March-9 March"
	            # is recognised as well as "3 March - 9 March".
	            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
	            if date_match:
	                date_range_text = convert_date_range(date_match.group(1))

	        # The introduction never shows a date range.
	        if sec_num != 1 and date_range_text:
	            section_label += f" ({date_range_text})"

	        iframes = section.find_all("iframe")
	        logging.debug("섹션 %s 내 찾은 iframe 개수: %d", section_id, len(iframes))
	        lecture_lines = []
	        url_list = []
	        # idx counts every iframe, so numbering skips iframes without a src.
	        for idx, iframe in enumerate(iframes, start=1):
	            video_url = iframe.get("src", "").strip()
	            if video_url:
	                lecture_lines.append(f"강의{idx} : {video_url}")
	                url_list.append(video_url)

	        lectures_str = "\n".join(lecture_lines) if url_list else "강의가 없습니다"

	        # A later section with the same label replaces the earlier entry.
	        sections_dict[section_label] = (lectures_str.strip(), url_list)

	    sections_list = list(sections_dict)
	    default_val = sections_list[0] if sections_list else None
	    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict

	def update_lecture_text_only(selected_section, sections_dict):
	    """Return the lecture list text stored for ``selected_section``.

	    Returns "" when either argument is empty or missing. When the section is
	    unknown or its stored text is empty, the placeholder "강의가 없습니다" is
	    returned instead.
	    """
	    if not (selected_section and sections_dict):
	        return ""

	    # Each entry is a (lecture_text, url_list) pair; only the text is used here.
	    listing, _urls = sections_dict.get(selected_section, ("", []))
	    listing = listing or "강의가 없습니다"
	    logging.debug("update_lecture_text_only - 선택된 섹션: %s", selected_section)
	    return listing
	# ---------- [모듈1: 기존 기본코드] 끝 ----------

	# ---------- [모듈2: 추가코드] 시작 ----------
	def fetch_page_source(url):
	    """GET ``url`` (20 s timeout) and return the response body as text.

	    Any exception, including an HTTP error status raised by
	    ``raise_for_status``, is logged and reported in-band: the function then
	    returns a string starting with "오류 발생: " followed by the exception text.
	    """
	    try:
	        logging.debug("강의 페이지를 가져오는 중: %s", url)
	        reply = requests.get(url, timeout=20)
	        reply.raise_for_status()
	        page_text = reply.text
	        logging.debug("페이지 소스를 성공적으로 가져옴 len=%d", len(page_text))
	    except Exception as e:
	        logging.error("페이지 소스 가져오기 오류: %s", e, exc_info=True)
	        return f"오류 발생: {e}"
	    return page_text

	def _normalize_vimeo_url(url_str: str) -> str:
	if not url_str:
	return ""
	return url_str.replace("\\u0026", "&").replace("\\/", "/")

	def create_script_url(lecture_url):
	    """Find the URL of the first text track referenced by a lecture page.

	    The page at ``lecture_url`` is downloaded with ``fetch_page_source`` and
	    searched for the first object of a ``"text_tracks": [ {...} ]`` array;
	    that object's ``"url"`` value is unescaped and made absolute.

	    Returns:
	        The absolute track URL, or "" when the download failed or no text
	        track entry was found.
	    """
	    page_source = fetch_page_source(lecture_url)
	    # fetch_page_source reports failures in-band with this prefix.
	    if page_source.startswith("오류 발생:"):
	        return ""

	    # Whitespace around ':' and '[' is optional and any number of other keys
	    # may precede "url" inside the first object. The previous pattern
	    # required exactly one whitespace character at each separator and exactly
	    # one character before "url", so compact JSON never matched.
	    pattern = r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"'
	    match = re.search(pattern, page_source)
	    if not match:
	        logging.debug("페이지 소스에서 스크립트 URL을 찾지 못함")
	        return ""

	    raw_url = _normalize_vimeo_url(match.group(1))

	    # Use an absolute URL as-is (avoids a "player.vimeo.comhttps..." result).
	    if raw_url.startswith(("https://", "http://")):
	        script_url = raw_url
	    elif raw_url.startswith("//"):
	        # Protocol-relative URL: only the scheme is missing.
	        script_url = "https:" + raw_url
	    else:
	        script_url = "https://player.vimeo.com" + raw_url

	    logging.debug("스크립트 URL 완성: %s", script_url)
	    return script_url

	def fetch_script(script_url):
	    """GET ``script_url`` (20 s timeout) and return the response body as text.

	    Any exception, including an HTTP error status raised by
	    ``raise_for_status``, is logged and reported in-band: the function then
	    returns a string starting with "오류 발생: " followed by the exception text.
	    """
	    try:
	        logging.debug("스크립트를 가져오는 중: %s", script_url)
	        reply = requests.get(script_url, timeout=20)
	        reply.raise_for_status()
	        track_text = reply.text
	        logging.debug("스크립트를 성공적으로 가져옴 len=%d", len(track_text))
	    except Exception as e:
	        logging.error("스크립트 가져오기 오류: %s", e, exc_info=True)
	        return f"오류 발생: {e}"
	    return track_text

	def remove_timeline(script_text, lecture_number):
	    """Reduce a subtitle track to its spoken text on a single line.

	    Blank lines, lines consisting only of digits (cue numbers) and cue timing
	    lines ("start --> end") are dropped. The remaining lines are joined with
	    single spaces, a space is inserted after any period that is directly
	    followed by a non-space character, and a leading "WEBVTT" header is
	    removed.

	    Args:
	        script_text: Raw subtitle text; ``None`` is treated as "".
	        lecture_number: Unused; kept so existing callers keep working.

	    Returns:
	        The cleaned transcript as one string.
	    """
	    # A timestamp is [hh:]mm:ss with optional milliseconds after '.' or ','.
	    timestamp = r'\d+:\d{2}(?::\d{2})?(?:[.,]\d{1,3})?'
	    # Whitespace around the arrow is optional and cue settings may follow the
	    # end time (for example "align:start").
	    cue_timing = re.compile(r'^' + timestamp + r'\s*-->\s*' + timestamp + r'(?:\s.*)?$')
	    cue_number = re.compile(r'^\d+$')

	    spoken_lines = []
	    for line in (script_text or "").splitlines():
	        stripped_line = line.strip()
	        if not stripped_line:
	            continue
	        if cue_number.match(stripped_line) or cue_timing.match(stripped_line):
	            continue
	        spoken_lines.append(stripped_line)

	    # Join with a space: caption lines routinely break mid-sentence, and
	    # joining with "" would glue the last word of one line to the first word
	    # of the next.
	    cleaned_text = " ".join(spoken_lines)
	    cleaned_text = re.sub(r'\.(\S)', r'. \1', cleaned_text)
	    cleaned_text = re.sub(r'^WEBVTT\s*', '', cleaned_text)
	    return cleaned_text

	def process_full(lecture_url):
	    """Fetch the transcript for one lecture page and strip its timing lines.

	    Returns:
	        The cleaned transcript; "스크립트 URL 생성 실패" when no text track URL
	        could be derived from the page; or the in-band error string produced
	        by ``fetch_script`` when the track download failed.
	    """
	    script_url = create_script_url(lecture_url)
	    if not script_url:
	        return "스크립트 URL 생성 실패"

	    script_text = fetch_script(script_url)
	    # fetch_script signals failure with a string that starts with this
	    # prefix. Check the prefix only: a substring test would also misclassify
	    # a genuine transcript that merely contains the phrase.
	    if script_text.startswith("오류 발생:"):
	        return script_text

	    return remove_timeline(script_text, 1)
	# ---------- [모듈2: 추가코드] 끝 ----------

	# ---------- [모듈3: 병합/저장] ----------
	def merge_contents_global(l1, l2, l3):
	    """Combine up to three lecture texts into one labelled document.

	    Each text that is non-empty after stripping becomes a "[강의N]" header
	    line followed by the stripped text, where N is the argument position.
	    Chunks are separated by one blank line; empty texts are left out.
	    """
	    chunks = [
	        f"[강의{position}]\n{text.strip()}"
	        for position, text in enumerate((l1, l2, l3), start=1)
	        if text.strip()
	    ]
	    return "\n\n".join(chunks)

	def handle_fetch_all(lecture_list_text):
	    """Fetch the transcripts for the first three lectures of a lecture list.

	    Lines of the form "강의<n> : <url>" are read from ``lecture_list_text``;
	    at most three URLs are used and missing slots are filled with "".

	    Returns:
	        A 7-tuple: the three URLs, the three transcripts ("" for an empty
	        slot), and the merged text from ``merge_contents_global``.
	    """
	    logging.debug("강의 내용 가져오기 버튼 클릭됨. len=%d", len(lecture_list_text) if lecture_list_text else 0)

	    found = (
	        re.match(r"강의\d+\s:\s(.+)", raw_line.strip())
	        for raw_line in (lecture_list_text or "").splitlines()
	    )
	    urls = [hit.group(1).strip() for hit in found if hit][:3]
	    # Pad to exactly three slots so the outputs always line up with the UI.
	    urls += [""] * (3 - len(urls))

	    lec_contents = [process_full(url) if url else "" for url in urls]
	    merged = merge_contents_global(*lec_contents)
	    return (*urls, *lec_contents, merged)

	def _sanitize_filename(name: str) -> str:
	if not name:
	return "output"
	return re.sub(r'[\\/:*?"<>\|]+', "_", name).strip()

	def _extract_section_token(section_label: str) -> str:
	if not section_label:
	return "섹션"
	s = section_label.strip().replace("섹션 : ", "").strip()
	s = re.sub(r"\s$.$\s*$", "", s).strip()
	return s if s else "섹션"

	def save_merged_to_txt(subject_name, selected_section, merged_text):
	    """Write the merged transcript to a UTF-8 text file in the temp directory.

	    The file name is "<subject>(스크립트)_<section token>.txt" after
	    sanitising; "과목명" stands in for an empty subject.

	    Returns:
	        The path of the written file, or ``None`` if anything failed (the
	        exception is logged).
	    """
	    try:
	        body = "" if merged_text is None else merged_text
	        subject = subject_name or "과목명"
	        section_token = _extract_section_token(selected_section)
	        target = os.path.join(
	            tempfile.gettempdir(),
	            _sanitize_filename(f"{subject}(스크립트)_{section_token}.txt"),
	        )

	        with open(target, "w", encoding="utf-8") as out:
	            # Saved exactly as received, with no modification of the content.
	            out.write(body)

	        logging.debug("TXT 저장 완료: %s", target)
	        return target
	    except Exception as e:
	        logging.exception("save_merged_to_txt 오류: %s", e)
	        return None

	def load_html_from_file(file_obj):
	    """Read an uploaded file as UTF-8 text.

	    Args:
	        file_obj: An object with a ``name`` attribute holding the path, or
	            anything whose ``str()`` is the path; ``None`` means no upload.

	    Returns:
	        ``(text, status)``. On success ``status`` starts with "OK:" and
	        reports the text length; on failure ``text`` is "" and ``status``
	        starts with "오류:".
	    """
	    try:
	        logging.debug("load_html_from_file 진입")
	        if file_obj is None:
	            return "", "오류: 업로드된 파일이 없습니다."

	        if hasattr(file_obj, "name"):
	            source_path = file_obj.name
	        else:
	            source_path = str(file_obj)
	        logging.debug("파일에서 HTML 로드: %s", source_path)

	        with open(source_path, "r", encoding="utf-8") as handle:
	            contents = handle.read()
	        return contents, f"OK: 파일 로드 성공 (len={len(contents)})"
	    except Exception as e:
	        logging.exception("load_html_from_file 오류: %s", e)
	        return "", f"오류: {repr(e)}"

	# ✅ 프론트-백엔드 연결 테스트: 클릭 시 status_out이 반드시 바뀌어야 정상
	def test_click(_):
	    """Connectivity probe: log the click and return the fixed string "clicked".

	    The single argument is ignored.
	    """
	    logging.debug("### BUTTON CLICKED (test_click) ###")
	    return "clicked"


	# ---------- Gradio 앱 ----------
	# Gradio UI: layout first, then the event wiring for each button/dropdown.
	# NOTE(review): the listing this was taken from had lost its indentation, so the
	# nesting of the Tab/Row/Column context managers below is a reconstruction --
	# confirm against the deployed app.
	with gr.Blocks() as app:
	    gr.Markdown("# 캐롤라인대학 강의 추출기 Ver.2.4")

	    with gr.Tab("HTML 파싱 및 섹션 선택"):
	        with gr.Row():
	            # Input side: raw HTML (typed or loaded from a file) and status output.
	            with gr.Column():
	                html_input = gr.Textbox(label="전체 페이지 HTML 입력", lines=20, placeholder="HTML 코드를 입력하세요...")

	                html_file = gr.File(label="HTML 파일 업로드(.txt/.html)", file_types=[".txt", ".html", ".htm"])
	                load_btn = gr.Button("업로드 파일을 입력창에 불러오기")

	                parse_btn = gr.Button("Submit")

	                # Connectivity test button
	                test_btn = gr.Button("연결 테스트(클릭 확인)")
	                status_out = gr.Textbox(label="상태/디버그", interactive=False)

	            # Result side: subject name, section picker and that section's lecture list.
	            with gr.Column():
	                subject_out = gr.Textbox(label="과목명", interactive=False)
	                section_dropdown = gr.Dropdown(label="섹션 선택", choices=[], interactive=True)
	                lecture_out = gr.Textbox(label="선택한 섹션 강의 목록", lines=10, interactive=False)
	                fetch_all_btn = gr.Button("강의 내용 가져오기", elem_id="fetch_all_btn")

	        # Holds the sections dict returned by process_html_sections between events.
	        sections_state = gr.State()

	        # Copy the uploaded file's text into the HTML input box and report the status.
	        load_btn.click(
	            fn=load_html_from_file,
	            inputs=[html_file],
	            outputs=[html_input, status_out]
	        )

	        # Connectivity test: status_out must change to "clicked"
	        test_btn.click(
	            fn=test_click,
	            inputs=[html_input],
	            outputs=[status_out]
	        )

	        # Parse the HTML: fills the subject name, the dropdown choices and the state.
	        parse_btn.click(
	            fn=process_html_sections,
	            inputs=html_input,
	            outputs=[subject_out, section_dropdown, sections_state]
	        )

	        # Show the lecture list of the section picked in the dropdown.
	        section_dropdown.change(
	            fn=update_lecture_text_only,
	            inputs=[section_dropdown, sections_state],
	            outputs=lecture_out
	        )

	        gr.Markdown("## 강의 내용 가져오기")
	        with gr.Row():
	            url1 = gr.Textbox(label="강의1 URL", elem_id="url1")
	            url2 = gr.Textbox(label="강의2 URL", elem_id="url2")
	            url3 = gr.Textbox(label="강의3 URL", elem_id="url3")

	        with gr.Row():
	            lecture_content1 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content1")
	            lecture_content2 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content2")
	            lecture_content3 = gr.Textbox(label="강의 내용", lines=10, elem_id="lecture_content3")

	        gr.Markdown("## 강의 내용 합치기")
	        merged_content = gr.Textbox(label="전체 강의 내용", lines=10, elem_id="merged_content")

	        # handle_fetch_all returns seven values, matching these seven outputs in order.
	        fetch_all_btn.click(
	            fn=handle_fetch_all,
	            inputs=lecture_out,
	            outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content]
	        )

	        with gr.Row():
	            save_btn = gr.Button("파일로 저장", elem_id="save_btn")
	            download_file = gr.File(label="저장된 파일 다운로드", interactive=False, elem_id="download_file")

	        # Write the merged text to a file and offer its path for download.
	        save_btn.click(
	            fn=save_merged_to_txt,
	            inputs=[subject_out, section_dropdown, merged_content],
	            outputs=[download_file]
	        )

	# Enable Gradio's request queue for the app.
	app.queue()

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    logging.debug("통합 Gradio 앱 실행 중")
	    app.launch(debug=True)