# NOTE: removed a non-code scrape artifact ("Spaces: Sleeping" status header)
# that preceded the source; it was not part of the program.
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| import re | |
| import logging | |
| import requests | |
# Debug-level logging for the whole integrated app; the format adds timestamp and level.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
| # ---------- [๋ชจ๋1: ๊ธฐ์กด ๊ธฐ๋ณธ์ฝ๋] ์์ ---------- | |
# Mapping from English month names to their localized month labels.
month_mapping = {
    name: f"{number}์"
    for number, name in enumerate(
        (
            "January", "February", "March", "April",
            "May", "June", "July", "August",
            "September", "October", "November", "December",
        ),
        start=1,
    )
}
def convert_date_range(date_range_str):
    """Translate a date range such as '6 January - 12 January' into the
    localized '<month> <day>' form used by the UI.

    The input is returned unchanged whenever it does not look like
    '<day> <month> - <day> <month>'.
    """
    logging.debug("์๋ณธ ๋ ์ง ๋ฒ์: %s", date_range_str)
    halves = [half.strip() for half in date_range_str.split('-')]
    if len(halves) != 2:
        # Not exactly one '-' separator: leave the text as-is.
        logging.debug("๋ ์ง ๋ฒ์ ํ์์ด ์ฌ๋ฐ๋ฅด์ง ์์: %s", date_range_str)
        return date_range_str
    first_tokens = halves[0].split()
    second_tokens = halves[1].split()
    if len(first_tokens) < 2 or len(second_tokens) < 2:
        # Each side needs at least a day number and a month name.
        logging.debug("๋ ์ง ๊ตฌ์ฑ์์ ๋ถ์กฑ: %s, %s", halves[0], halves[1])
        return date_range_str
    start_day, start_month_en = first_tokens[0], first_tokens[1]
    end_day, end_month_en = second_tokens[0], second_tokens[1]
    # Unknown month names fall back to their English spelling.
    start_month = month_mapping.get(start_month_en, start_month_en)
    end_month = month_mapping.get(end_month_en, end_month_en)
    converted = f"{start_month} {start_day}์ผ ~ {end_month} {end_day}์ผ"
    logging.debug("๋ณํ๋ ๋ ์ง ๋ฒ์: %s", converted)
    return converted
def process_html(html_text):
    """Parse a full course page's HTML and extract the subject name plus a
    plain-text listing of the video lecture URLs found in each section.

    Returns:
        tuple[str, str]: (subject name from <h1>, multi-line text where each
        section heading is followed by its numbered lecture URLs).
    """
    logging.debug("์ ๋ ฅ HTML ์ฒ๋ฆฌ ์์")
    soup = BeautifulSoup(html_text, "html.parser")
    # 1. Subject name taken from the page's <h1> tag (empty string if absent).
    subject_elem = soup.find("h1")
    subject_name = ""
    if subject_elem:
        subject_name = subject_elem.get_text(strip=True)
        logging.debug("์ถ์ถ๋ ๊ณผ๋ชฉ๋ช : %s", subject_name)
    else:
        logging.debug("h1 ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ")
    # 2. Per-section lecture listing; sections are <li id="section-N"> items.
    sections_output = ""
    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์ ์น์ ๊ฐ์: %d", len(section_elements))
    for section in section_elements:
        section_id = section.get("id")
        logging.debug("์ฒ๋ฆฌ ์ค์ธ ์น์ ID: %s", section_id)
        sec_match = re.search(r"section-(\d+)", section_id)
        if not sec_match:
            continue
        sec_num = int(sec_match.group(1))
        if sec_num == 0:
            # section-0 is the course-overview container, not a week; skip it.
            logging.debug("section-0 ์ ์คํต")
            continue
        # Section label: section-1 is the introduction; section-2 onward map to weeks.
        if sec_num == 1:
            section_label = "Introduction"
        else:
            week_num = sec_num - 1  # section-2 -> week 1, section-3 -> week 2, ...
            section_label = f"{week_num}์ฃผ์ฐจ"
        # Date range from the section header (<a> text inside <h3 class="sectionname">).
        h3_elem = section.find("h3", class_="sectionname")
        date_range_text = ""
        if h3_elem and h3_elem.find("a"):
            header_text = h3_elem.find("a").get_text(strip=True)
            logging.debug("ํค๋ ํ ์คํธ: %s", header_text)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                raw_date_range = date_match.group(1)
                date_range_text = convert_date_range(raw_date_range)
            else:
                logging.debug("๋ ์ง ๋ฒ์ ํจํด ๋งค์นญ ์คํจ: %s", header_text)
        else:
            logging.debug("h3 ๋๋ h3 ๋ด a ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ for section: %s", section_id)
        # The introduction never shows a date range; weeks show it when found.
        if sec_num == 1:
            section_heading = f"์น์ : {section_label}"
        else:
            if date_range_text:
                section_heading = f"์น์ : {section_label} ({date_range_text})"
            else:
                section_heading = f"์น์ : {section_label}"
        sections_output += section_heading + "\n"
        # Collect lecture video URLs from <iframe src="..."> inside this section.
        iframes = section.find_all("iframe")
        logging.debug("์น์ %s ๋ด ์ฐพ์ iframe ๊ฐ์: %d", section_id, len(iframes))
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if video_url:
                sections_output += f"๊ฐ์{idx} : {video_url}\n"
                logging.debug("์ถ์ถ๋ ๋์์ ๊ฐ์ URL: %s", video_url)
        sections_output += "\n"
    logging.debug("HTML ์ฒ๋ฆฌ ์๋ฃ")
    return subject_name, sections_output
def process_html_sections(html_text):
    """Parse a full course page's HTML and split it into per-section data.

    Returns a 3-tuple wired directly into Gradio outputs:
      - subject name extracted from <h1>,
      - a gr.update() refreshing the section dropdown (choices + default value),
      - a dict mapping section label -> (lecture listing text, URL list).
    """
    logging.debug("์ ๋ ฅ HTML ์ฒ๋ฆฌ ์์ (์น์ ๋ณ ๋ถ๋ฆฌ)")
    soup = BeautifulSoup(html_text, "html.parser")
    # 1. Subject name from the <h1> tag (empty string when missing).
    subject_elem = soup.find("h1")
    subject_name = ""
    if subject_elem:
        subject_name = subject_elem.get_text(strip=True)
        logging.debug("์ถ์ถ๋ ๊ณผ๋ชฉ๋ช : %s", subject_name)
    else:
        logging.debug("h1 ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ")
    sections_dict = {}
    section_elements = soup.find_all("li", id=re.compile(r"^section-\d+"))
    logging.debug("์ฐพ์ ์น์ ๊ฐ์: %d", len(section_elements))
    for section in section_elements:
        section_id = section.get("id")
        logging.debug("์ฒ๋ฆฌ ์ค์ธ ์น์ ID: %s", section_id)
        sec_match = re.search(r"section-(\d+)", section_id)
        if not sec_match:
            continue
        sec_num = int(sec_match.group(1))
        if sec_num == 0:
            # section-0 is the course-overview container; skip it.
            logging.debug("section-0 ์ ์คํต")
            continue
        # Section label: section-1 is the introduction; section-2 onward are weeks.
        if sec_num == 1:
            section_label = "์น์ : Introduction"
        else:
            week_num = sec_num - 1
            section_label = f"์น์ : {week_num}์ฃผ์ฐจ"
        # Date range from the section header (<a> inside <h3 class="sectionname">).
        h3_elem = section.find("h3", class_="sectionname")
        date_range_text = ""
        if h3_elem and h3_elem.find("a"):
            header_text = h3_elem.find("a").get_text(strip=True)
            logging.debug("ํค๋ ํ ์คํธ: %s", header_text)
            date_match = re.search(r'(\d+\s+[A-Za-z]+\s*-\s*\d+\s+[A-Za-z]+)', header_text)
            if date_match:
                raw_date_range = date_match.group(1)
                date_range_text = convert_date_range(raw_date_range)
            else:
                logging.debug("๋ ์ง ๋ฒ์ ํจํด ๋งค์นญ ์คํจ: %s", header_text)
        else:
            logging.debug("h3 ๋๋ h3 ๋ด a ํ๊ทธ๋ฅผ ์ฐพ์ง ๋ชปํจ for section: %s", section_id)
        if sec_num != 1 and date_range_text:
            section_label += f" ({date_range_text})"
        # Lecture listing text plus the raw URL list from <iframe src="...">.
        lectures_str = ""
        url_list = []
        iframes = section.find_all("iframe")
        logging.debug("์น์ %s ๋ด ์ฐพ์ iframe ๊ฐ์: %d", section_id, len(iframes))
        for idx, iframe in enumerate(iframes, start=1):
            video_url = iframe.get("src", "").strip()
            if video_url:
                lectures_str += f"๊ฐ์{idx} : {video_url}\n"
                url_list.append(video_url)
                logging.debug("์ถ์ถ๋ ๋์์ ๊ฐ์ URL: %s", video_url)
        # Sections without any lecture get a placeholder message instead.
        if not url_list:
            lectures_str = "๊ฐ์๊ฐ ์์ต๋๋ค"
        sections_dict[section_label] = (lectures_str.strip(), url_list)
    logging.debug("HTML ์ฒ๋ฆฌ ์๋ฃ (์น์ ๋ณ ๋ถ๋ฆฌ)")
    sections_list = list(sections_dict.keys())
    default_val = sections_list[0] if sections_list else None
    # gr.update() (not the removed gr.Dropdown.update()) works across Gradio versions.
    return subject_name, gr.update(choices=sections_list, value=default_val), sections_dict
def update_lecture_and_urls(selected_section, sections_dict):
    """Return the lecture listing and up to three lecture URLs for a section.

    Returns a 4-tuple:
      - listing text (placeholder message when the section has no lectures),
      - url1, url2, url3 (missing slots are empty strings).
    All four entries are empty strings when nothing is selected yet.
    """
    if not selected_section or not sections_dict:
        return "", "", "", ""
    lectures_text, url_list = sections_dict.get(selected_section, ("", []))
    lectures_text = lectures_text or "๊ฐ์๊ฐ ์์ต๋๋ค"
    # Pad the URL list out to exactly three entries.
    padded = (list(url_list) + ["", "", ""])[:3]
    return lectures_text, padded[0], padded[1], padded[2]
def update_lecture_text_only(selected_section, sections_dict):
    """Return only the lecture-listing text for the chosen section, or a
    placeholder message when the section exists but holds no lectures."""
    if not selected_section or not sections_dict:
        return ""
    entry = sections_dict.get(selected_section, ("", []))
    lectures_text = entry[0] or "๊ฐ์๊ฐ ์์ต๋๋ค"
    logging.debug("update_lecture_text_only - ์ ํ๋ ์น์ : %s, ๊ฐ์๋ชฉ๋ก: %s", selected_section, lectures_text)
    return lectures_text
| # ---------- [๋ชจ๋1: ๊ธฐ์กด ๊ธฐ๋ณธ์ฝ๋] ๋ ---------- | |
| # ---------- [๋ชจ๋2: ์ถ๊ฐ์ฝ๋] ์์ ---------- | |
def fetch_page_source(url):
    """Download and return the raw HTML of a lecture page.

    On any failure (network error, timeout, non-2xx status) a human-readable
    error string is returned instead of raising, matching what callers expect.
    """
    try:
        logging.debug(f"๊ฐ์ ํ์ด์ง๋ฅผ ๊ฐ์ ธ์ค๋ ์ค: {url}")
        # Fix: a missing timeout lets a stalled request hang the UI forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        logging.debug("ํ์ด์ง ์์ค๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ๊ฐ์ ธ์ด")
        return response.text
    except Exception as e:
        logging.error(f"ํ์ด์ง ์์ค ๊ฐ์ ธ์ค๊ธฐ ์ค๋ฅ: {e}")
        return "์ค๋ฅ ๋ฐ์: " + str(e)
def create_script_url(lecture_url):
    """Build the subtitle ("script") URL for a lecture.

    Fetches the lecture page, locates the first "url" entry inside the
    player's "text_tracks" JSON blob, and prefixes it with the Vimeo player
    host. Returns an empty string when no track URL can be found.
    """
    page_source = fetch_page_source(lecture_url)
    track_match = re.search(
        r'"text_tracks"\s*:\s*\[\s*\{[^}]*"url"\s*:\s*"([^"]+)"',
        page_source,
    )
    if not track_match:
        logging.debug("ํ์ด์ง ์์ค์์ ์คํฌ๋ฆฝํธ ํ๊ทธ ์์ ๋ถ๋ถ์ ์ฐพ์ง ๋ชปํจ")
        return ""
    script_url = "https://player.vimeo.com" + track_match.group(1)
    logging.debug(f"์คํฌ๋ฆฝํธ URL ์์ฑ: {script_url}")
    return script_url
def fetch_script(script_url):
    """Download and return the raw subtitle (WebVTT) text.

    On any failure (network error, timeout, non-2xx status) a human-readable
    error string is returned instead of raising; process_full() checks for it.
    """
    try:
        logging.debug(f"์คํฌ๋ฆฝํธ๋ฅผ ๊ฐ์ ธ์ค๋ ์ค: {script_url}")
        # Fix: a missing timeout lets a stalled request hang the UI forever.
        response = requests.get(script_url, timeout=30)
        response.raise_for_status()
        logging.debug("์คํฌ๋ฆฝํธ๋ฅผ ์ฑ๊ณต์ ์ผ๋ก ๊ฐ์ ธ์ด")
        return response.text
    except Exception as e:
        logging.error(f"์คํฌ๋ฆฝํธ ๊ฐ์ ธ์ค๊ธฐ ์ค๋ฅ: {e}")
        return "์ค๋ฅ ๋ฐ์: " + str(e)
def remove_timeline(script_text, lecture_number):
    """Strip WebVTT scaffolding from subtitle text, keeping only the prose.

    Cleanup rules (from the original spec):
      1. Cue numbers and timestamp lines are dropped.
      2. The remaining lines are concatenated with no gaps between them.
      3. Every period is guaranteed to be followed by a single space.
      4. A leading "WEBVTT" marker is removed.
      5. The caption text itself is never shortened or rephrased.

    `lecture_number` is accepted for interface compatibility; it is unused.
    """
    timestamp_re = re.compile(
        r'^\d{1,2}:\d{2}(?::\d{2}(?:\.\d{3})?)?\s*-->\s*\d{1,2}:\d{2}(?::\d{2}(?:\.\d{3})?)?$'
    )
    cue_number_re = re.compile(r'^\d+$')
    kept = []
    for raw_line in script_text.splitlines():
        line = raw_line.strip()
        if not line or cue_number_re.match(line) or timestamp_re.match(line):
            continue
        kept.append(line)
    merged = "".join(kept)
    # Ensure a single space after any period that runs straight into text.
    merged = re.sub(r'\.(\S)', r'. \1', merged)
    return re.sub(r'^WEBVTT\s*', '', merged)
| # ์๋ก์ด ์ ์ฒด ์ฒ๋ฆฌ ํจ์: ๊ฐ์ URL -> ์คํฌ๋ฆฝํธ URL ์์ฑ -> ์คํฌ๋ฆฝํธ ๊ฐ์ ธ์ค๊ธฐ -> ํ์๋ผ์ธ ์ ๊ฑฐ | |
def process_full(lecture_url):
    """End-to-end pipeline for one lecture: URL -> subtitle URL -> raw
    subtitle -> cleaned lecture text.

    Intermediate results are not exposed; only the final cleaned text (or an
    error message string) is returned.
    """
    script_url = create_script_url(lecture_url)
    if not script_url:
        return "์คํฌ๋ฆฝํธ URL ์์ฑ ์คํจ"
    script_text = fetch_script(script_url)
    # fetch_script reports failures as an error string rather than raising.
    if "์ค๋ฅ ๋ฐ์" in script_text:
        return script_text
    return remove_timeline(script_text, 1)
| # ---------- [๋ชจ๋2: ์ถ๊ฐ์ฝ๋] ๋ ---------- | |
| # ---------- [๋ชจ๋3: ๊ฐ์ ๋ด์ฉ ํฉ์น๊ธฐ ์ถ๊ฐ์ฝ๋] ์์ ---------- | |
# Standalone Blocks UI for the merged ("combined") lecture content panel.
# It is rendered inside `app` below; `merged_content` is filled by
# handle_fetch_all() through the outputs wiring in the main app.
with gr.Blocks() as merge_demo:
    gr.Markdown("## ๊ฐ์ ๋ด์ฉ ํฉ์น๊ธฐ")
    # The explicit "merge" button was removed by request; only the merged
    # textbox and its copy-to-clipboard helper remain.
    merged_content = gr.Textbox(label="์ ์ฒด ๊ฐ์ ๋ด์ฉ", lines=10, elem_id="merged_content")
    with gr.Row():
        merge_copy_btn = gr.Button("์ ์ฒด ๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ", elem_id="merge_copy_btn")
        merge_copy_result = gr.Textbox(label="์ ์ฒด ๊ฐ์ ๋ด์ฉ ๋ณต์ฌ ๊ฒฐ๊ณผ", interactive=False, elem_id="merge_copy_result")

    def merge_contents(l1, l2, l3):
        # Concatenate the non-empty lecture texts with labelled headers,
        # separating entries with one blank line.
        # NOTE(review): not wired to any event in this view — kept for reuse.
        merged = ""
        if l1.strip():
            merged += "[๊ฐ์1]\n" + l1.strip()
        if l2.strip():
            if merged:
                merged += "\n\n"
            merged += "[๊ฐ์2]\n" + l2.strip()
        if l3.strip():
            if merged:
                merged += "\n\n"
            merged += "[๊ฐ์3]\n" + l3.strip()
        return merged

    # Clipboard helper injected as raw HTML: copies the merged textarea's
    # value and writes the outcome into the result textbox.
    merge_copy_script = """
    <script>
    function setupMergeCopy(copyBtnId, textBoxId, resultBoxId) {
        const copyBtn = document.getElementById(copyBtnId);
        if (!copyBtn) {
            console.error("๋ฒํผ " + copyBtnId + "๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.");
            return;
        }
        copyBtn.addEventListener("click", function(){
            const textBoxElem = document.getElementById(textBoxId);
            const resultBoxElem = document.getElementById(resultBoxId);
            if(textBoxElem && resultBoxElem) {
                const textarea = textBoxElem.querySelector("textarea");
                const resultTextarea = resultBoxElem.querySelector("textarea");
                if(textarea && resultTextarea) {
                    var text = textarea.value;
                    if(text.trim() === ""){
                        resultTextarea.value = "๋ณต์ฌํ ๋ด์ฉ์ด ์์ต๋๋ค.";
                    } else {
                        navigator.clipboard.writeText(text).then(function(){
                            resultTextarea.value = "๋ณต์ฌ์๋ฃ";
                        }, function(err){
                            resultTextarea.value = "๋ณต์ฌ ์คํจ";
                        });
                    }
                }
            }
        });
    }
    document.addEventListener("DOMContentLoaded", function(){
        setupMergeCopy("merge_copy_btn", "merged_content", "merge_copy_result");
    });
    </script>
    """
    gr.HTML(merge_copy_script)
| # ---------- [๋ชจ๋3: ๊ฐ์ ๋ด์ฉ ํฉ์น๊ธฐ ์ถ๊ฐ์ฝ๋] ๋ ---------- | |
| # ---------- ์๋ก์ด ๊ธฐ๋ฅ: '์ ํํ ์น์ ๊ฐ์ ๋ชฉ๋ก'์์ ๊ฐ์ URL ์ถ์ถ ๋ฐ ๊ฐ์ ๋ด์ฉ ์๋ ์ ๋ฐ์ดํธ ๊ธฐ๋ฅ ์ถ๊ฐ ---------- | |
def merge_contents_global(l1, l2, l3):
    """Join up to three lecture texts into one labelled document.

    Empty (or whitespace-only) entries are skipped; the remaining entries are
    prefixed with a numbered label and separated by one blank line.
    """
    labelled = []
    for number, content in enumerate((l1, l2, l3), start=1):
        text = content.strip()
        if text:
            labelled.append(f"[๊ฐ์{number}]\n{text}")
    return "\n\n".join(labelled)
def handle_fetch_all(lecture_list_text):
    """Parse the selected section's lecture listing, fetch the cleaned text of
    up to three lectures, and return everything the UI bindings need.

    Returns a 7-tuple: (url1, url2, url3, content1, content2, content3,
    merged document). Missing slots are empty strings.
    """
    logging.debug("์๋ก์ด '๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ' ๋ฒํผ ํด๋ฆญ๋จ. ๊ฐ์ ๋ชฉ๋ก ํ ์คํธ: %s", lecture_list_text)
    urls = []
    for line in lecture_list_text.splitlines():
        m = re.match(r"๊ฐ์\d+\s*:\s*(.+)", line.strip())
        if m:
            url = m.group(1).strip()
            urls.append(url)
            logging.debug("์ถ์ถ๋ URL: %s", url)
    # Keep at most three URLs and pad the remainder with empty strings.
    urls = (urls + ["", "", ""])[:3]
    lec_contents = []
    for idx, url in enumerate(urls):
        if url:
            content = process_full(url)
            logging.debug("๊ฐ์ %d ๋ด์ฉ: %s", idx + 1, content)
        else:
            content = ""
            logging.debug("๊ฐ์ %d URL์ด ๋น์ด ์์.", idx + 1)
        lec_contents.append(content)
    merged = merge_contents_global(lec_contents[0], lec_contents[1], lec_contents[2])
    logging.debug("์ ์ฒด ๊ฐ์ ๋ด์ฉ ๋ณํฉ ์๋ฃ.")
    return urls[0], urls[1], urls[2], lec_contents[0], lec_contents[1], lec_contents[2], merged
| # ---------- ์๋ก์ด ๊ธฐ๋ฅ ๋ ---------- | |
| # ---------- ํตํฉ Gradio ์ฑ ๊ตฌ์ฑ (ํ ํ์ด์ง์ ๋ชจ๋ ํ์) ---------- | |
# Standalone Blocks UI for the three per-lecture panels (URL, content, copy
# button, copy result). Rendered inside `app` below; the textboxes are filled
# by handle_fetch_all() through the outputs wiring in the main app.
with gr.Blocks() as additional_demo:
    gr.Markdown("## ๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ")
    with gr.Row():
        url1 = gr.Textbox(label="๊ฐ์1 URL", elem_id="url1")
        url2 = gr.Textbox(label="๊ฐ์2 URL", elem_id="url2")
        url3 = gr.Textbox(label="๊ฐ์3 URL", elem_id="url3")
    with gr.Row():
        # Placeholder row kept from the original layout (intentionally empty).
        pass
    with gr.Row():
        lecture_content1 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content1")
        lecture_content2 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content2")
        lecture_content3 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ", lines=10, elem_id="lecture_content3")
    with gr.Row():
        copy_btn1 = gr.Button("๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ", elem_id="copy_btn1")
        copy_btn2 = gr.Button("๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ", elem_id="copy_btn2")
        copy_btn3 = gr.Button("๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ", elem_id="copy_btn3")
    with gr.Row():
        copy_result1 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ ๊ฒฐ๊ณผ", interactive=False, elem_id="copy_result1")
        copy_result2 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ ๊ฒฐ๊ณผ", interactive=False, elem_id="copy_result2")
        copy_result3 = gr.Textbox(label="๊ฐ์ ๋ด์ฉ ๋ณต์ฌํ๊ธฐ ๊ฒฐ๊ณผ", interactive=False, elem_id="copy_result3")
    # Clipboard helper injected as raw HTML: one setup per lecture panel,
    # copying that panel's textarea and reporting into its result textbox.
    custom_script = """
    <script>
    function setupCopy(copyBtnId, textBoxId, resultBoxId) {
        const copyBtn = document.getElementById(copyBtnId);
        if (!copyBtn) {
            console.error("๋ฒํผ " + copyBtnId + "๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.");
            return;
        }
        copyBtn.addEventListener("click", function(){
            const textBoxElem = document.getElementById(textBoxId);
            const resultBoxElem = document.getElementById(resultBoxId);
            if(textBoxElem && resultBoxElem) {
                const textarea = textBoxElem.querySelector("textarea");
                const resultTextarea = resultBoxElem.querySelector("textarea");
                if(textarea && resultTextarea) {
                    var text = textarea.value;
                    if(text.trim() === ""){
                        resultTextarea.value = "๋ณต์ฌํ ๋ด์ฉ์ด ์์ต๋๋ค.";
                    } else {
                        navigator.clipboard.writeText(text).then(function(){
                            resultTextarea.value = "๋ณต์ฌ์๋ฃ";
                        }, function(err){
                            resultTextarea.value = "๋ณต์ฌ ์คํจ";
                        });
                    }
                }
            }
        });
    }
    document.addEventListener("DOMContentLoaded", function(){
        setupCopy("copy_btn1", "lecture_content1", "copy_result1");
        setupCopy("copy_btn2", "lecture_content2", "copy_result2");
        setupCopy("copy_btn3", "lecture_content3", "copy_result3");
    });
    </script>
    """
    gr.HTML(custom_script)
# Main page: usage guide, the HTML-parsing tab, then the two sub-UIs
# (per-lecture panels and the merged-content panel) rendered below it.
with gr.Blocks() as app:
    gr.Markdown("# ์บ๋กค๋ผ์ธ๋ํ ๊ฐ์ ์ถ์ถ๊ธฐ Ver.2.2")
    gr.HTML(
        """
        <div style="background-color: #f0f0f0; padding: 10px; margin-bottom: 20px;">
        <strong>์ฌ์ฉ๋ฐฉ๋ฒ</strong>
        <ol>
        <li>์ถ์ถ์ ์ํ๋ ๊ฐ์ ํ์ด์ง์์ "Ctrl + U"๋ฅผ ๋๋ฌ "ํ์ด์ง ์์ค ๋ณด๊ธฐ" ํ์ด์ง๋ฅผ ์ฝ๋๋ค.</li>
        <li>ํ์ด์ง ์์ค ๋ณด๊ธฐ ํ์ด์ง์ ์์ค ๋ด์ฉ์ ์ ์ฒด ๋ณต์ฌ ํฉ๋๋ค. ("Ctrl+A" โ "Ctrl+C")</li>
        <li>๋ณต์ฌํ ๋ด์ฉ์ ์ถ์ถ๊ธฐ์ "์ ์ฒด ํ์ด์ง HTML ์ ๋ ฅ"๋์ ๋ถ์ฌ ๋ฃ๊ณ "Submit" ๋ฒํผ์ ํด๋ฆญํฉ๋๋ค.</li>
        <li>์ค๋ฅธ์ชฝ ์ฐฝ์์ ์ํ๋ ์น์ ์ ์ ํ ํ "๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ" ๋ฒํผ์ ํด๋ฆญํฉ๋๋ค.</li>
        <li>๊ฐ์ ธ์จ ๊ฐ์ ๋ด์ฉ ์ค์์ ํ์ํ ๋ด์ฉ๋ง ๋ณต์ฌํ์ฌ ์ฌ์ฉ ๊ฐ๋ฅํฉ๋๋ค.</li>
        </ol>
        </div>
        """
    )
    with gr.Tab("HTML ํ์ฑ ๋ฐ ์น์ ์ ํ"):
        with gr.Row():
            with gr.Column():
                html_input = gr.Textbox(label="์ ์ฒด ํ์ด์ง HTML ์ ๋ ฅ", lines=20, placeholder="HTML ์ฝ๋๋ฅผ ์ ๋ ฅํ์ธ์...")
                parse_btn = gr.Button("Submit")
            with gr.Column():
                subject_out = gr.Textbox(label="๊ณผ๋ชฉ๋ช ", interactive=False)
                section_dropdown = gr.Dropdown(label="์น์ ์ ํ", choices=[], interactive=True)
                lecture_out = gr.Textbox(label="์ ํํ ์น์ ๊ฐ์ ๋ชฉ๋ก", lines=10, interactive=False)
        with gr.Row():
            fetch_all_btn = gr.Button("๊ฐ์ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ", elem_id="fetch_all_btn")
        # Holds the section-label -> (listing, urls) dict between callbacks.
        sections_state = gr.State()
        # Submit: parse the pasted HTML into subject / dropdown / state.
        parse_btn.click(
            fn=process_html_sections,
            inputs=html_input,
            outputs=[subject_out, section_dropdown, sections_state]
        )
        # Selecting a section shows its lecture listing.
        section_dropdown.change(
            fn=update_lecture_text_only,
            inputs=[section_dropdown, sections_state],
            outputs=lecture_out
        )
        # Fetch button: pulls and cleans up to three lecture scripts; outputs
        # target components defined in additional_demo and merge_demo.
        fetch_all_btn.click(
            fn=handle_fetch_all,
            inputs=lecture_out,
            outputs=[url1, url2, url3, lecture_content1, lecture_content2, lecture_content3, merged_content]
        )
    additional_demo.render()
    merge_demo.render()

if __name__ == "__main__":
    logging.debug("ํตํฉ Gradio ์ฑ ์คํ ์ค")
    app.launch(debug=True)