Spaces:

jonghhhh
/

youtube_segment_summary

Sleeping

App Files Files Community

jonghhhh committed on Jan 24

Commit

914398e

verified ·

1 Parent(s): 8ebd22f

Upload youtube_segmenter.py with huggingface_hub

Browse files

Files changed (1) hide show

youtube_segmenter.py +461 -0

youtube_segmenter.py ADDED Viewed

	@@ -0,0 +1,461 @@

+import numpy as np
+import ruptures as rpt
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from youtube_transcript_api import YouTubeTranscriptApi as YTApi
+from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
+import re
+import torch
+import os
+import json
+from dotenv import load_dotenv
+import time
+import socket
+# Load the .env file
+load_dotenv()
+# DNS 설정 확인 (디버깅용)
+def check_dns():
+    """DNS 연결 확인"""
+    try:
+        socket.gethostbyname('www.youtube.com')
+        return True
+    except socket.gaierror:
+        return False
+def simple_sentence_split(text):
+    """
+    빠른 문장 분리 함수 (정규식 기반)
+    KSS보다 빠르며, 유튜브 자막에 충분히 효과적
+    """
+    # 문장 종결 부호 기준으로 분리
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    # 빈 문자열 제거 및 공백 정리
+    sentences = [s.strip() for s in sentences if s.strip()]
+    return sentences
+def extract_video_id(url):
+    """
+    유튜브 URL에서 비디오 ID를 추출합니다.
+    """
+    patterns = [
+        r'(?:youtube\.com/watch\?v=|youtu\.be/)([^&\n?#]+)',
+        r'youtube\.com/embed/([^&\n?#]+)',
+        r'youtube\.com/v/([^&\n?#]+)'
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    raise ValueError("유효한 유튜브 URL이 아닙니다.")
+def get_youtube_transcript(url, language='ko', max_retries=3):
+    """
+    유튜브 URL에서 자막을 가져옵니다.
+    2026년 최신 방식: list_transcripts를 사용하여 사용 가능한 자막을 찾고 fetch
+    Args:
+        url: 유튜브 URL
+        language: 자막 언어 (기본값: 'ko')
+        max_retries: 최대 재시도 횟수 (기본값: 3)
+    Returns:
+        tuple: (text, timeline_data)
+            - text: 전체 자막 텍스트
+            - timeline_data: 각 자막의 타임스탬프 정보 리스트
+    """
+    # DNS 확인
+    print("네트워크 연결 확인 중...")
+    if not check_dns():
+        print("경고: DNS 해석에 문제가 있습니다. 재시도 중...")
+        time.sleep(2)
+    for attempt in range(max_retries):
+        try:
+            video_id = extract_video_id(url)
+            print(f"비디오 ID: {video_id}")
+            # YouTubeTranscriptApi 인스턴스 생성
+            api = YTApi()
+            # 1. 사용 가능한 자막 리스트 확인
+            print(f"자막 목록 가져오는 중... (시도 {attempt + 1}/{max_retries})")
+            transcript_list = api.list(video_id)
+            # 2. 한국어 자막 우선 검색 (수동 작성 -> 자동 생성 순)
+            try:
+                transcript = transcript_list.find_transcript([language])
+                print(f"{language} 자막을 찾았습니다.")
+            except NoTranscriptFound:
+                # 한국어 자막이 없으면 영어를 가져와 한국어로 번역 요청
+                print(f"{language} 자막이 없어 영어 자막을 번역하여 가져옵니다.")
+                try:
+                    transcript = transcript_list.find_transcript(['en']).translate(language)
+                    print("영어 자막을 한국어로 번역했습니다.")
+                except:
+                    # 번역도 안되면 영어 그대로 사용
+                    transcript = transcript_list.find_transcript(['en'])
+                    print("영어 자막을 사용합니다.")
+            # 3. 자막 데이터 가져오기
+            # fetch()는 FetchedTranscript 객체를 반환하며, 이는 리스트처럼 순회 가능
+            fetched_transcript = transcript.fetch()
+            # 4. 텍스트 및 타임스탬프 추출
+            text_parts = []
+            timeline_data = []
+            for item in fetched_transcript:
+                text_parts.append(item.text)
+                timeline_data.append({
+                    'text': item.text,
+                    'start': item.start,
+                    'duration': item.duration
+                })
+            text = " ".join(text_parts)
+            print(f"자막 가져오기 성공! (총 {len(timeline_data)}개 항목)")
+            return text, timeline_data
+        except TranscriptsDisabled:
+            raise Exception("이 비디오는 자막이 비활성화되어 있습니다.")
+        except NoTranscriptFound:
+            raise Exception("이 비디오에서 사용 가능한 자막을 찾을 수 없습니다.")
+        except (ConnectionError, socket.gaierror, OSError) as e:
+            # 네트워크 오류는 재시도
+            if attempt < max_retries - 1:
+                wait_time = (attempt + 1) * 2
+                print(f"네트워크 오류 발생. {wait_time}초 후 재시도... ({str(e)})")
+                time.sleep(wait_time)
+                continue
+            else:
+                raise Exception(f"자막을 가져오는 중 네트워크 오류 발생 (최대 재시도 초과): {str(e)}")
+        except Exception as e:
+            # 다른 오류는 즉시 실패
+            raise Exception(f"자막을 가져오는 중 오류 발생: {str(e)}")
+    # 모든 재시도 실패
+    raise Exception(f"자막 가져오기 실패: 최대 재시도 횟수({max_retries})를 초과했습니다.")
+def segment_youtube_transcript(text, penalty=5.0, threshold=90, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
+    """
+    유튜브 자막을 주제별로 분리하는 하이브리드 파이프라인
+    Args:
+        text: 분석할 텍스트
+        penalty: ruptures PELT의 페널티 값 (클수록 변화점이 적게 탐지됨)
+        threshold: 의미 유사도 백분위수 임계값 (클수록 더 세밀하게 분리)
+        model_name: 사용할 임베딩 모델
+    """
+    # 1단계: 문장 분리 (빠른 정규식 기반 분리)
+    print("1단계: 문장 분리 중...")
+    sentences = simple_sentence_split(text)
+    print(f"총 {len(sentences)}개의 문장이 감지되었습니다.")
+    if len(sentences) < 2:
+        print("문장이 너무 적어 분리할 수 없습니다.")
+        return [text]
+    # 2단계: 임베딩 (multilingual-MiniLM)
+    t_start_embed = time.time()
+    print("2단계: 문장 임베딩 생성 중...")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"사용 중인 디바이스: {device}")
+    model = SentenceTransformer(model_name, device=device)
+    # GPU 사용 시 FP16으로 메모리 절약
+    if device == "cuda":
+        model.half()
+    # 배치 사이즈 조절하여 메모리 사용량 최적화
+    embeddings = model.encode(sentences, batch_size=32, show_progress_bar=True)
+    print(f"임베딩 생성 완료: {time.time() - t_start_embed:.2f}초")
+    # 3단계: 1차 분리 - 대주제 (ruptures PELT)
+    t_start_pelt = time.time()
+    print("3단계: ruptures PELT를 이용한 대주제 분리 중...")
+    algo = rpt.Pelt(model="rbf").fit(embeddings)
+    breakpoints = algo.predict(pen=penalty)
+    print(f"감지된 대주제 변화점: {len(breakpoints)-1}개")
+    print(f"PELT 처리 완료: {time.time() - t_start_pelt:.2f}초")
+    # 4단계: 2차 분리 - 소주제 (SemanticChunker 방식 적용)
+    t_start_sub = time.time()
+    print("4단계: 의미 유사도 기반 소주제 정밀 분할 중...")
+    final_segments = []
+    start_idx = 0
+    for bp in breakpoints:
+        segment_embeddings = embeddings[start_idx:bp]
+        segment_sentences = sentences[start_idx:bp]
+        if len(segment_sentences) > 1:
+            # 인접 문장 간 유사도 계산
+            sims = []
+            for i in range(len(segment_embeddings) - 1):
+                sim = cosine_similarity([segment_embeddings[i]], [segment_embeddings[i+1]])[0][0]
+                sims.append(sim)
+            # 하위 백분위수(threshold)를 기준으로 분리 지점 결정
+            sub_threshold = np.percentile(sims, 100 - threshold)
+            current_sub_chunk = [segment_sentences[0]]
+            for i, sim in enumerate(sims):
+                if sim < sub_threshold:
+                    final_segments.append(" ".join(current_sub_chunk))
+                    current_sub_chunk = [segment_sentences[i+1]]
+                else:
+                    current_sub_chunk.append(segment_sentences[i+1])
+            final_segments.append(" ".join(current_sub_chunk))
+        else:
+            final_segments.extend(segment_sentences)
+        start_idx = bp
+    print(f"최종 분할 완료: 총 {len(final_segments)}개의 세그먼트")
+    return final_segments
+def process_youtube_video(youtube_url, penalty=5.0, threshold=90, output_dir="."):
+    """
+    유튜브 비디오를 처리하여 세그먼트로 분리 (외부에서 호출 가능)
+    Args:
+        youtube_url: 유튜브 URL
+        penalty: ruptures PELT 페널티 값
+        threshold: 의미 유사도 임계값
+        output_dir: 출력 디렉토리
+    Returns:
+        str: 생성된 JSON 파일 경로
+    """
+    try:
+        # 자막 가져오기
+        print("자막을 가져오는 중...")
+        transcript_text, timeline_data = get_youtube_transcript(youtube_url)
+        print(f"자막 길이: {len(transcript_text)} 글자\n")
+        # 주제 분리 실행
+        print("주제 분리 시작...\n")
+        segments = segment_youtube_transcript(
+            transcript_text,
+            penalty=penalty,
+            threshold=threshold
+        )
+        # 타임스탬프 매핑 함수
+        def find_start_time(segment_text, timeline_data):
+            """
+            세그먼트 텍스트의 첫 부분과 일치하는 타임라인 항목을 찾아 시작 시간 반환
+            """
+            # 세그먼트의 첫 부분 추출
+            segment_start = segment_text.strip()[:100]  # 첫 100자
+            if not segment_start:
+                return 0.0
+            # 전체 타임라인 텍스트 생성 (검색용)
+            full_timeline_text = " ".join([item['text'] for item in timeline_data])
+            # 세그먼트 시작 부분이 전체 텍스트에서 어디에 위치하는지 찾기
+            try:
+                # 첫 30자로 검색 (너무 길면 매칭이 어려울 수 있음)
+                search_text = segment_start[:30].strip()
+                position = full_timeline_text.find(search_text)
+                if position == -1:
+                    # 못 찾으면 더 짧게 시도
+                    search_text = segment_start[:15].strip()
+                    position = full_timeline_text.find(search_text)
+                if position >= 0:
+                    # 해당 위치가 몇 번째 타임라인 항목에 해당하는지 찾기
+                    char_count = 0
+                    for item in timeline_data:
+                        if char_count + len(item['text']) + 1 > position:  # +1은 공백
+                            return item['start']
+                        char_count += len(item['text']) + 1
+            except Exception:
+                pass
+            # 찾지 못한 경우 0 반환
+            return 0.0
+        def format_time(seconds):
+            """
+            초를 HH:MM:SS 형식으로 변환
+            """
+            hours = int(seconds // 3600)
+            minutes = int((seconds % 3600) // 60)
+            secs = int(seconds % 60)
+            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+        # JSON 형식으로 결과 구성 (flat key 구조)
+        result_data = []
+        for i, segment in enumerate(segments):
+            start_time = find_start_time(segment, timeline_data)
+            result_data.append({
+                "url": youtube_url,
+                "chunk_id": f"chunk_{i+1}",
+                "chunk_time": format_time(start_time),
+                "text": segment
+            })
+        # 비디오 ID를 파일명으로 사용
+        video_id = extract_video_id(youtube_url)
+        output_file = os.path.join(output_dir, f"{video_id}.json")
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(result_data, f, ensure_ascii=False, indent=2)
+        print(f"\n결과가 {output_file}에 저장되었습니다.")
+        print(f"총 {len(segments)}개의 세그먼트가 생성되었습니다.\n")
+        return output_file
+    except Exception as e:
+        raise Exception(f"처리 중 오류 발생: {str(e)}")
+def main():
+    """
+    메인 실행 함수
+    """
+    print("=== 유튜브 트랜스크립트 주제 분리 도구 ===\n")
+    # .env에서 API 키 확인
+    youtube_api_key = os.getenv('YOUTUBE_API_KEY')
+    if youtube_api_key:
+        print(f"YOUTUBE_API_KEY 로드 완료: {youtube_api_key[:10]}...\n")
+    else:
+        print("경고: YOUTUBE_API_KEY가 .env 파일에 없습니다.\n")
+    # 유튜브 URL 입력
+    youtube_url = input("유튜브 URL을 입력하세요: ").strip()
+    if not youtube_url:
+        print("URL이 입력되지 않았습니다.")
+        return
+    try:
+        # 자막 가져오기
+        print("\n자막을 가져오는 중...")
+        transcript_text, timeline_data = get_youtube_transcript(youtube_url)
+        print(f"자막 길이: {len(transcript_text)} 글자\n")
+        # 파라미터 설정
+        print("분석 파라미터 설정:")
+        print("Penalty: 대주제 분리 민감도 (작을수록 더 많은 주제로 분리, 권장: 3-7)")
+        penalty_input = input("Penalty 값 (기본값: 5.0): ").strip()
+        penalty = float(penalty_input) if penalty_input else 5.0
+        print("\nThreshold: 소주제 분리 민감도")
+        print("  - 낮을수록(70-80): 더 많은 세그먼트로 분리 (세밀한 분리)")
+        print("  - 중간(85-92): 균형잡힌 분리 (권장)")
+        print("  - 높을수록(95-98): 더 적은 세그먼트로 분리 (큰 덩어리)")
+        threshold_input = input("Threshold 값 (기본값: 90): ").strip()
+        threshold = int(threshold_input) if threshold_input else 90
+        # 주제 분리 실행
+        print("\n주제 분리 시작...\n")
+        segments = segment_youtube_transcript(
+            transcript_text,
+            penalty=penalty,
+            threshold=threshold
+        )
+        # 타임스탬프 매핑 함수
+        def find_start_time(segment_text, timeline_data):
+            """
+            세그먼트 텍스트의 첫 부분과 일치하는 타임라인 항목을 찾아 시작 시간 반환
+            """
+            # 세그먼트의 첫 부분 추출
+            segment_start = segment_text.strip()[:100]  # 첫 100자
+            if not segment_start:
+                return 0.0
+            # 전체 타임라인 텍스트 생성 (검색용)
+            full_timeline_text = " ".join([item['text'] for item in timeline_data])
+            # 세그먼트 시작 부분이 전체 텍스트���서 어디에 위치하는지 찾기
+            try:
+                # 첫 30자로 검색 (너무 길면 매칭이 어려울 수 있음)
+                search_text = segment_start[:30].strip()
+                position = full_timeline_text.find(search_text)
+                if position == -1:
+                    # 못 찾으면 더 짧게 시도
+                    search_text = segment_start[:15].strip()
+                    position = full_timeline_text.find(search_text)
+                if position >= 0:
+                    # 해당 위치가 몇 번째 타임라인 항목에 해당하는지 찾기
+                    char_count = 0
+                    for item in timeline_data:
+                        if char_count + len(item['text']) + 1 > position:  # +1은 공백
+                            return item['start']
+                        char_count += len(item['text']) + 1
+            except Exception:
+                pass
+            # 찾지 못한 경우 0 반환
+            return 0.0
+        def format_time(seconds):
+            """
+            초를 HH:MM:SS 형식으로 변환
+            """
+            hours = int(seconds // 3600)
+            minutes = int((seconds % 3600) // 60)
+            secs = int(seconds % 60)
+            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+        # JSON 형식으로 결과 구성 (flat key 구조)
+        result_data = []
+        for i, segment in enumerate(segments):
+            start_time = find_start_time(segment, timeline_data)
+            result_data.append({
+                "url": youtube_url,
+                "chunk_id": f"chunk_{i+1}",
+                "chunk_time": format_time(start_time),
+                "text": segment
+            })
+        # 비디오 ID를 파일명으로 사용
+        video_id = extract_video_id(youtube_url)
+        output_file = f"{video_id}.json"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(result_data, f, ensure_ascii=False, indent=2)
+        print(f"\n결과가 {output_file}에 저장되었습니다.")
+        print(f"총 {len(segments)}개의 세그먼트가 생성되었습니다.\n")
+        # 결과 미리보기
+        print("=" * 80)
+        print("결과 미리보기 (첫 3개 세그먼트)")
+        print("=" * 80)
+        for item in result_data[:3]:
+            print(f"\nChunk ID: {item['chunk_id']}")
+            print(f"URL: {item['url']}")
+            print(f"시작 시간: {item['chunk_time']}")
+            print(f"Text: {item['text'][:200]}..." if len(item['text']) > 200 else f"Text: {item['text']}")
+            print("-" * 80)
+    except Exception as e:
+        print(f"\n오류 발생: {str(e)}")
+# Run the interactive tool only when this file is executed as a script
+if __name__ == "__main__":
+    main()