import os import re import pickle import webvtt import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer # 1) Video ID video_id_map = { 'machine learning what is machine learning introduction to machine learning 2024 simplilearn': 'ukzFI9rgwfU', 'introduction to artificial intelligence what is ai artificial intelligence tutorial simplilearn': 'SSE4M0gcmvE', 'what is deep learning introduction to deep learning deep learning tutorial simplilearn': 'FbxTVRfQFuI' } # 2) File list videos = [ ( 'Machine Learning What Is Machine Learning Introduction To Machine Learning 2024 Simplilearn', 'Machine Learning.vtt' ), ( 'Introduction To Artificial Intelligence What Is AI Artificial Intelligence Tutorial Simplilearn', 'Artificial Intelligence.vtt' ), ( 'What is Deep Learning Introduction to Deep Learning Deep Learning Tutorial Simplilearn', 'Deep Learning.vtt' ) ] _space_re = re.compile(r'\s+') def normalize_minimal(text: str) -> str: if not isinstance(text, str): return '' t = text.lower() t = _space_re.sub(' ', t).strip() return t # DF (with YT ID) all_subtitles = [] for raw_title, file_path in videos: cleaned_title = re.sub(r'\s+', ' ', raw_title).strip().lower() youtube_id = video_id_map.get(cleaned_title, '') if not youtube_id: print(f"⚠️ YouTube ID not found for title: {cleaned_title}") continue for caption in webvtt.read(file_path): start = caption.start end = caption.end text_raw = caption.text or '' all_subtitles.append([start, end, text_raw, cleaned_title, youtube_id]) # DF with RAW text df = pd.DataFrame( all_subtitles, columns=['Start Time', 'End Time', 'Subtitle Text Raw', 'Video Title', 'YouTube ID'] ) # CLEAN column for indexing df['Subtitle Text Clean'] = df['Subtitle Text Raw'].apply(normalize_minimal) df['Subtitle Text'] = df['Subtitle Text Clean'] before_drop = len(df) df = df[df['Subtitle Text Clean'].str.len() > 0].copy() after_drop = len(df) df.sort_values(by=['YouTube ID', 'Start Time'], inplace=True, ignore_index=True) df.to_csv('cleaned_subtitles.csv', index=False) # 5) TF-IDF fit vectorizer = TfidfVectorizer( ngram_range=(1, 2), sublinear_tf=True, lowercase=True, min_df=1 ) tfidf_matrix = vectorizer.fit_transform(df['Subtitle Text Clean']) with open('tfidf_vectorizer.pkl', 'wb') as f: pickle.dump(vectorizer, f) with open('tfidf_matrix.pkl', 'wb') as f: pickle.dump(tfidf_matrix, f) # 6) Export metadata metadata = df[['Subtitle Text Raw', 'Subtitle Text Clean', 'Subtitle Text', 'Start Time', 'End Time', 'Video Title', 'YouTube ID']] metadata.to_csv('indexed_metadata.csv', index=False) print(" Files saved.") print(f" Rows read: {len(all_subtitles)}") print(f" Rows after clean: {after_drop} (dropped {before_drop - after_drop} empty rows)") print(f" Unique videos: {df['YouTube ID'].nunique()}") print(f" TF-IDF shape: {tfidf_matrix.shape}")