"""Helpers for turning a YouTube video into a searchable vector store."""

import re
from urllib.parse import urlsplit

from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# A video ID is exactly 11 characters from this alphabet. The negative
# lookahead rejects longer tokens instead of silently truncating them.
_ID = r"([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])"

# ``v=<id>`` as a real parameter: at the start of the query string or right
# after a ``?``/``&``/``;`` delimiter, so e.g. ``xv=...`` is not accepted.
_QUERY_ID_RE = re.compile(r"(?:^|[?&;])v=" + _ID)

# ``/<id>`` as a path segment (youtu.be/<id>, /embed/<id>, /shorts/<id>, ...).
_PATH_ID_RE = re.compile(r"/" + _ID)


def extract_video_id(url: str) -> str:
    """Return the 11-character video ID contained in a YouTube URL.

    The URL is split into components first so that the host name is never
    searched: the previous whole-string regex returned ``"youtube-noc"`` for
    ``https://youtube-nocookie.com/embed/<id>`` and ``"watch_popup"`` for
    ``https://www.youtube.com/watch_popup?v=<id>``.

    Lookup order:

    1. a ``v=`` parameter in the query string,
    2. an 11-character path segment,
    3. a ``v=`` parameter inside the fragment (``#/watch?v=<id>``).

    Args:
        url: The URL to inspect. Surrounding whitespace is ignored.

    Returns:
        The video ID.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    try:
        parts = urlsplit(url.strip())
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. a broken IPv6 host);
        # surface them with the same message as every other failure.
        raise ValueError("Invalid YouTube URL") from None

    # The explicit query parameter is checked before the path because path
    # words can themselves be 11 ID-like characters long ("watch_popup").
    candidates = (
        (_QUERY_ID_RE, parts.query),
        (_PATH_ID_RE, parts.path),
        (_QUERY_ID_RE, parts.fragment),
    )
    for pattern, component in candidates:
        found = pattern.search(component)
        if found:
            return found.group(1)

    raise ValueError("Invalid YouTube URL")


def create_vector_store(
    text: str,
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> Chroma:
    """Split ``text`` into overlapping chunks and index them in Chroma.

    Args:
        text: The text to index.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as the
            maximum chunk size.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as the
            overlap between consecutive chunks.

    Returns:
        The ``Chroma`` store built by ``Chroma.from_texts`` from the chunks,
        embedded with a default-constructed ``OpenAIEmbeddings``.
    """
    # NOTE(review): OpenAIEmbeddings() is built with no arguments, so it
    # presumably takes its credentials from the environment - confirm.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_text(text)
    return Chroma.from_texts(chunks, OpenAIEmbeddings())


def get_transcript(video_id: str) -> str:
    """Fetch a video's transcript and return it as one string.

    Args:
        video_id: The video ID passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry, joined with single
        spaces.
    """
    # NOTE(review): newer youtube-transcript-api releases appear to replace
    # this static call with an instance-level fetch API - confirm against
    # the version pinned for this project before upgrading.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(entry["text"] for entry in transcript)