yt_linkedin_post / utils.py
T-K-O-H
1st commit is worst commit
d356ba7
raw
history blame contribute delete
897 Bytes
import re
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
def extract_video_id(url: str) -> str:
    """Return the 11-character YouTube video ID found in *url*.

    The URL is first searched for an ID sitting directly after one of the
    markers that normally precede it: a ``v=`` query parameter, a
    ``youtu.be/`` host, or an ``/embed/``, ``/shorts/``, ``/live/`` or
    ``/v/`` path segment. Only if none of those match does the function
    fall back to the original, looser rule (any ``v=`` or ``/`` followed by
    11 ID characters), so inputs that were accepted before still resolve.

    Args:
        url: Text expected to contain a YouTube video link.

    Returns:
        The 11-character token captured from *url*.

    Raises:
        ValueError: If neither the anchored nor the loose pattern matches.
    """
    # Anchored pass: without it, the loose rule below latches onto the first
    # 11 URL-safe characters after *any* slash, e.g. "youtube-noc" in
    # "https://youtube-nocookie.com/embed/<id>". The negative lookahead
    # rejects tokens longer than 11 characters so they are not silently
    # truncated by this pass.
    anchored = re.search(
        r"(?:[?&]v=|youtu\.be/|/(?:embed|shorts|live|v)/)"
        r"([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])",
        url,
    )
    if anchored:
        return anchored.group(1)

    # Legacy fallback, kept for backward compatibility with URL shapes the
    # anchored pass does not enumerate.
    loose = re.search(r"(?:v=|/)([0-9A-Za-z_-]{11})", url)
    if loose:
        return loose.group(1)

    raise ValueError("Invalid YouTube URL")
def create_vector_store(text: str) -> Chroma:
    """Split *text* into overlapping chunks and index them in a Chroma store.

    The text is cut by a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=500`` and ``chunk_overlap=100``; the resulting chunks are
    embedded with a default-constructed ``OpenAIEmbeddings`` and handed to
    ``Chroma.from_texts``.

    Args:
        text: The raw text to index.

    Returns:
        The ``Chroma`` instance produced by ``Chroma.from_texts``.
    """
    # NOTE(review): no persist directory or collection name is passed, so the
    # store presumably uses Chroma's defaults — confirm if persistence matters.
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    ).split_text(text)

    # Built after splitting, matching the original order of construction.
    embedding_model = OpenAIEmbeddings()
    return Chroma.from_texts(chunks, embedding_model)
def get_transcript(video_id: str) -> str:
    """Fetch the transcript for *video_id* and return it as one string.

    Calls ``YouTubeTranscriptApi.get_transcript`` and joins the ``"text"``
    field of every returned entry with single spaces.

    Args:
        video_id: Identifier passed straight to the transcript API.

    Returns:
        All transcript entry texts, space-separated, in the order returned.
    """
    # Errors raised by the transcript API are not caught here.
    segments = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(segment["text"] for segment in segments)