Spaces:
Sleeping
Sleeping
| import re | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
# An 11-character video ID introduced by "v=" or "/" and NOT followed by
# another ID character. The trailing lookahead makes longer tokens (host
# names such as "youtube-nocookie", channel IDs, long path segments) get
# skipped instead of being truncated to their first 11 characters.
_VIDEO_ID_PATTERN = re.compile(r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])")


def extract_video_id(url: str) -> str:
    """Return the 11-character video ID embedded in a YouTube URL.

    The ID is the first run of exactly 11 characters from ``[0-9A-Za-z_-]``
    that directly follows ``v=`` or ``/`` (covering ``watch?v=<id>``,
    ``youtu.be/<id>``, ``/embed/<id>`` and ``/shorts/<id>`` style links).

    Args:
        url: The URL text to search.

    Returns:
        The matched 11-character video ID.

    Raises:
        ValueError: If no such ID is present in ``url``.
    """
    match = _VIDEO_ID_PATTERN.search(url)
    if match is None:
        raise ValueError("Invalid YouTube URL")
    return match.group(1)
def create_vector_store(text: str) -> Chroma:
    """Chunk ``text`` and index the chunks in a new Chroma vector store.

    The text is split with a ``RecursiveCharacterTextSplitter`` configured
    with ``chunk_size=500`` and ``chunk_overlap=100``; the resulting chunks
    are embedded with a default-constructed ``OpenAIEmbeddings`` instance.

    Args:
        text: The full text to split and index.

    Returns:
        The ``Chroma`` store produced by ``Chroma.from_texts``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_text(text)
    # NOTE(review): OpenAIEmbeddings() is built with no arguments, so it
    # presumably picks up credentials from the environment -- confirm.
    return Chroma.from_texts(chunks, OpenAIEmbeddings())
def get_transcript(video_id: str) -> str:
    """Fetch a video's transcript and return it as one space-joined string.

    Args:
        video_id: The video identifier passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry, joined with single
        spaces in the order the API returned them.
    """
    segments = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(segment["text"] for segment in segments)