emaaaa543 committed on
Commit
6453a05
·
verified ·
1 Parent(s): 707bcd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -19
app.py CHANGED
@@ -1,10 +1,11 @@
1
  import gradio as gr
2
- from langchain_community.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, WebBaseLoader, YoutubeLoader
3
  from langchain_text_splitters import RecursiveCharacterTextSplitter
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
  from langchain_core.documents import Document
7
  from youtube_transcript_api import YouTubeTranscriptApi
 
 
8
  import tiktoken
9
  import os
10
  from dotenv import load_dotenv
@@ -37,18 +38,8 @@ vector_store = Chroma(
37
  # loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=e-gwvmhyU7A", add_video_info=True)
38
  # data = loader.load() # Assume this loads the transcript
39
 
40
- def load_youtube_data(video_id):
41
- try:
42
- transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
43
- data = "\n".join([entry['text'] for entry in transcript_data])
44
- return data
45
- except Exception as e:
46
- return str(e)
47
-
48
- video_id = "e-gwvmhyU7A" # Extract the video ID from the YouTube URL
49
- data = load_youtube_data(video_id)
50
- if isinstance(data, str): # If error occurred
51
- print(f"Error loading YouTube data: {data}")
52
 
53
 
54
 
@@ -73,14 +64,14 @@ text_splitter = RecursiveCharacterTextSplitter(
73
  texts = text_splitter.split_documents(data)
74
 
75
  # Store documents in ChromaDB
76
- documents = [
77
  Document(
78
  page_content=f"Source: {t.metadata['source']}, Title: {t.metadata['title']} \n\nContent: {t.page_content}",
79
- metadata=t.metadata
80
- )
81
- for t in texts
82
- ]
83
- vector_store.add_documents(documents=documents)
84
 
85
  # Define function to get embeddings from Hugging Face
86
  def get_embedding(text):
 
1
  import gradio as gr
 
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
  from langchain_core.documents import Document
6
  from youtube_transcript_api import YouTubeTranscriptApi
7
+ from langchain_community.document_loaders import YoutubeLoader
8
+ from langchain_community.document_loaders import GoogleApiYoutubeLoader
9
  import tiktoken
10
  import os
11
  from dotenv import load_dotenv
 
38
  # loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=e-gwvmhyU7A", add_video_info=True)
39
  # data = loader.load() # Assume this loads the transcript
40
 
41
+ loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=e-gwvmhyU7A", add_video_info=True)
42
+ data = loader.load()
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
 
 
64
  texts = text_splitter.split_documents(data)
65
 
66
  # Store documents in ChromaDB
67
+ documents= [
68
  Document(
69
  page_content=f"Source: {t.metadata['source']}, Title: {t.metadata['title']} \n\nContent: {t.page_content}",
70
+ metadata=t.metadata
71
+ )
72
+ for t in texts]
73
+
74
+ vectorstore_from_texts = vector_store.add_documents(documents=documents)
75
 
76
  # Define function to get embeddings from Hugging Face
77
  def get_embedding(text):