# youtube-chatbot / app.py
# Hugging Face Space by pratikshahp (commit 19a11d0, verified).
import gradio as gr
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain_text_splitters import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import os
from dotenv import load_dotenv
# Load environment variables (expects HF_TOKEN in .env or the process environment).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
# Fail fast with a clear message: the original called HF_TOKEN.strip() on the
# raw getenv() result, which raises an opaque AttributeError when the variable
# is unset.
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN is not set; add it to the environment or a .env file.")

# Hugging Face LLM: Mistral-7B-Instruct served via the Inference Endpoint API.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.7,      # moderately creative answers
    max_new_tokens=500,   # cap on generated answer length
)

# Sentence embeddings; HuggingFaceEmbeddings defaults to all-mpnet-base-v2.
embeddings = HuggingFaceEmbeddings()
def fetch_youtube_transcript(video_url):
    """Fetch the English transcript of a YouTube video as newline-joined text.

    Args:
        video_url: Any URL form pytube's YouTube() accepts.

    Returns:
        The transcript text, or None on any failure. (The original returned
        the error message *string*, which is truthy, so the caller's
        `if not transcript_text` check passed and the error message itself
        was chunked and embedded as if it were a transcript.)
    """
    try:
        yt = YouTube(video_url)
        captions = YouTubeTranscriptApi.get_transcript(yt.video_id, languages=['en'])
        return '\n'.join(caption['text'] for caption in captions)
    except Exception as e:
        # Best-effort: log and signal failure to the caller via None.
        print(f"Error fetching YouTube transcript: {e}")
        return None
def create_chunks(transcript_text):
    """Split the transcript into overlapping chunks for embedding.

    Returns a list of Documents (~1000 chars each, 20-char overlap),
    or None when the input is empty/falsy.
    """
    if not transcript_text:
        return None
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    return splitter.create_documents([transcript_text])
def embed_store(chunks):
    """Embed the transcript chunks into an on-disk Chroma vector store.

    Args:
        chunks: List of Documents produced by create_chunks().

    Returns:
        A Chroma vector store ready for similarity search.
    """
    persist_directory = 'youtube_embeddings'
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()  # flush to disk (no-op on Chroma >= 0.4, which auto-persists)
    # NOTE(review): the original discarded this instance and re-opened the store
    # from disk; from_documents() already returns a usable store, so the
    # redundant reload was dropped.
    return vectordb
def retriever(vectordb, prompt):
    """Return the text of the chunk most similar to *prompt*.

    Falls back to a fixed message when the search yields no documents.
    """
    matches = vectordb.similarity_search(prompt)
    return matches[0].page_content if matches else "No relevant documents found."
def get_llm_response(text, prompt):
    """Answer *prompt* with the LLM, grounded in the retrieved context *text*.

    Returns the model's answer string, or a fixed message when either
    argument is empty/falsy.
    """
    if not (text and prompt):
        return "No video found or error occurred."
    input_prompt = f"Context: {text}\n\nQuestion: {prompt}\n\nAnswer:"
    # invoke() is the current LangChain Runnable API; calling llm(...) directly
    # goes through the deprecated __call__ path and emits a warning.
    return llm.invoke(input_prompt)
def chat_with_video(video_url, prompt):
    """End-to-end Q&A pipeline: transcript -> chunks -> embeddings -> retrieval -> answer.

    Returns either the model's answer or a user-facing error message.
    """
    # Guard clause: both inputs are required.
    if not (video_url and prompt):
        return "Please provide both a video URL and a question."

    transcript_text = fetch_youtube_transcript(video_url)
    if not transcript_text:
        return "Failed to retrieve transcript."

    chunks = create_chunks(transcript_text)
    if not chunks:
        return "Error splitting transcript into chunks."

    vectordb = embed_store(chunks)
    context = retriever(vectordb, prompt)
    return get_llm_response(context, prompt)
# Gradio UI wiring: two text inputs feed chat_with_video; output is plain text.
video_input = gr.Textbox(label="YouTube Video URL")
question_input = gr.Textbox(label="Ask any question about the YouTube Video")

iface = gr.Interface(
    fn=chat_with_video,
    inputs=[video_input, question_input],
    outputs="text",
    title="YouTube Video Q&A with Hugging Face",
    description="Ask questions about a YouTube video using embeddings and Hugging Face LLM.",
)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()