Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_huggingface import HuggingFaceEndpoint | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from pytube import YouTube | |
| import os | |
| from dotenv import load_dotenv | |
# Load environment variables from a local .env file; HF_TOKEN is required
# to authenticate against the Hugging Face Inference API.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Fail fast with a clear message instead of the opaque AttributeError
    # that HF_TOKEN.strip() would raise on None.
    raise RuntimeError("HF_TOKEN environment variable is not set.")

# Hosted Mistral-7B-Instruct endpoint used to answer questions.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=HF_TOKEN.strip(),  # strip stray whitespace/newlines
    temperature=0.7,      # moderately creative answers
    max_new_tokens=500,   # cap on generated answer length
)

# Sentence-embedding model (all-mpnet-base-v2 by default) for the vector store.
embeddings = HuggingFaceEmbeddings()
def fetch_youtube_transcript(video_url):
    """Fetch the English transcript of a YouTube video.

    Args:
        video_url: Full YouTube video URL.

    Returns:
        The transcript as newline-joined caption lines, or ``None`` when the
        transcript cannot be fetched. Returning None (instead of the old
        error STRING, which was truthy) lets callers detect failure with a
        simple falsy check such as ``if not transcript_text``.
    """
    try:
        video_id = YouTube(video_url).video_id
        captions = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        return '\n'.join(caption['text'] for caption in captions)
    except Exception as e:
        # Keep the error visible for debugging, but signal failure via None.
        print(f"Error fetching YouTube transcript: {e}")
        return None
def create_chunks(transcript_text):
    """Split a transcript into overlapping ~1000-character document chunks.

    Args:
        transcript_text: Raw transcript string.

    Returns:
        A list of Document chunks, or ``None`` when no text was supplied.
    """
    if not transcript_text:
        return None
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,      # small overlap keeps sentence context across chunks
        length_function=len,
    )
    return splitter.create_documents([transcript_text])
def embed_store(chunks):
    """Embed transcript chunks and return a Chroma vector store over them.

    A fresh collection is built on every call so that chunks from previously
    processed videos cannot leak into the current video's similarity search
    (the old shared ``persist_directory`` accumulated every video's
    embeddings across runs). This also drops the deprecated
    ``vectordb.persist()`` call and the redundant re-open of the store.

    Args:
        chunks: List of Document chunks produced by ``create_chunks``.

    Returns:
        A Chroma vector store ready for similarity search.
    """
    return Chroma.from_documents(documents=chunks, embedding=embeddings)
def retriever(vectordb, prompt):
    """Return the text of the chunk most similar to *prompt*.

    Args:
        vectordb: Vector store exposing ``similarity_search``.
        prompt: User question to search for.

    Returns:
        The top match's page content, or a fixed fallback message when the
        search yields no documents.
    """
    matches = vectordb.similarity_search(prompt)
    if not matches:
        return "No relevant documents found."
    return matches[0].page_content
def get_llm_response(text, prompt):
    """Ask the LLM *prompt* using *text* as retrieved context.

    Args:
        text: Context passage retrieved from the vector store.
        prompt: User question.

    Returns:
        The model's answer string, or a fallback message when either the
        context or the prompt is missing.
    """
    if not (text and prompt):
        return "No video found or error occurred."
    input_prompt = f"Context: {text}\n\nQuestion: {prompt}\n\nAnswer:"
    # .invoke() is the supported Runnable entry point; calling the LLM
    # object directly (llm(prompt)) is deprecated in recent LangChain.
    return llm.invoke(input_prompt)
def chat_with_video(video_url, prompt):
    """Gradio handler: answer *prompt* about the YouTube video at *video_url*.

    Pipeline: fetch transcript -> chunk -> embed/store -> retrieve context
    -> generate answer. Each stage short-circuits with a user-facing error
    message on failure.
    """
    if not (video_url and prompt):
        return "Please provide both a video URL and a question."

    transcript_text = fetch_youtube_transcript(video_url)
    if not transcript_text:
        return "Failed to retrieve transcript."

    chunks = create_chunks(transcript_text)
    if not chunks:
        return "Error splitting transcript into chunks."

    vectordb = embed_store(chunks)
    context = retriever(vectordb, prompt)
    return get_llm_response(context, prompt)
# Gradio UI: a video URL and a question in, the generated answer text out.
iface = gr.Interface(
    fn=chat_with_video,
    inputs=[
        gr.Textbox(label="YouTube Video URL"),
        gr.Textbox(label="Ask any question about the YouTube Video"),
    ],
    outputs="text",
    title="YouTube Video Q&A with Hugging Face",
    description="Ask questions about a YouTube video using embeddings and Hugging Face LLM.",
)

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()