Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain_core.documents import Document | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from langchain_community.document_loaders import YoutubeLoader | |
| from langchain_community.document_loaders import GoogleApiYoutubeLoader | |
| import tiktoken | |
| import os | |
| from dotenv import load_dotenv | |
| import json | |
| from groq import Groq | |
| from pydantic import BaseModel | |
| from typing import List | |
# Load configuration from .env and identify this app to remote services.
load_dotenv()
os.environ["USER_AGENT"] = "RAG-chat-app"

# Groq client authenticated via the GROQ_API_KEY environment variable.
groq_api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=groq_api_key)
# System prompt: scope the assistant to the indexed YouTube video.
# Plain string literal — the original used an f-string with no
# placeholders (needless `f` prefix, ruff F541); text is unchanged.
primer = """You are a personal assistant. Answer any questions I have about the Youtube Video provided.
Translate in specific language if user asks you to
"""
# Sentence-transformer model used to embed both the indexed chunks and
# incoming queries (must be the same model for similarity search to work).
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Chroma collection that will hold the transcript chunks.
vector_store = Chroma(
    collection_name="data_collection",
    embedding_function=hf_embeddings,
)
# Load the transcript for the target YouTube video.
# (Removed a commented-out duplicate of these exact two lines.)
# NOTE(review): add_video_info=True depends on pytube scraping and is
# known to break when YouTube changes its pages — confirm it still works;
# downstream code reads metadata['source'] and metadata['title'].
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=e-gwvmhyU7A", add_video_info=True
)
data = loader.load()
# p50k_base BPE encoding, used only to measure chunk sizes in tokens.
tokenizer = tiktoken.get_encoding('p50k_base')


def tiktoken_len(text):
    """Return the number of p50k_base tokens in *text*.

    Passed to the text splitter as its length function so chunk limits
    are counted in tokens rather than characters.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
# Chunk the transcript into ~2000-token pieces with a 100-token overlap,
# preferring paragraph, then line, then word boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)
texts = text_splitter.split_documents(data)
# Prefix each chunk with its source URL and title so the model can see
# where the context came from, then index everything in Chroma.
# Robustness fix: metadata.get(...) instead of direct indexing — when
# video-info scraping fails the 'source'/'title' keys may be absent and
# t.metadata['title'] raised KeyError before anything was indexed.
documents = [
    Document(
        page_content=(
            f"Source: {t.metadata.get('source', 'unknown')}, "
            f"Title: {t.metadata.get('title', 'unknown')} "
            f"\n\nContent: {t.page_content}"
        ),
        metadata=t.metadata,
    )
    for t in texts
]
# add_documents returns the list of ids assigned to the stored chunks.
vectorstore_from_texts = vector_store.add_documents(documents=documents)
def get_embedding(text):
    """Embed a query string with the shared HuggingFace embedding model.

    Thin wrapper so the query path is guaranteed to use the same model
    that indexed the transcript chunks.
    """
    vector = hf_embeddings.embed_query(text)
    return vector
def query_model(messages):
    """Answer the latest user message using RAG over the indexed transcript.

    Args:
        messages: Chat history as a non-empty list of
            {"role": ..., "content": ...} dicts; only the last message's
            content is used as the query.

    Returns:
        {'assistantMessage': <completion text>} on success, or an error
        string (rendered directly in the Gradio textbox) on failure.
    """
    try:
        # Guard clause: reject anything that is not a non-empty list.
        if isinstance(messages, list) and messages:
            latest_message = messages[-1]['content']
        else:
            return "No messages provided or invalid format."

        # Embed the query with the same model used to index the chunks.
        raw_query_embedding = get_embedding(latest_message)

        # Retrieve the single most similar transcript chunk (k=1).
        results = vector_store.similarity_search_by_vector(
            embedding=raw_query_embedding, k=1
        )
        contexts = [doc.page_content for doc in results]

        # Build the context-augmented prompt.
        # BUG FIX: the original concatenated the whole `messages` list
        # here, raising TypeError on every request that reached this
        # point; the query text is `latest_message`.
        augmented_query = (
            "<CONTEXT>\n" +
            "\n\n-------\n\n".join(contexts) +
            "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" +
            latest_message
        )

        # Ask the LLM, grounded by the retrieved context.
        response = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": primer},
                {"role": "user", "content": augmented_query},
            ],
            max_tokens=1000,
            temperature=1.2,
        )
        return {'assistantMessage': response.choices[0].message.content}
    except Exception as e:
        # Boundary catch: surface the error text in the UI instead of
        # crashing the Gradio worker.
        return str(e)
# Wire the query function into a simple JSON-in / text-out Gradio UI.
iface = gr.Interface(
    fn=query_model,
    inputs=gr.JSON(label="Enter array of messages (JSON format)"),
    outputs=gr.Textbox(label="Response"),
    title="RAG Model",
    description="Retrieve and Generate responses from a YouTube video transcript.",
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()