Spaces: Runtime error
Create app.py
app.py ADDED
@@ -0,0 +1,124 @@
import gradio as gr
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import tiktoken
import os
from dotenv import load_dotenv
from groq import Groq  # chat-completions client used in query_model below

# Load environment variables
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
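# GROQ_API_KEY can be supplied locally via a .env file (e.g. GROQ_API_KEY=<your-groq-api-key>)
# or, when running as a Hugging Face Space, as a Space secret with the same name.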

# Create the Groq client that query_model uses for chat completions
client = Groq(api_key=groq_api_key)

# Initialize Hugging Face embeddings
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize ChromaDB vector store
vector_store = Chroma(
    collection_name="data_collection",
    embedding_function=hf_embeddings,
)
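# Note: no persist_directory is configured, so the collection is held in memory
# and the transcript is re-embedded each time the app starts.
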
# Define function to split transcripts into chunks
def split_transcript(transcript, max_chunk_size=10000):
    chunks = []
    current_chunk = ""
    for line in transcript.split("\n"):
        if len(current_chunk) + len(line) > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = line
        else:
            current_chunk += "\n" + line
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Load the YouTube transcript; add_video_info=True needs the pytube package and fills metadata such as 'title'
loader = YoutubeLoader.from_youtube_url("https://youtu.be/9UTQd3Oo6Kw?si=xJ9rM3gK4ERTH9c5", add_video_info=True)
transcript_docs = loader.load()  # a list of Documents

# Pre-chunk each document's text with split_transcript, keeping the original metadata
data = [
    Document(page_content=chunk, metadata=doc.metadata)
    for doc in transcript_docs
    for chunk in split_transcript(doc.page_content)
]

tokenizer = tiktoken.get_encoding('p50k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Split data from YouTube video
texts = text_splitter.split_documents(data)

# Store documents in ChromaDB
documents = [
    Document(
        page_content=f"Source: {t.metadata['source']}, Title: {t.metadata['title']} \n\nContent: {t.page_content}",
        metadata=t.metadata
    )
    for t in texts
]
vector_store.add_documents(documents=documents)

# Define function to get embeddings from Hugging Face
def get_embedding(text):
    return hf_embeddings.embed_query(text)

# System prompt used for the chat completion; this generic wording is an assumed placeholder
primer = (
    "You are a helpful assistant. Answer the user's question using only the "
    "provided CONTEXT. If the context does not contain the answer, say so."
)

# Define Gradio interface function
def query_model(user_input):
    try:
        # Embed the user query
        raw_query_embedding = get_embedding(user_input)

        # Perform similarity search with vector store
        results = vector_store.similarity_search_by_vector(
            embedding=raw_query_embedding, k=1
        )

        contexts = [doc.page_content for doc in results]

        # Prepare context for RAG
        augmented_query = (
            "<CONTEXT>\n" +
            "\n\n-------\n\n".join(contexts) +
            "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" +
            user_input
        )

        # Call the Groq chat model for the completion
        response = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": primer},
                {"role": "user", "content": augmented_query},
            ],
            max_tokens=1000,
            temperature=1.2,
        )

        # Gradio's "text" output expects a plain string
        return response.choices[0].message.content

    except Exception as e:
        return str(e)

# Create Gradio interface
iface = gr.Interface(
    fn=query_model,
    inputs="text",
    outputs="text",
    title="RAG Model",
    description="Retrieve and Generate responses from a YouTube video transcript."
)

if __name__ == "__main__":
    iface.launch()
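
If the Space does not already include one, it also needs a requirements.txt so the build can install the imported packages. A minimal, unpinned sketch based on the imports in app.py (package names are the usual PyPI ones; versions are not specified in this commit):

gradio
langchain
langchain-community
langchain-core
langchain-huggingface
langchain-chroma
sentence-transformers
tiktoken
python-dotenv
groq
youtube-transcript-api
pytube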