emaaaa543 committed on
Commit
babad83
·
verified ·
1 Parent(s): 91906c1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -0
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os

import gradio as gr
import tiktoken
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import YoutubeLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+ groq_api_key = os.getenv("GROQ_API_KEY")
15
+
16
+ # Initialize Hugging Face embeddings
17
+ hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
18
+
19
+ # Initialize ChromaDB vector store
20
+ vector_store = Chroma(
21
+ collection_name="data_collection",
22
+ embedding_function=hf_embeddings,
23
+ )
24
+
25
+ # Define function to split transcripts into chunks
26
+ def split_transcript(transcript, max_chunk_size=10000):
27
+ chunks = []
28
+ current_chunk = ""
29
+ for line in transcript.split("\n"):
30
+ if len(current_chunk) + len(line) > max_chunk_size:
31
+ chunks.append(current_chunk)
32
+ current_chunk = line
33
+ else:
34
+ current_chunk += "\n" + line
35
+ if current_chunk:
36
+ chunks.append(current_chunk)
37
+ return chunks
38
+
39
+ # Load and process YouTube video
40
+ loader = YoutubeLoader.from_youtube_url("https://youtu.be/9UTQd3Oo6Kw?si=xJ9rM3gK4ERTH9c5", add_video_info=True)
41
+ transcript = loader.load() # Assume this loads the transcript
42
+ data = split_transcript(transcript)
43
+
44
+ tokenizer = tiktoken.get_encoding('p50k_base')
45
+
46
+ def tiktoken_len(text):
47
+ tokens = tokenizer.encode(
48
+ text,
49
+ disallowed_special=()
50
+ )
51
+ return len(tokens)
52
+
53
+ # Initialize text splitter
54
+ text_splitter = RecursiveCharacterTextSplitter(
55
+ chunk_size=2000,
56
+ chunk_overlap=100,
57
+ length_function=tiktoken_len,
58
+ separators=["\n\n", "\n", " ", ""]
59
+ )
60
+
61
+ # Split data from YouTube video
62
+ texts = text_splitter.split_documents(data)
63
+
64
+ # Store documents in ChromaDB
65
+ documents = [
66
+ Document(
67
+ page_content=f"Source: {t.metadata['source']}, Title: {t.metadata['title']} \n\nContent: {t.page_content}",
68
+ metadata=t.metadata
69
+ )
70
+ for t in texts
71
+ ]
72
+ vector_store.add_documents(documents=documents)
73
+
74
+ # Define function to get embeddings from Hugging Face
75
+ def get_embedding(text):
76
+ return hf_embeddings.embed_query(text)
77
+
78
+ # Define Gradio interface function
79
+ def query_model(user_input):
80
+ try:
81
+ # Call the function for user query vector embeddings
82
+ raw_query_embedding = get_embedding(user_input)
83
+
84
+ # Perform similarity search with vector store
85
+ results = vector_store.similarity_search_by_vector(
86
+ embedding=raw_query_embedding, k=1
87
+ )
88
+
89
+ contexts = [doc.page_content for doc in results]
90
+
91
+ # Prepare context for RAG
92
+ augmented_query = (
93
+ "<CONTEXT>\n" +
94
+ "\n\n-------\n\n".join(contexts) +
95
+ "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" +
96
+ user_input
97
+ )
98
+
99
+ # Call to Groq or Hugging Face model for completion
100
+ response = client.chat.completions.create(
101
+ model="llama3-8b-8192",
102
+ messages=[
103
+ {"role": "system", "content": primer},
104
+ {"role": "user", "content": augmented_query},
105
+ ],
106
+ max_tokens=1000,
107
+ temperature=1.2)
108
+
109
+ return {'assistantMessage':response.choices[0].message.content}
110
+
111
+ except Exception as e:
112
+ return str(e)
113
+
114
+ # Create Gradio interface
115
+ iface = gr.Interface(
116
+ fn=query_model,
117
+ inputs="text",
118
+ outputs="text",
119
+ title="RAG Model",
120
+ description="Retrieve and Generate responses from a YouTube video transcript."
121
+ )
122
+
123
+ if __name__ == "__main__":
124
+ iface.launch()