Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -93,8 +93,11 @@ def create_vector_store(transcript):
|
|
| 93 |
)
|
| 94 |
return FAISS.from_documents(docs, embeddings)
|
| 95 |
|
| 96 |
-
#
|
|
|
|
|
|
|
| 97 |
def build_model(model_choice, temperature=0.7):
|
|
|
|
| 98 |
if model_choice == "Flan-T5 (Free)":
|
| 99 |
llm = HuggingFaceEndpoint(
|
| 100 |
repo_id="google/flan-t5-base",
|
|
@@ -102,7 +105,8 @@ def build_model(model_choice, temperature=0.7):
|
|
| 102 |
max_new_tokens=500,
|
| 103 |
temperature=temperature
|
| 104 |
)
|
| 105 |
-
return
|
|
|
|
| 106 |
elif model_choice == "DeepSeek":
|
| 107 |
llm = HuggingFaceEndpoint(
|
| 108 |
repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
|
|
@@ -110,7 +114,8 @@ def build_model(model_choice, temperature=0.7):
|
|
| 110 |
task="text-generation",
|
| 111 |
max_new_tokens=500
|
| 112 |
)
|
| 113 |
-
return ChatHuggingFace(llm=llm, temperature=temperature)
|
|
|
|
| 114 |
elif model_choice == "OpenAI":
|
| 115 |
llm = HuggingFaceEndpoint(
|
| 116 |
repo_id="openai/gpt-oss-20b",
|
|
@@ -118,9 +123,12 @@ def build_model(model_choice, temperature=0.7):
|
|
| 118 |
task="text-generation",
|
| 119 |
max_new_tokens=500
|
| 120 |
)
|
| 121 |
-
return ChatHuggingFace(llm=llm, temperature=temperature)
|
|
|
|
| 122 |
|
| 123 |
-
#
|
|
|
|
|
|
|
| 124 |
prompt_template = PromptTemplate(
|
| 125 |
template=(
|
| 126 |
"Answer the question based on the context below.\n\n"
|
|
@@ -131,8 +139,11 @@ prompt_template = PromptTemplate(
|
|
| 131 |
input_variables=["context", "question"]
|
| 132 |
)
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
|
| 138 |
query = st.text_area("Your Query", value="What is RAG?")
|
|
@@ -144,38 +155,59 @@ language_code = None
|
|
| 144 |
if video_id:
|
| 145 |
with st.spinner("Checking available languages..."):
|
| 146 |
available_languages = get_available_languages(video_id)
|
| 147 |
-
|
| 148 |
if available_languages:
|
| 149 |
st.success(f"Found {len(available_languages)} language(s)")
|
| 150 |
lang_options = {label: code for code, label in available_languages}
|
| 151 |
selected_label = st.selectbox("Select Language", options=list(lang_options.keys()))
|
| 152 |
language_code = lang_options[selected_label]
|
| 153 |
else:
|
| 154 |
-
st.warning("No languages found")
|
|
|
|
| 155 |
|
|
|
|
|
|
|
|
|
|
| 156 |
if st.button("Run Chatbot"):
|
| 157 |
if not video_id or not query or not language_code:
|
| 158 |
-
st.warning("Please fill in all fields and select a language.")
|
| 159 |
else:
|
| 160 |
with st.spinner("Fetching transcript..."):
|
| 161 |
transcript = get_transcript(video_id, language_code)
|
| 162 |
-
|
| 163 |
if not transcript:
|
| 164 |
-
st.error("Could not fetch transcript.")
|
| 165 |
else:
|
| 166 |
-
st.success(f"Transcript fetched ({len(transcript)} characters).")
|
| 167 |
-
|
| 168 |
-
with st.spinner("
|
| 169 |
retriever = create_vector_store(transcript).as_retriever(
|
| 170 |
search_type="mmr",
|
| 171 |
search_kwargs={"k": 5}
|
| 172 |
)
|
| 173 |
relevant_docs = retriever.invoke(query)
|
| 174 |
context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
)
|
| 94 |
return FAISS.from_documents(docs, embeddings)
|
| 95 |
|
| 96 |
+
# -------------------------------------------------
|
| 97 |
+
# 3️⃣ Model Builder
|
| 98 |
+
# -------------------------------------------------
|
| 99 |
def build_model(model_choice, temperature=0.7):
|
| 100 |
+
"""Return the correct model and a flag indicating if it’s chat-based."""
|
| 101 |
if model_choice == "Flan-T5 (Free)":
|
| 102 |
llm = HuggingFaceEndpoint(
|
| 103 |
repo_id="google/flan-t5-base",
|
|
|
|
| 105 |
max_new_tokens=500,
|
| 106 |
temperature=temperature
|
| 107 |
)
|
| 108 |
+
return llm, False # (model, is_chat)
|
| 109 |
+
|
| 110 |
elif model_choice == "DeepSeek":
|
| 111 |
llm = HuggingFaceEndpoint(
|
| 112 |
repo_id="deepseek-ai/DeepSeek-V3.2-Exp",
|
|
|
|
| 114 |
task="text-generation",
|
| 115 |
max_new_tokens=500
|
| 116 |
)
|
| 117 |
+
return ChatHuggingFace(llm=llm, temperature=temperature), True
|
| 118 |
+
|
| 119 |
elif model_choice == "OpenAI":
|
| 120 |
llm = HuggingFaceEndpoint(
|
| 121 |
repo_id="openai/gpt-oss-20b",
|
|
|
|
| 123 |
task="text-generation",
|
| 124 |
max_new_tokens=500
|
| 125 |
)
|
| 126 |
+
return ChatHuggingFace(llm=llm, temperature=temperature), True
|
| 127 |
+
|
| 128 |
|
| 129 |
+
# -------------------------------------------------
|
| 130 |
+
# 4️⃣ Prompt Template
|
| 131 |
+
# -------------------------------------------------
|
| 132 |
prompt_template = PromptTemplate(
|
| 133 |
template=(
|
| 134 |
"Answer the question based on the context below.\n\n"
|
|
|
|
| 139 |
input_variables=["context", "question"]
|
| 140 |
)
|
| 141 |
|
| 142 |
+
|
| 143 |
+
# -------------------------------------------------
|
| 144 |
+
# 5️⃣ Streamlit App UI
|
| 145 |
+
# -------------------------------------------------
|
| 146 |
+
st.title("🎬 YouTube Transcript Chatbot (RAG)")
|
| 147 |
|
| 148 |
video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
|
| 149 |
query = st.text_area("Your Query", value="What is RAG?")
|
|
|
|
| 155 |
# Language picker: once a video ID is entered, look up its transcript
# languages and let the user choose one. `language_code` is only assigned
# when at least one language is available and a label has been selected.
if video_id:
    with st.spinner("Checking available languages..."):
        available_languages = get_available_languages(video_id)

    if not available_languages:
        st.warning("No languages found for this video.")
    else:
        st.success(f"Found {len(available_languages)} language(s)")
        # Entries are unpacked as (code, label) pairs; the select box shows
        # the label and the mapping recovers the matching code.
        lang_options = {}
        for code, label in available_languages:
            lang_options[label] = code
        selected_label = st.selectbox(
            "Select Language",
            options=list(lang_options),
        )
        language_code = lang_options[selected_label]
|
| 166 |
+
|
| 167 |
|
| 168 |
+
# -------------------------------------------------
|
| 169 |
+
# 6️⃣ Run Chatbot
|
| 170 |
+
# -------------------------------------------------
|
| 171 |
# Run the retrieval-augmented pipeline when the button is clicked:
# fetch transcript -> build vector store -> MMR retrieval -> format prompt
# -> generate an answer with the selected model -> display it.
if st.button("Run Chatbot"):
    if not video_id or not query or not language_code:
        st.warning("⚠️ Please fill in all fields and select a language.")
    else:
        with st.spinner("Fetching transcript..."):
            transcript = get_transcript(video_id, language_code)

        if not transcript:
            st.error("❌ Could not fetch transcript.")
        else:
            st.success(f"✅ Transcript fetched ({len(transcript)} characters).")

            with st.spinner("Creating knowledge base..."):
                retriever = create_vector_store(transcript).as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": 5},
                )
                relevant_docs = retriever.invoke(query)
                context_text = "\n\n".join(
                    doc.page_content for doc in relevant_docs
                )

            prompt = prompt_template.format(context=context_text, question=query)

            with st.spinner(f"Generating response using {model_choice}..."):
                try:
                    # Build the model inside the try so construction errors
                    # and an unrecognised choice are reported in the UI
                    # instead of crashing the script.
                    built = build_model(model_choice, temperature)
                    if built is None:
                        raise ValueError(
                            f"Unsupported model choice: {model_choice}"
                        )
                    model, is_chat = built

                    # `invoke` is the common entry point for both chat and
                    # plain LLM objects; calling the LLM object directly is
                    # deprecated in LangChain.
                    response = model.invoke(prompt)

                    if is_chat:
                        # DeepSeek & OpenAI (chat-based): prefer the message
                        # content when the response exposes it.
                        response_text = (
                            response.content
                            if hasattr(response, "content")
                            else str(response)
                        )
                    elif (
                        isinstance(response, list)
                        and response
                        and isinstance(response[0], dict)
                        and "generated_text" in response[0]
                    ):
                        # Flan-T5 (non-chat): tolerate a raw
                        # [{"generated_text": ...}] payload if one is returned.
                        response_text = response[0]["generated_text"]
                    else:
                        response_text = str(response)

                    st.text_area("🧠 Model Response", value=response_text, height=400)
                except Exception as e:
                    # UI boundary: surface any failure to the user.
                    st.error(f"Model generation failed: {e}")
|