Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,101 +3,83 @@ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingF
|
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain_community.vectorstores import FAISS
|
| 5 |
from langchain.prompts import PromptTemplate
|
|
|
|
|
|
|
|
|
|
| 6 |
import os
|
| 7 |
import requests
|
| 8 |
|
|
|
|
| 9 |
# Credentials are read from the process environment at import time.
api_key = os.getenv("HF_API_KEY")

# An unset variable and a whitespace-only value both normalise to "".
RAPIDAPI_KEY = os.getenv("RAPIDAPI_KEY", "").strip()
if RAPIDAPI_KEY == "":
    st.error("β RAPIDAPI_KEY not set. Please add it in your environment variables.")
|
| 13 |
|
| 14 |
|
| 15 |
-
# πΌ Transcript Fetcher
|
| 16 |
@st.cache_data
def get_transcript(video_id, language_code="en"):
    """Fetch a video's transcript from the RapidAPI endpoint as one string.

    Args:
        video_id: Video identifier sent as the ``videoId`` query parameter.
        language_code: Language sent as the ``lang`` query parameter.

    Returns:
        The ``text`` fields of the returned transcript items joined by
        single spaces, or ``None`` when the request fails, the status is
        not 200, or the payload does not have the expected shape. Every
        failure is also reported through the Streamlit UI.
    """
    endpoint = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }
    request_params = {"videoId": video_id, "lang": language_code}

    try:
        resp = requests.get(
            endpoint, headers=request_headers, params=request_params, timeout=10
        )
        # Debug output: echo the raw status and body into the page.
        st.write("Status Code:", resp.status_code)
        st.write("Response JSON:", resp.text)

        if resp.status_code != 200:
            st.error(f"API Error: {resp.status_code}")
            return None

        payload = resp.json()

        # Anything that is not a JSON object has no fields to inspect.
        if not isinstance(payload, dict):
            st.warning("Unexpected API response format")
            return None

        if payload.get("success") and "transcript" in payload:
            pieces = [entry.get('text', '') for entry in payload["transcript"]]
            return ' '.join(pieces)

        if "message" in payload:
            st.error(f"API returned message: {payload['message']}")
            return None

        st.warning("Unexpected API response format")
        return None
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
-
#
|
| 55 |
-
def get_available_languages():
    """Return the selectable transcript languages as ``(code, name)`` pairs.

    The order of the returned list is the order shown to the user.
    """
    names_by_code = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "hi": "Hindi",
        "zh": "Chinese",
        "ja": "Japanese",
        "ko": "Korean",
        "pt": "Portuguese",
        "ru": "Russian",
    }
    # dicts preserve insertion order, so items() yields the pairs as listed.
    return list(names_by_code.items())
|
| 68 |
-
|
| 69 |
-
# π§ Embedding Loader
|
| 70 |
@st.cache_resource
|
| 71 |
-
def
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
)
|
|
|
|
| 76 |
|
| 77 |
-
# π§± Vector Store Builder
|
| 78 |
-
@st.cache_resource
def create_vector_store(transcript):
    """Build a FAISS vector store over overlapping chunks of ``transcript``.

    The store is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``cache_data`` serialises the return value and hands
    back a copy on every hit, which is meant for plain data, whereas the
    FAISS store wraps an index and an embedding model and should be shared
    as-is.

    Args:
        transcript: Full transcript text to index.

    Returns:
        The FAISS vector store built from the transcript chunks.
    """
    # 1000-character chunks with a 200-character overlap between neighbours.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([transcript])
    return FAISS.from_documents(docs, load_embeddings())
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
def build_model(model_choice, temperature):
    """Create a chat model backed by a Hugging Face inference endpoint.

    Args:
        model_choice: UI label of the model; ``"DeepSeek"`` and ``"OpenAI"``
            select their hosted repos, any other value selects BLOOM-560m.
        temperature: Sampling temperature forwarded to ``ChatHuggingFace``.

    Returns:
        A ``ChatHuggingFace`` wrapper around the selected endpoint.
    """
    # Default to the free, smaller model unless a paid choice matches.
    repo_id = "bigscience/bloom-560m"
    paid_repos = (
        ("DeepSeek", "deepseek-ai/DeepSeek-V3.2-Exp"),
        ("OpenAI", "openai/gpt-oss-20b"),
    )
    for label, paid_repo in paid_repos:
        if model_choice == label:
            repo_id = paid_repo
            break

    endpoint = HuggingFaceEndpoint(
        repo_id=repo_id,
        huggingfacehub_api_token=api_key,
        task="text-generation"
    )
    return ChatHuggingFace(llm=endpoint, temperature=temperature)
|
| 101 |
|
| 102 |
|
| 103 |
# π§Ύ Prompt Template
|
|
@@ -108,59 +90,38 @@ prompt_template = PromptTemplate(
|
|
| 108 |
"If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
|
| 109 |
"Then, based on your own knowledge, try to answer the question.\n"
|
| 110 |
"If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
|
| 111 |
-
"Keep the answer format neat, clean, and human-readable.\n\n"
|
| 112 |
"Context:\n{context}\n\n"
|
| 113 |
"Question:\n{question}"
|
| 114 |
),
|
| 115 |
input_variables=["context", "question"]
|
| 116 |
)
|
| 117 |
|
| 118 |
-
# π App UI
|
| 119 |
-
st.title("π₯ YouTube Transcript Chatbot")
|
| 120 |
-
|
| 121 |
-
with st.sidebar:
|
| 122 |
-
st.subheader("βοΈ API Setup")
|
| 123 |
-
st.info("Using RapidAPI for transcripts")
|
| 124 |
-
st.markdown("[Get your free API key](https://rapidapi.com/ytjar/api/youtube-transcript3)")
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
langs = get_available_languages()
|
| 130 |
-
lang_options = [f"{name} ({code})" for code, name in langs]
|
| 131 |
-
selected_lang = st.selectbox("Transcript Language", lang_options)
|
| 132 |
-
language_code = selected_lang.split("(")[-1].strip(")")
|
| 133 |
|
|
|
|
| 134 |
query = st.text_area("Your Query", value="What is RAG?")
|
| 135 |
-
model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free
|
| 136 |
-
temperature = st.slider("Temperature", 0, 100, value=50)
|
| 137 |
|
| 138 |
if st.button("π Run Chatbot"):
|
| 139 |
if not video_id or not query:
|
| 140 |
st.warning("Please fill in all fields.")
|
| 141 |
else:
|
| 142 |
with st.spinner("Fetching transcript..."):
|
| 143 |
-
transcript = get_transcript(video_id
|
| 144 |
-
|
| 145 |
if not transcript:
|
| 146 |
-
st.error("Could not fetch transcript.
|
| 147 |
else:
|
| 148 |
st.success(f"β
Transcript fetched! ({len(transcript)} characters)")
|
| 149 |
-
|
| 150 |
with st.spinner("Generating response..."):
|
| 151 |
-
retriever = create_vector_store(transcript).as_retriever(
|
| 152 |
-
search_type="mmr",
|
| 153 |
-
search_kwargs={"k": 5}
|
| 154 |
-
)
|
| 155 |
relevant_docs = retriever.invoke(query)
|
| 156 |
context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
model = build_model(model_choice, temperature / 100.0)
|
| 164 |
-
response = model.invoke(prompt)
|
| 165 |
-
|
| 166 |
-
st.text_area("Model Response", value=response.content, height=400)
|
|
|
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain_community.vectorstores import FAISS
|
| 5 |
from langchain.prompts import PromptTemplate
|
| 6 |
+
from langchain.llms import HuggingFacePipeline
|
| 7 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 8 |
+
import torch
|
| 9 |
import os
|
| 10 |
import requests
|
| 11 |
|
| 12 |
+
# Environment variables
|
| 13 |
# Hugging Face token used by the hosted endpoints; may be None when unset.
api_key = os.getenv("HF_API_KEY")

# RapidAPI key for the transcript service, trimmed of stray whitespace.
RAPIDAPI_KEY = os.getenv("RAPIDAPI_KEY", "").strip()
if RAPIDAPI_KEY == "":
    st.error("β RAPIDAPI_KEY not set. Please add it in your environment variables.")
|
| 17 |
|
| 18 |
|
| 19 |
+
# πΌ Transcript Fetcher
|
| 20 |
@st.cache_data
def get_transcript(video_id, language_code="en"):
    """Fetch a video's transcript from the RapidAPI endpoint as one string.

    Args:
        video_id: Video identifier sent as the ``videoId`` query parameter.
        language_code: Language sent as the ``lang`` query parameter.

    Returns:
        The ``text`` fields of the transcript segments joined by single
        spaces, or ``None`` on any failure. Failures are reported through
        the Streamlit UI instead of being raised.
    """
    # NOTE(review): failures return None and st.cache_data caches return
    # values, so a failed lookup may be replayed for the same arguments
    # until the cache is cleared - confirm this is acceptable.
    url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
    querystring = {"videoId": video_id, "lang": language_code}
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }

    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=10)
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, ...
        st.error(f"Error: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"API Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # The body was not valid JSON.
        st.error(f"Error: {str(e)}")
        return None

    # Only a JSON object can carry the fields inspected below.
    if not isinstance(data, dict):
        st.warning("Unexpected API response format")
        return None

    segments = data.get("transcript")
    if data.get("success") and isinstance(segments, list):
        # Skip malformed segments and treat a missing/null text as empty so
        # one bad entry does not discard the whole transcript.
        return ' '.join(
            str(segment.get('text') or '')
            for segment in segments
            if isinstance(segment, dict)
        )

    if "message" in data:
        # Surface the API's own explanation when it provides one.
        st.error(f"API returned message: {data['message']}")
        return None

    st.warning("Unexpected API response format")
    return None
|
| 42 |
|
| 43 |
|
| 44 |
+
# π§± Vector Store
|
| 45 |
+
@st.cache_resource
def _load_embeddings():
    """Create the CPU embedding model; cached so it is built only once."""
    return HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-base",
        model_kwargs={"device": "cpu"}
    )


@st.cache_resource
def create_vector_store(transcript):
    """Build a FAISS vector store over overlapping chunks of ``transcript``.

    The store is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``cache_data`` serialises the return value and hands
    back a copy on every hit, which is meant for plain data, whereas the
    FAISS store wraps an index and an embedding model and should be shared
    as-is. The embedding model lives in its own cached helper so that a new
    transcript does not reload it.

    Args:
        transcript: Full transcript text to index.

    Returns:
        The FAISS vector store built from the transcript chunks.
    """
    # 1000-character chunks with a 200-character overlap between neighbours.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([transcript])
    return FAISS.from_documents(docs, _load_embeddings())
|
| 54 |
|
| 55 |
|
| 56 |
+
# π€ Load Free BLOOM locally
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
@st.cache_resource
def load_bloom():
    """Load BLOOM-560m locally and wrap it as a LangChain LLM.

    Returns:
        A ``HuggingFacePipeline`` around a transformers text-generation
        pipeline for ``bigscience/bloom-560m``.
    """
    checkpoint = "bigscience/bloom-560m"
    bloom_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    bloom_model = AutoModelForCausalLM.from_pretrained(checkpoint)

    # Pipeline device index: 0 when CUDA is available, otherwise -1.
    device_index = -1
    if torch.cuda.is_available():
        device_index = 0

    generator = pipeline(
        "text-generation",
        model=bloom_model,
        tokenizer=bloom_tokenizer,
        device=device_index
    )
    return HuggingFacePipeline(pipeline=generator)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
# π§© Build model (handles endpoints + free local model)
|
| 72 |
+
def build_model(model_choice, temperature=0.7):
    """Return the model selected in the UI.

    Args:
        model_choice: UI label of the model. ``"DeepSeek"`` and ``"OpenAI"``
            use hosted (paid) endpoints; any other value falls back to the
            locally loaded BLOOM model.
        temperature: Sampling temperature passed to the hosted chat models.
            It is not forwarded to the local BLOOM model.

    Returns:
        A ``ChatHuggingFace`` instance for the hosted choices, otherwise
        whatever ``load_bloom()`` returns.
    """
    hosted_repos = (
        ("DeepSeek", "deepseek-ai/DeepSeek-V3.2-Exp"),  # paid
        ("OpenAI", "openai/gpt-oss-20b"),  # paid
    )
    for label, repo_id in hosted_repos:
        if model_choice == label:
            endpoint = HuggingFaceEndpoint(
                repo_id=repo_id,
                huggingfacehub_api_token=api_key,
                task="text-generation",
            )
            return ChatHuggingFace(llm=endpoint, temperature=temperature)

    # No hosted match: free local BLOOM.
    return load_bloom()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
# π§Ύ Prompt Template
|
|
|
|
| 90 |
"If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
|
| 91 |
"Then, based on your own knowledge, try to answer the question.\n"
|
| 92 |
"If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
|
|
|
|
| 93 |
"Context:\n{context}\n\n"
|
| 94 |
"Question:\n{question}"
|
| 95 |
),
|
| 96 |
input_variables=["context", "question"]
|
| 97 |
)
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
+
# π Streamlit UI
|
| 101 |
+
# Page script: collect the inputs, then on click run
# transcript fetch -> retrieval -> prompt -> model -> display.
st.title("🎥 YouTube Transcript Chatbot (Hybrid: Free + Paid)")

video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
query = st.text_area("Your Query", value="What is RAG?")
model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free BLOOM"])
# The slider works in whole percent; build_model takes a 0.0-1.0 float.
temperature = st.slider("Temperature", 0, 100, value=50) / 100.0

# NOTE(review): the leading glyph of the button label appears mis-encoded in
# this copy of the file; restore the intended emoji from the original source.
if st.button("π Run Chatbot"):
    # Treat whitespace-only input the same as empty input.
    video_id = video_id.strip()
    query = query.strip()

    if not video_id or not query:
        st.warning("Please fill in all fields.")
    else:
        with st.spinner("Fetching transcript..."):
            transcript = get_transcript(video_id)

        if not transcript:
            st.error("Could not fetch transcript.")
        else:
            st.success(f"✅ Transcript fetched! ({len(transcript)} characters)")

            with st.spinner("Generating response..."):
                retriever = create_vector_store(transcript).as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": 5},
                )
                relevant_docs = retriever.invoke(query)
                context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
                prompt = prompt_template.format(context=context_text, question=query)

                model = build_model(model_choice, temperature)
                # invoke() is the common entry point for both model kinds;
                # calling the LLM object directly is a deprecated path.
                response = model.invoke(prompt)
                # Chat models return a message object, plain LLMs a string.
                answer = response if isinstance(response, str) else response.content
                st.text_area("Model Response", value=answer, height=400)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|