Daksh0505 committed on
Commit
aa7e489
·
verified ·
1 Parent(s): b0b5ef6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -90
app.py CHANGED
@@ -3,101 +3,83 @@ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingF
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
 
 
 
6
  import os
7
  import requests
8
 
 
9
  api_key = os.getenv("HF_API_KEY")
10
  RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()
11
  if not RAPIDAPI_KEY:
12
  st.error("❌ RAPIDAPI_KEY not set. Please add it in your environment variables.")
13
 
14
 
15
- # πŸ“Ό Transcript Fetcher using RapidAPI
16
  @st.cache_data
17
  def get_transcript(video_id, language_code="en"):
18
  url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
19
- querystring = {"videoId": video_id, "lang": language_code} # note videoId
20
  headers = {
21
  "x-rapidapi-key": RAPIDAPI_KEY,
22
  "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
23
  }
24
-
25
  try:
26
  response = requests.get(url, headers=headers, params=querystring, timeout=10)
27
- st.write("Status Code:", response.status_code)
28
- st.write("Response JSON:", response.text)
29
-
30
  if response.status_code != 200:
31
  st.error(f"API Error: {response.status_code}")
32
  return None
33
-
34
  data = response.json()
35
-
36
- # Handle transcript properly
37
- if isinstance(data, dict) and data.get("success") and "transcript" in data:
38
- transcript_list = data["transcript"]
39
- return ' '.join([item.get('text', '') for item in transcript_list])
40
- elif isinstance(data, dict) and "message" in data:
41
- st.error(f"API returned message: {data['message']}")
42
- return None
43
  else:
44
  st.warning("Unexpected API response format")
45
  return None
46
-
47
  except Exception as e:
48
  st.error(f"Error: {str(e)}")
49
  return None
50
 
51
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
- # πŸ“Ό Get Available Languages (simplified - try common ones)
55
- def get_available_languages():
56
- return [
57
- ("en", "English"),
58
- ("es", "Spanish"),
59
- ("fr", "French"),
60
- ("de", "German"),
61
- ("hi", "Hindi"),
62
- ("zh", "Chinese"),
63
- ("ja", "Japanese"),
64
- ("ko", "Korean"),
65
- ("pt", "Portuguese"),
66
- ("ru", "Russian")
67
- ]
68
-
69
- # 🧠 Embedding Loader
70
  @st.cache_resource
71
- def load_embeddings():
72
- return HuggingFaceEmbeddings(
73
- model_name="intfloat/multilingual-e5-base",
74
- model_kwargs={"device": "cpu"}
 
 
 
 
 
75
  )
 
76
 
77
- # 🧱 Vector Store Builder
78
- @st.cache_data
79
- def create_vector_store(transcript):
80
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
81
- docs = splitter.create_documents([transcript])
82
- return FAISS.from_documents(docs, load_embeddings())
83
 
84
- # πŸ€– Model Builder
85
- # πŸ€– Model Builder (with free model option)
86
- def build_model(model_choice, temperature):
87
  if model_choice == "DeepSeek":
88
  repo_id = "deepseek-ai/DeepSeek-V3.2-Exp" # paid
 
 
89
  elif model_choice == "OpenAI":
90
- repo_id = "openai/gpt-oss-20b" # paid
 
 
91
  else:
92
- # Free Hugging Face model
93
- repo_id = "bigscience/bloom-560m" # free, smaller model
94
-
95
- llm = HuggingFaceEndpoint(
96
- repo_id=repo_id,
97
- huggingfacehub_api_token=api_key,
98
- task="text-generation"
99
- )
100
- return ChatHuggingFace(llm=llm, temperature=temperature)
101
 
102
 
103
  # 🧾 Prompt Template
@@ -108,59 +90,38 @@ prompt_template = PromptTemplate(
108
  "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
109
  "Then, based on your own knowledge, try to answer the question.\n"
110
  "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
111
- "Keep the answer format neat, clean, and human-readable.\n\n"
112
  "Context:\n{context}\n\n"
113
  "Question:\n{question}"
114
  ),
115
  input_variables=["context", "question"]
116
  )
117
 
118
- # πŸš€ App UI
119
- st.title("πŸŽ₯ YouTube Transcript Chatbot")
120
-
121
- with st.sidebar:
122
- st.subheader("βš™οΈ API Setup")
123
- st.info("Using RapidAPI for transcripts")
124
- st.markdown("[Get your free API key](https://rapidapi.com/ytjar/api/youtube-transcript3)")
125
 
126
- video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I",
127
- help="Example: dQw4w9WgXcQ from youtube.com/watch?v=dQw4w9WgXcQ")
128
-
129
- langs = get_available_languages()
130
- lang_options = [f"{name} ({code})" for code, name in langs]
131
- selected_lang = st.selectbox("Transcript Language", lang_options)
132
- language_code = selected_lang.split("(")[-1].strip(")")
133
 
 
134
  query = st.text_area("Your Query", value="What is RAG?")
135
- model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free HF Model"])
136
- temperature = st.slider("Temperature", 0, 100, value=50)
137
 
138
  if st.button("πŸš€ Run Chatbot"):
139
  if not video_id or not query:
140
  st.warning("Please fill in all fields.")
141
  else:
142
  with st.spinner("Fetching transcript..."):
143
- transcript = get_transcript(video_id, language_code)
144
-
145
  if not transcript:
146
- st.error("Could not fetch transcript. Make sure the video ID is correct and has captions.")
147
  else:
148
  st.success(f"βœ… Transcript fetched! ({len(transcript)} characters)")
149
-
150
  with st.spinner("Generating response..."):
151
- retriever = create_vector_store(transcript).as_retriever(
152
- search_type="mmr",
153
- search_kwargs={"k": 5}
154
- )
155
  relevant_docs = retriever.invoke(query)
156
  context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
157
-
158
- prompt = prompt_template.invoke({
159
- "context": context_text,
160
- "question": query
161
- })
162
-
163
- model = build_model(model_choice, temperature / 100.0)
164
- response = model.invoke(prompt)
165
-
166
- st.text_area("Model Response", value=response.content, height=400)
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
6
+ from langchain.llms import HuggingFacePipeline
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
+ import torch
9
  import os
10
  import requests
11
 
12
# Credentials are read from the environment: HF_API_KEY is passed to the hosted
# Hugging Face endpoints, RAPIDAPI_KEY is sent with every transcript request.
api_key = os.environ.get("HF_API_KEY")
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "").strip()
if RAPIDAPI_KEY == "":
    # Only reports the problem on the page; the script keeps running.
    st.error("❌ RAPIDAPI_KEY not set. Please add it in your environment variables.")
17
 
18
 
19
# Transcript fetcher (RapidAPI "youtube-transcript3" endpoint)
class _TranscriptFetchError(Exception):
    """Carry a user-facing failure out of the cached fetcher.

    ``level`` selects how get_transcript reports it: "error" or "warning".
    """

    def __init__(self, message, level="error"):
        super().__init__(message)
        self.level = level


@st.cache_data
def _fetch_transcript_text(video_id, language_code):
    """Download a transcript and return it as one space-joined string.

    Failures are raised instead of returned so that only successful results
    end up in the cache; a failed call is simply retried on the next run.

    Raises:
        _TranscriptFetchError: non-200 status, an API-provided message, or a
            payload that is not the expected ``{"success": ..., "transcript": [...]}``.
    """
    url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
    querystring = {"videoId": video_id, "lang": language_code}
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com",
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=10)
    if response.status_code != 200:
        raise _TranscriptFetchError(f"API Error: {response.status_code}")

    data = response.json()
    # Guard the shape first: a JSON list or scalar has no .get().
    if isinstance(data, dict):
        if data.get("success") and "transcript" in data:
            return " ".join(item.get("text", "") for item in data["transcript"])
        if "message" in data:
            raise _TranscriptFetchError(f"API returned message: {data['message']}")
    raise _TranscriptFetchError("Unexpected API response format", level="warning")


def get_transcript(video_id, language_code="en"):
    """Return the transcript text for a video, or None if it cannot be fetched.

    Args:
        video_id: YouTube video ID sent as the ``videoId`` query parameter.
        language_code: transcript language sent as the ``lang`` query parameter.

    Returns:
        The transcript segments joined with single spaces, or None after
        reporting the problem through ``st.error`` / ``st.warning``.
        This function never raises.
    """
    try:
        return _fetch_transcript_text(video_id, language_code)
    except _TranscriptFetchError as exc:
        report = st.warning if exc.level == "warning" else st.error
        report(str(exc))
    except Exception as e:
        # Boundary handler: network errors, invalid JSON, malformed items.
        st.error(f"Error: {str(e)}")
    return None
42
 
43
 
44
# Vector store
@st.cache_resource
def _load_embeddings():
    """Build the multilingual E5 embedding model (CPU) once and reuse it."""
    return HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-base",
        model_kwargs={"device": "cpu"},
    )


# cache_resource rather than cache_data: the returned store wraps a FAISS index
# and the embedding model, which should be kept as a live object instead of
# being serialised and copied on every cache hit.
@st.cache_resource
def create_vector_store(transcript):
    """Split a transcript into overlapping chunks and index them in FAISS.

    Args:
        transcript: full transcript text; also serves as the cache key.

    Returns:
        A FAISS vector store over 1000-character chunks with 200 characters
        of overlap. The same object is returned for repeated calls with the
        same transcript, so callers should treat it as read-only.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([transcript])
    return FAISS.from_documents(docs, _load_embeddings())
54
 
55
 
56
# Free local model: BLOOM-560m run through a transformers pipeline
@st.cache_resource
def load_bloom(max_new_tokens=256):
    """Load bigscience/bloom-560m and wrap it as a LangChain LLM.

    Args:
        max_new_tokens: upper bound on tokens generated per answer. Set
            explicitly so the answer length does not depend on the library's
            or model's default length limit, which is not sized for a prompt
            that already contains several retrieved transcript chunks.

    Returns:
        A HuggingFacePipeline around a "text-generation" pipeline, placed on
        the first CUDA device when one is available and on the CPU otherwise.
    """
    model_name = "bigscience/bloom-560m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,  # -1 selects the CPU
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=pipe)
69
 
 
 
 
 
 
 
70
 
71
# Model factory: hosted chat endpoints for the paid choices, local BLOOM otherwise
def build_model(model_choice, temperature=0.7):
    """Return the model object for the selected option.

    Args:
        model_choice: "DeepSeek" or "OpenAI" select a hosted endpoint; any
            other value falls back to the local BLOOM model.
        temperature: forwarded to the chat wrapper of the hosted endpoints.
            It is not used on the BLOOM path.

    Returns:
        A ChatHuggingFace instance for the hosted choices, otherwise whatever
        load_bloom() returns.
    """
    hosted_repos = (
        ("DeepSeek", "deepseek-ai/DeepSeek-V3.2-Exp"),  # paid
        ("OpenAI", "openai/gpt-oss-20b"),  # paid
    )
    for label, repo_id in hosted_repos:
        if model_choice == label:
            endpoint = HuggingFaceEndpoint(
                repo_id=repo_id,
                huggingfacehub_api_token=api_key,
                task="text-generation",
            )
            return ChatHuggingFace(llm=endpoint, temperature=temperature)

    # No hosted match: free local BLOOM.
    return load_bloom()
 
 
 
 
 
 
 
 
83
 
84
 
85
  # 🧾 Prompt Template
 
90
  "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
91
  "Then, based on your own knowledge, try to answer the question.\n"
92
  "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
 
93
  "Context:\n{context}\n\n"
94
  "Question:\n{question}"
95
  ),
96
  input_variables=["context", "question"]
97
  )
98
 
 
 
 
 
 
 
 
99
 
100
# Streamlit UI: collect the inputs, then fetch, retrieve and answer on demand.
st.title("πŸŽ₯ YouTube Transcript Chatbot (Hybrid: Free + Paid)")

# Strip so that whitespace-only input counts as empty and a padded video ID
# is not sent to the transcript API as-is.
video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I").strip()
query = st.text_area("Your Query", value="What is RAG?").strip()
model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI", "Free BLOOM"])
# The slider works in whole percent; models expect a 0.0-1.0 temperature.
temperature = st.slider("Temperature", 0, 100, value=50) / 100.0

if st.button("πŸš€ Run Chatbot"):
    if not video_id or not query:
        st.warning("Please fill in all fields.")
    else:
        with st.spinner("Fetching transcript..."):
            transcript = get_transcript(video_id)

        if not transcript:
            st.error("Could not fetch transcript.")
        else:
            st.success(f"βœ… Transcript fetched! ({len(transcript)} characters)")

            with st.spinner("Generating response..."):
                # MMR retrieval of the 5 chunks used as context.
                retriever = create_vector_store(transcript).as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": 5},
                )
                relevant_docs = retriever.invoke(query)
                context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
                prompt = prompt_template.format(context=context_text, question=query)

                model = build_model(model_choice, temperature)
                # invoke() is the common entry point for chat models and plain
                # LLMs alike, so no per-model branch (or direct call) is needed.
                response = model.invoke(prompt)
                # Plain LLMs return a str; chat models return a message object.
                answer = response if isinstance(response, str) else response.content
                st.text_area("Model Response", value=answer, height=400)