Vikrant-Honbute committed on
Commit
4f0c12e
·
1 Parent(s): a14c409

huggingface space issue fix

Browse files
Files changed (2) hide show
  1. app.py +235 -27
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,7 +1,12 @@
1
  import os
 
 
 
 
2
  from urllib.parse import parse_qs, urlparse
3
 
4
  import certifi
 
5
  import streamlit as st
6
  import validators
7
  from langchain_community.document_loaders import UnstructuredURLLoader
@@ -23,6 +28,7 @@ st.subheader("Summarize URL")
23
  # Sidebar
24
  with st.sidebar:
25
  groq_api_key = st.text_input("Groq API Key", value="", type="password")
 
26
 
27
  generic_url = st.text_input("URL", label_visibility="collapsed")
28
 
@@ -32,6 +38,9 @@ Content: {text}
32
  """
33
  prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
34
 
 
 
 
35
 
36
  def is_youtube_url(url: str) -> bool:
37
  return "youtube.com" in url or "youtu.be" in url
@@ -70,45 +79,244 @@ def build_web_loader(url: str) -> UnstructuredURLLoader:
70
  )
71
 
72
 
73
- def load_youtube_docs(url: str) -> list[Document]:
74
- video_id = extract_video_id(url)
75
- if not video_id:
76
- raise ValueError("Could not parse YouTube video id from URL.")
77
 
78
- transcript = None
79
  try:
80
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "en-US"])
81
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  try:
83
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
 
 
84
  except Exception:
85
- st.warning(
86
- "YouTube transcript fetch failed (often network/IP restriction). "
87
- "Trying video metadata fallback..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- if transcript:
91
- transcript_text = " ".join(item.get("text", "") for item in transcript).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  if transcript_text:
93
  return [
94
  Document(
95
  page_content=transcript_text,
96
- metadata={"source": url, "kind": "transcript"},
97
  )
98
  ]
 
99
 
100
- try:
101
- with YoutubeDL(
102
- {
103
- "quiet": True,
104
- "skip_download": True,
105
- "noplaylist": True,
106
- "socket_timeout": 20,
107
- }
108
- ) as ydl:
109
- video_info = ydl.extract_info(url, download=False)
110
- except Exception as exc:
111
- raise RuntimeError(f"YouTube transcript and metadata fetch both failed: {exc}") from exc
 
 
 
 
 
 
 
112
 
113
  title = (video_info.get("title") or "").strip()
114
  description = (video_info.get("description") or "").strip()
@@ -142,7 +350,7 @@ if st.button("Summarize the Content from YT or Website"):
142
  is_youtube = is_youtube_url(generic_url)
143
  try:
144
  if is_youtube:
145
- docs = load_youtube_docs(generic_url)
146
  else:
147
  docs = build_web_loader(generic_url).load()
148
  except RequestException:
 
1
  import os
2
+ import html
3
+ import json
4
+ import re
5
+ import xml.etree.ElementTree as ET
6
  from urllib.parse import parse_qs, urlparse
7
 
8
  import certifi
9
+ import httpx
10
  import streamlit as st
11
  import validators
12
  from langchain_community.document_loaders import UnstructuredURLLoader
 
28
  # Sidebar
29
  with st.sidebar:
30
  groq_api_key = st.text_input("Groq API Key", value="", type="password")
31
+ youtube_debug = st.checkbox("Debug YouTube Reachability", value=False)
32
 
33
  generic_url = st.text_input("URL", label_visibility="collapsed")
34
 
 
38
  """
39
  prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
40
 
41
+ LANGUAGE_PREFERENCE = ["en", "en-US", "en-orig", "en-GB"]
42
+ SUBTITLE_EXT_PREFERENCE = ["json3", "srv3", "vtt", "ttml", "srv1", "srv2"]
43
+
44
 
45
def is_youtube_url(url: str) -> bool:
    """Return True when *url* is served from a YouTube host.

    The decision is made on the parsed hostname rather than on a substring of
    the whole URL, so look-alike domains (``notyoutube.com``) and URLs that
    merely mention ``youtube.com`` in their path or query are rejected.

    Args:
        url: Absolute URL, or a scheme-less form such as
            ``youtube.com/watch?v=...``.

    Returns:
        True for ``youtube.com``, ``youtu.be`` and any of their subdomains;
        False for everything else, including empty or unparsable input.
    """
    candidate = (url or "").strip()
    if not candidate:
        return False

    # urlparse only recognises the host when a scheme or "//" precedes it.
    if "://" not in candidate:
        candidate = "//" + candidate.lstrip("/")

    try:
        host = (urlparse(candidate).hostname or "").lower().rstrip(".")
    except ValueError:
        # Raised for malformed authorities such as an unterminated "[".
        return False

    return any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtu.be")
    )
 
79
  )
80
 
81
 
82
def normalize_text(text: str) -> str:
    """Decode HTML entities in *text* and squeeze whitespace to single spaces.

    Falsy input (``None`` or an empty string) is treated as ``""``. The result
    has no leading or trailing whitespace.
    """
    decoded = html.unescape(text if text else "")
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
84
+
 
85
 
86
def debug_youtube_reachability(video_id: str) -> None:
    """Probe the watch page for *video_id* and show the outcome in the UI.

    A completed request is reported with ``st.info`` together with its HTTP
    status code; any exception is reported with ``st.warning``. Nothing is
    returned and nothing is raised.
    """
    try:
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        probe = httpx.get(watch_url, timeout=10)
        st.info(f"YouTube reachable: {probe.status_code}")
    except Exception as error:
        # Diagnostic helper only: surface the failure instead of propagating it.
        st.warning(f"YouTube blocked or unreachable: {error}")
92
+
93
+
94
def parse_json3_subtitle(payload: str) -> str:
    """Extract caption text from a ``json3`` subtitle payload.

    The payload is read as a JSON object with an ``events`` list; each event
    may carry a ``segs`` list whose items hold their text under ``utf8``.

    Args:
        payload: Raw subtitle document.

    Returns:
        The segment texts joined with spaces and normalised, or ``""`` when
        the payload is not JSON, is not shaped as described above, or holds
        no text.
    """
    try:
        data = json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return ""

    # Valid JSON is not necessarily an object ("[]", "null", "42", ...).
    if not isinstance(data, dict):
        return ""

    events = data.get("events")
    if not isinstance(events, list):
        return ""

    parts = []
    for event in events:
        if not isinstance(event, dict):
            continue
        segments = event.get("segs")
        if not isinstance(segments, list):
            # Events without text carry no "segs" (or a null one).
            continue
        for segment in segments:
            if not isinstance(segment, dict):
                continue
            piece = segment.get("utf8")
            if isinstance(piece, str) and piece:
                parts.append(piece.replace("\n", " "))

    if not parts:
        return ""

    return normalize_text(" ".join(parts))
108
+
109
+
110
def parse_xml_subtitle(payload: str) -> str:
    """Extract caption text from an XML subtitle payload.

    Text is collected from elements whose namespace-stripped tag name is
    exactly ``text`` or ``p``. Matching the exact local name matters:
    a suffix test also matches a ``<timedtext>`` root, which would emit the
    whole transcript once for the root and again for every cue.

    Args:
        payload: Raw XML document.

    Returns:
        The cue texts joined with spaces and normalised, or ``""`` when the
        payload is not well-formed XML or contains no cue text.
    """
    # NOTE(review): xml.etree is not hardened against maliciously constructed
    # documents (see the Python "XML vulnerabilities" notes); the payload is
    # whatever the caller downloaded.
    try:
        root = ET.fromstring(payload)
    except ET.ParseError:
        return ""

    chunks = []
    # Depth-first walk in document order. A matched element is consumed
    # whole (itertext covers its descendants), so it is not descended into
    # and nested matches cannot be counted twice.
    pending = [root]
    while pending:
        node = pending.pop()
        tag = node.tag
        if not isinstance(tag, str):
            # Comments / processing instructions have a non-string tag.
            continue
        local_name = tag.rsplit("}", 1)[-1].lower()
        if local_name in ("text", "p"):
            text_value = "".join(node.itertext())
            if text_value.strip():
                chunks.append(text_value.replace("\n", " "))
            continue
        pending.extend(reversed(list(node)))

    if not chunks:
        return ""

    return normalize_text(" ".join(chunks))
125
+
126
+
127
def parse_vtt_subtitle(payload: str) -> str:
    """Extract caption text from a WebVTT (or SRT-like) payload.

    Blocks are separated by blank lines. Timing lines (containing ``-->``)
    start a cue; every following non-blank line is cue text. Outside a cue,
    the ``WEBVTT`` header block and ``NOTE``/``STYLE``/``REGION`` blocks are
    skipped in full, and purely numeric lines are treated as cue identifiers.
    Inline markup such as ``<c>`` or ``<00:00:01.000>`` is removed from the
    text that is kept.

    Args:
        payload: Raw subtitle document.

    Returns:
        The cue texts joined with spaces and normalised, or ``""`` when no
        text is found.
    """
    lines = []
    in_cue = False
    skipping_block = False

    for raw_line in payload.splitlines():
        line = raw_line.strip().lstrip("\ufeff")

        if not line:
            # A blank line terminates whatever block was open.
            in_cue = False
            skipping_block = False
            continue

        if "-->" in line:
            # Checked before the skip flag so a cue that follows the header
            # without a separating blank line is still picked up.
            in_cue = True
            skipping_block = False
            continue

        if skipping_block:
            continue

        if not in_cue:
            if line.upper().startswith("WEBVTT") or line.startswith(
                ("NOTE", "STYLE", "REGION")
            ):
                # Header metadata and comment/style/region bodies run until
                # the next blank line.
                skipping_block = True
                continue
            if re.fullmatch(r"\d+", line):
                # Numeric cue identifier preceding a timing line.
                continue

        text = re.sub(r"<[^>]*>", "", line).strip()
        if text:
            lines.append(text)

    if not lines:
        return ""

    return normalize_text(" ".join(lines))
142
+
143
+
144
def fetch_subtitle_track(track_url: str, ext: str) -> str:
    """Download one subtitle track and return its plain text.

    Args:
        track_url: URL of the subtitle document.
        ext: Format label for the track (for example ``json3`` or ``vtt``);
            case-insensitive, may be empty or ``None``.

    Returns:
        The parsed caption text, or ``""`` when the selected parser finds
        nothing.

    Raises:
        httpx.HTTPError: The request failed or ``raise_for_status`` rejected
            the response.
    """
    response = httpx.get(track_url, timeout=15)
    response.raise_for_status()
    payload = response.text

    ext_lower = (ext or "").lower()
    if ext_lower == "json3":
        return parse_json3_subtitle(payload)
    # srv1/srv2/srv3 are XML timed-text flavours just like TTML; sending
    # srv1/srv2 through the line-based VTT parser would return raw markup.
    if ext_lower in {"srv1", "srv2", "srv3", "ttml", "xml"}:
        return parse_xml_subtitle(payload)
    if ext_lower in {"vtt", "srt"}:
        return parse_vtt_subtitle(payload)

    # Unknown label: try the strict parsers first; the line-based parser goes
    # last because it accepts almost any text.
    for parser in (parse_json3_subtitle, parse_xml_subtitle, parse_vtt_subtitle):
        text = parser(payload)
        if text:
            return text

    return ""
163
+
164
+
165
def extract_ydlp_subtitles(video_info: dict, max_attempts: int = 10) -> str:
    """Return transcript text from the subtitle tracks listed in *video_info*.

    Tracks are gathered from ``video_info["subtitles"]`` first and
    ``video_info["automatic_captions"]`` second, then ordered by language
    (``LANGUAGE_PREFERENCE``) and format (``SUBTITLE_EXT_PREFERENCE``). The
    sort is stable, so for equal ranks tracks from ``subtitles`` are tried
    before tracks from ``automatic_captions``.

    Args:
        video_info: Mapping holding the ``subtitles`` and
            ``automatic_captions`` entries; each maps a language code to a
            list of track dicts with ``url`` and ``ext`` keys.
        max_attempts: Upper bound on the number of tracks downloaded. Each
            download can block for its full timeout, so an unbounded walk
            over every language/format pair can stall the caller for a very
            long time when the network is blocked.

    Returns:
        Text of the first track that downloads and parses to something
        non-empty, or ``""`` when none does.
    """
    ext_rank = {ext: idx for idx, ext in enumerate(SUBTITLE_EXT_PREFERENCE)}
    language_rank = {lang: idx for idx, lang in enumerate(LANGUAGE_PREFERENCE)}
    unknown_ext_rank = len(SUBTITLE_EXT_PREFERENCE)
    unknown_language_rank = len(LANGUAGE_PREFERENCE)

    candidates = []
    for map_key in ("subtitles", "automatic_captions"):
        subtitle_map = video_info.get(map_key) or {}
        if not isinstance(subtitle_map, dict):
            continue

        for lang, entries in subtitle_map.items():
            if not isinstance(entries, list):
                continue

            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                track_url = entry.get("url")
                if not track_url:
                    continue
                ext = (entry.get("ext") or "").lower()
                candidates.append(
                    (
                        language_rank.get(lang, unknown_language_rank),
                        ext_rank.get(ext, unknown_ext_rank),
                        track_url,
                        ext,
                    )
                )

    candidates.sort(key=lambda item: (item[0], item[1]))

    for _, _, track_url, ext in candidates[: max(max_attempts, 0)]:
        try:
            transcript_text = fetch_subtitle_track(track_url, ext)
        except Exception:
            # Best effort: a failing track must not hide a working one.
            continue
        if transcript_text:
            return transcript_text

    return ""
210
+
211
+
212
def fetch_transcript_with_proxy(video_id: str) -> str:
    """Fetch a transcript through youtube-transcript-api, using proxies if set.

    Strategies are tried in order and the first non-empty text wins:

    1. Webshare proxy via the instance API, when both
       ``WEBSHARE_PROXY_USERNAME`` and ``WEBSHARE_PROXY_PASSWORD`` are set.
    2. ``get_transcript`` through ``YOUTUBE_PROXY_URL``, when set.
    3. ``get_transcript`` with a direct connection.

    Every strategy is best effort: any failure moves on to the next one.

    Args:
        video_id: YouTube video id.

    Returns:
        Normalised transcript text, or ``""`` when every strategy fails.
    """
    proxy_url = os.getenv("YOUTUBE_PROXY_URL", "").strip()
    proxy_username = os.getenv("WEBSHARE_PROXY_USERNAME", "").strip()
    proxy_password = os.getenv("WEBSHARE_PROXY_PASSWORD", "").strip()

    if proxy_username and proxy_password:
        try:
            # Imported lazily: the proxies module and the instance API only
            # exist in newer youtube-transcript-api releases.
            from youtube_transcript_api.proxies import WebshareProxyConfig

            # The constructor keyword is ``proxy_config``; passing
            # ``proxies=`` raises TypeError and silently disables this path.
            transcript_api = YouTubeTranscriptApi(
                proxy_config=WebshareProxyConfig(
                    proxy_username=proxy_username,
                    proxy_password=proxy_password,
                )
            )
            transcript_data = transcript_api.fetch(video_id)
            transcript_text = normalize_text(
                " ".join(
                    item.text if hasattr(item, "text") else item.get("text", "")
                    for item in transcript_data
                )
            )
            if transcript_text:
                return transcript_text
        except Exception:
            # Library too old, bad credentials, blocked request, ...: fall
            # through to the remaining strategies.
            pass

    languages = ["en", "en-US"]
    attempts = []
    if proxy_url:
        attempts.append(
            {"languages": languages, "proxies": {"https": proxy_url, "http": proxy_url}}
        )
    # The direct attempt also covers library versions whose get_transcript
    # rejects the ``proxies`` keyword (TypeError on the attempt above).
    attempts.append({"languages": languages})

    for call_kwargs in attempts:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, **call_kwargs)
        except Exception:
            continue

        transcript_text = normalize_text(
            " ".join(item.get("text", "") for item in transcript)
        )
        if transcript_text:
            return transcript_text

    return ""
267
+
268
+
269
+ def load_youtube_docs(url: str, debug_reachability: bool = False) -> list[Document]:
270
+ video_id = extract_video_id(url)
271
+ if not video_id:
272
+ raise ValueError("Could not parse YouTube video id from URL.")
273
+
274
+ if debug_reachability:
275
+ debug_youtube_reachability(video_id)
276
 
277
+ try:
278
+ ydl_options = {
279
+ "quiet": True,
280
+ "skip_download": True,
281
+ "noplaylist": True,
282
+ "socket_timeout": 20,
283
+ "writesubtitles": True,
284
+ "writeautomaticsub": True,
285
+ "subtitleslangs": ["en", "en-US"],
286
+ "subtitlesformat": "json3",
287
+ }
288
+ with YoutubeDL(ydl_options) as ydl:
289
+ video_info = ydl.extract_info(url, download=False)
290
+ except Exception as exc:
291
+ transcript_text = fetch_transcript_with_proxy(video_id)
292
  if transcript_text:
293
  return [
294
  Document(
295
  page_content=transcript_text,
296
+ metadata={"source": url, "kind": "transcript_proxy"},
297
  )
298
  ]
299
+ raise RuntimeError(f"YouTube fetch failed: {exc}") from exc
300
 
301
+ transcript_text = extract_ydlp_subtitles(video_info)
302
+ if transcript_text:
303
+ return [
304
+ Document(
305
+ page_content=transcript_text,
306
+ metadata={"source": url, "kind": "transcript"},
307
+ )
308
+ ]
309
+
310
+ transcript_text = fetch_transcript_with_proxy(video_id)
311
+ if transcript_text:
312
+ return [
313
+ Document(
314
+ page_content=transcript_text,
315
+ metadata={"source": url, "kind": "transcript_proxy"},
316
+ )
317
+ ]
318
+
319
+ st.warning("No subtitles found. Falling back to video metadata.")
320
 
321
  title = (video_info.get("title") or "").strip()
322
  description = (video_info.get("description") or "").strip()
 
350
  is_youtube = is_youtube_url(generic_url)
351
  try:
352
  if is_youtube:
353
+ docs = load_youtube_docs(generic_url, debug_reachability=youtube_debug)
354
  else:
355
  docs = build_web_loader(generic_url).load()
356
  except RequestException:
requirements.txt CHANGED
@@ -10,4 +10,5 @@ unstructured==0.16.4
10
  youtube-transcript-api==0.6.3
11
  yt-dlp==2025.3.31
12
  requests==2.32.3
 
13
  certifi==2025.1.31
 
10
  youtube-transcript-api==0.6.3
11
  yt-dlp==2025.3.31
12
  requests==2.32.3
13
+ httpx==0.27.2
14
  certifi==2025.1.31