Vikrant-Honbute committed on
Commit
ba2f4fa
·
1 Parent(s): 2518a8c
Files changed (3) hide show
  1. Dockerfile +5 -1
  2. app.py +98 -36
  3. requirements.txt +13 -27
Dockerfile CHANGED
@@ -1,5 +1,9 @@
1
  FROM python:3.10-slim
2
 
 
 
 
 
3
  # Create HF-compatible user
4
  RUN useradd -m -u 1000 user
5
  USER user
@@ -10,7 +14,7 @@ ENV HOME=/home/user \
10
  WORKDIR $HOME/app
11
 
12
  COPY --chown=user requirements.txt .
13
- RUN pip install --no-cache-dir -r requirements.txt
14
 
15
  COPY --chown=user . .
16
 
 
1
  FROM python:3.10-slim
2
 
3
+ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates \
4
+ && update-ca-certificates \
5
+ && rm -rf /var/lib/apt/lists/*
6
+
7
  # Create HF-compatible user
8
  RUN useradd -m -u 1000 user
9
  USER user
 
14
  WORKDIR $HOME/app
15
 
16
  COPY --chown=user requirements.txt .
17
+ RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
18
 
19
  COPY --chown=user . .
20
 
app.py CHANGED
@@ -1,10 +1,20 @@
1
- import validators, streamlit as st
 
 
 
 
 
 
 
 
2
  from langchain_core.prompts import PromptTemplate
3
  from langchain_groq import ChatGroq
4
- from langchain_classic.chains.summarize import load_summarize_chain
5
- from langchain_community.document_loaders import YoutubeLoader, UnstructuredURLLoader
6
- import urllib.error
7
- from requests.exceptions import RequestException, SSLError
 
 
8
 
9
  # Streamlit App
10
  st.set_page_config(page_title="SnapSummaryAI — YouTube & Web Summarizer", page_icon="🦜")
@@ -28,6 +38,25 @@ def is_youtube_url(url: str) -> bool:
28
  return "youtube.com" in url or "youtu.be" in url
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def build_web_loader(url: str) -> UnstructuredURLLoader:
32
  return UnstructuredURLLoader(
33
  urls=[url],
@@ -41,6 +70,61 @@ def build_web_loader(url: str) -> UnstructuredURLLoader:
41
  },
42
  )
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  if st.button("Summarize the Content from YT or Website"):
45
  # Validate inputs
46
  if not groq_api_key.strip() or not generic_url.strip():
@@ -59,42 +143,20 @@ if st.button("Summarize the Content from YT or Website"):
59
  is_youtube = is_youtube_url(generic_url)
60
  try:
61
  if is_youtube:
62
- loader = YoutubeLoader.from_youtube_url(
63
- generic_url,
64
- add_video_info=False
65
- )
66
  else:
67
- loader = build_web_loader(generic_url)
68
-
69
- docs = loader.load()
70
- except SSLError:
71
- if is_youtube:
72
- st.warning(
73
- "YouTube transcript fetch failed due to an SSL/TLS issue. "
74
- "Trying webpage-text fallback..."
75
- )
76
- try:
77
- docs = build_web_loader(generic_url).load()
78
- except Exception:
79
- st.error(
80
- "Could not reach YouTube from this environment due to SSL/network restrictions. "
81
- "Please retry later or try a non-YouTube URL."
82
- )
83
- st.stop()
84
- else:
85
- st.error("SSL/network issue while loading the URL. Please try again.")
86
- st.stop()
87
- except urllib.error.HTTPError:
88
- st.error(
89
- "YouTube returned a 400 Bad Request. "
90
- "Try a different video URL (non-private, non-short)."
91
- )
92
- st.stop()
93
  except RequestException:
94
  st.error("Network error while loading the URL. Please try again.")
95
  st.stop()
96
  except Exception as exc:
97
- st.error(f"Could not load content from this URL. Details: {exc}")
 
 
 
 
 
 
98
  st.stop()
99
 
100
  if not docs:
 
1
+ import os
2
+ from urllib.parse import parse_qs, urlparse
3
+
4
+ import certifi
5
+ import streamlit as st
6
+ import validators
7
+ from langchain_classic.chains.summarize import load_summarize_chain
8
+ from langchain_community.document_loaders import UnstructuredURLLoader
9
+ from langchain_core.documents import Document
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain_groq import ChatGroq
12
+ from requests.exceptions import RequestException
13
+ from youtube_transcript_api import YouTubeTranscriptApi
14
+ from yt_dlp import YoutubeDL
15
+
16
+ os.environ.setdefault("SSL_CERT_FILE", certifi.where())
17
+ os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
18
 
19
  # Streamlit App
20
  st.set_page_config(page_title="SnapSummaryAI — YouTube & Web Summarizer", page_icon="🦜")
 
38
  return "youtube.com" in url or "youtu.be" in url
39
 
40
 
41
+ def extract_video_id(url: str) -> str | None:
42
+ parsed_url = urlparse(url)
43
+ host = (parsed_url.hostname or "").lower()
44
+
45
+ if host in {"youtu.be", "www.youtu.be"}:
46
+ return parsed_url.path.strip("/") or None
47
+
48
+ if "youtube.com" in host:
49
+ if parsed_url.path == "/watch":
50
+ return parse_qs(parsed_url.query).get("v", [None])[0]
51
+
52
+ if parsed_url.path.startswith("/shorts/") or parsed_url.path.startswith("/embed/"):
53
+ path_parts = [part for part in parsed_url.path.split("/") if part]
54
+ if len(path_parts) >= 2:
55
+ return path_parts[1]
56
+
57
+ return None
58
+
59
+
60
  def build_web_loader(url: str) -> UnstructuredURLLoader:
61
  return UnstructuredURLLoader(
62
  urls=[url],
 
70
  },
71
  )
72
 
73
+
74
def load_youtube_docs(url: str) -> list[Document]:
    """Load summarizable text for a YouTube URL.

    The transcript is tried first (English variants, then whatever language
    the API returns by default).  If no transcript text can be obtained, the
    video's title and description are fetched via ``yt_dlp`` and used as a
    fallback document.

    Args:
        url: A YouTube URL from which ``extract_video_id`` can parse an id.

    Returns:
        A single-element list holding a ``Document`` whose ``metadata["kind"]``
        is ``"transcript"`` or ``"metadata"`` depending on which source
        produced the text.

    Raises:
        ValueError: If no video id can be parsed from *url*.
        RuntimeError: If the metadata lookup fails after the transcript
            lookup yielded nothing, or if neither source yields any text.
    """
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Could not parse YouTube video id from URL.")

    transcript = None
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "en-US"])
    except Exception:
        # Deliberate best-effort: any failure on the English request falls
        # through to a request without a language preference.
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        except Exception:
            # Still best-effort: tell the user and continue with metadata.
            st.warning(
                "YouTube transcript fetch failed (often network/IP restriction). "
                "Trying video metadata fallback..."
            )

    if transcript:
        transcript_text = " ".join(item.get("text", "") for item in transcript).strip()
        if transcript_text:
            return [
                Document(
                    page_content=transcript_text,
                    metadata={"source": url, "kind": "transcript"},
                )
            ]

    try:
        with YoutubeDL(
            {
                "quiet": True,
                "skip_download": True,
                "noplaylist": True,
                "socket_timeout": 20,
            }
        ) as ydl:
            video_info = ydl.extract_info(url, download=False)
    except Exception as exc:
        raise RuntimeError(f"YouTube transcript and metadata fetch both failed: {exc}") from exc

    # Defensive: treat a missing info dict like empty metadata so the caller
    # gets the explicit RuntimeError below rather than an AttributeError.
    video_info = video_info or {}
    title = (video_info.get("title") or "").strip()
    description = (video_info.get("description") or "").strip()

    # Test the fields themselves.  Stripping the "Title:"/"Description:"
    # labels out of the combined string would also erase those words when
    # they occur inside the real title or description.
    if title or description:
        fallback_text = f"Title: {title}\n\nDescription:\n{description}".strip()
        return [
            Document(
                page_content=fallback_text,
                metadata={"source": url, "kind": "metadata"},
            )
        ]

    raise RuntimeError("No text could be extracted from this YouTube URL.")
127
+
128
  if st.button("Summarize the Content from YT or Website"):
129
  # Validate inputs
130
  if not groq_api_key.strip() or not generic_url.strip():
 
143
  is_youtube = is_youtube_url(generic_url)
144
  try:
145
  if is_youtube:
146
+ docs = load_youtube_docs(generic_url)
 
 
 
147
  else:
148
+ docs = build_web_loader(generic_url).load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  except RequestException:
150
  st.error("Network error while loading the URL. Please try again.")
151
  st.stop()
152
  except Exception as exc:
153
+ if is_youtube:
154
+ st.error(
155
+ "Could not extract YouTube transcript or metadata in this environment. "
156
+ f"Details: {exc}"
157
+ )
158
+ else:
159
+ st.error(f"Could not load content from this URL. Details: {exc}")
160
  st.stop()
161
 
162
  if not docs:
requirements.txt CHANGED
@@ -1,28 +1,14 @@
1
- langchain
2
- python-dotenv
3
- openai
4
- ipykernel
5
- langchain-community
6
- PyPDF
7
- bs4 #beautiful soup
8
- langchain-text-splitters
9
- langchain-openai
10
- chromadb
11
- sentence_transformers
12
- langchain_huggingface
13
- langchain_chroma
14
- langchain-openai
15
- streamlit
16
- langchain_groq
17
- langchain_core
18
- fastapi
19
- uvicorn
20
- langserve
21
- langchain-classic
22
- arxiv
23
- wikipedia
24
- langchain
25
  validators==0.28.1
26
- youtube_transcript_api
27
- streamlit
28
- unstructured
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.39.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  validators==0.28.1
3
+
4
+ langchain==0.3.7
5
+ langchain-core==0.3.18
6
+ langchain-community==0.3.7
7
+ langchain-classic==0.3.0
8
+ langchain-groq==0.2.1
9
+
10
+ unstructured==0.16.4
11
+ youtube-transcript-api==0.6.3
12
+ yt-dlp==2025.3.31
13
+ requests==2.32.3
14
+ certifi==2025.1.31