brotoo committed
Commit d7f53b3 · verified · 1 Parent(s): e5b518f

Upload 7 files

Files changed (7)
  1. Dockerfile +17 -0
  2. README.md +12 -10
  3. app.py +290 -0
  4. news.py +19 -0
  5. requirements.txt +13 -0
  6. utils.py +145 -0
  7. video.py +63 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # System deps for readability/yt_dlp/whisper
+ RUN apt-get update && \
+     apt-get install -y ffmpeg libxml2 libxslt1.1 libffi-dev && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt ./requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app.py ./app.py
+
+ EXPOSE 7860
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md CHANGED
@@ -1,11 +1,14 @@
- ---
- title: SUMA
- emoji: 💻
- colorFrom: pink
- colorTo: blue
- sdk: docker
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # News and Video Summarizer (FastAPI on Hugging Face Spaces)
+
+ ## Features
+ - POST `/summarize-news` with `{"url": "<cnn/bbc/nbc link>"}` → JSON summary.
+ - POST `/summarize-video` with `{"url": "<youtube link>"}` → transcribe (Whisper base) then summarize.
+ - GET `/` returns basic status; GET `/health` returns healthy.
+ - CORS open to all origins.
+
+ ## Run locally
+ ```bash
+ python -m venv .venv && source .venv/bin/activate
+ pip install -r requirements.txt
+ python app.py  # or: uvicorn app:app --host 0.0.0.0 --port 7860
+ ```
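
For a quick smoke test of the two endpoints, a minimal Python client might look like the sketch below (it assumes a server already running on the default port 7860; both URLs are placeholders, and the news link must be on an allowed CNN/NBC/BBC domain):

```python
import requests

BASE = "http://localhost:7860"  # default port from the Dockerfile/README

# News summarization (placeholder URL; only CNN/NBC/BBC domains are accepted)
news = requests.post(f"{BASE}/summarize-news",
                     json={"url": "https://www.bbc.com/news/example-article"})
print(news.status_code, news.json())  # 200 with {"summary": "..."} on success

# Video summarization (placeholder URL; only YouTube links are accepted)
video = requests.post(f"{BASE}/summarize-video",
                      json={"url": "https://www.youtube.com/watch?v=EXAMPLE"})
print(video.status_code, video.json())
```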
app.py ADDED
@@ -0,0 +1,290 @@
+ import logging
+ import os
+ # Optional cache dir to avoid re-downloading models on restarts; must be set before transformers is imported
+ os.environ.setdefault("HF_HOME", "/data/hf_cache")
+
+ import re
+ import shutil
+ import tempfile
+ from typing import Any, Dict, List
+ from urllib.parse import urlparse
+
+ import requests
+ import uvicorn
+ from bs4 import BeautifulSoup
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, HttpUrl
+ from readability import Document
+ from transformers import pipeline
+ from yt_dlp import YoutubeDL
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ )
+ logger = logging.getLogger("app")
+
+ # Globals for lazy loading
+ summarizer = None
+ whisper_model = None
+
+ MODEL_NAME = "brotoo/BART-NewsSummarizer"
+
+ ALLOWED_DOMAINS = {
+     "cnn.com",
+     "www.cnn.com",
+     "edition.cnn.com",
+     "nbcnews.com",
+     "www.nbcnews.com",
+     "bbc.com",
+     "www.bbc.com",
+     "bbc.co.uk",
+     "www.bbc.co.uk",
+ }
+
+
+ class SummarizeNewsRequest(BaseModel):
+     url: HttpUrl
+
+
+ class SummarizeVideoRequest(BaseModel):
+     url: HttpUrl
+
+
+ def is_valid_news_url(url: str) -> bool:
+     try:
+         parsed = urlparse(url)
+         return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
+     except Exception:
+         logger.exception("URL validation failed for %s", url)
+         return False
+
+
+ def clean_text(text: str) -> str:
+     if not text:
+         return ""
+     text = re.sub(r"\s+", " ", text)
+     return text.strip()
+
+
+ def clean_html(raw_html: str) -> str:
+     soup = BeautifulSoup(raw_html or "", "html.parser")
+     for tag in soup(["script", "style", "noscript"]):
+         tag.extract()
+     return clean_text(soup.get_text(" ", strip=True))
+
+
+ def extract_article_content(url: str) -> str:
+     article_text = ""
+     try:
+         headers = {"User-Agent": "Mozilla/5.0"}
+         response = requests.get(url, timeout=12, headers=headers)
+         response.raise_for_status()
+         html = response.text
+         document = Document(html)
+         article_text = clean_html(document.summary())
+         if not article_text:
+             soup = BeautifulSoup(html, "html.parser")
+             paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
+             article_text = clean_text(" ".join(paragraphs))
+         logger.info("Article scraped with readability/BeautifulSoup")
+     except Exception:
+         logger.exception("Article scraping failed")
+     return article_text
+
+
+ def chunk_text(text: str, max_words: int = 800) -> List[str]:
+     words = text.split()
+     if not words:
+         return []
+     return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
+
+
+ def summarize_text(text: str, model_pipeline) -> str:
+     chunks = chunk_text(text)
+     if not chunks:
+         return ""
+     partials: List[str] = []
+     for chunk in chunks:
+         try:
+             summary = model_pipeline(
+                 chunk,
+                 max_length=300,
+                 min_length=120,
+                 num_beams=4,
+                 no_repeat_ngram_size=3,
+                 do_sample=False,
+                 truncation=True,
+             )[0]["summary_text"]
+             partials.append(clean_text(summary))
+         except Exception:
+             logger.exception("Summarization failed for chunk")
+     merged = clean_text(" ".join(partials))
+     if not merged:
+         return ""
+     if len(partials) == 1:
+         return merged
+     try:
+         final_summary = model_pipeline(
+             merged,
+             max_length=300,
+             min_length=120,
+             num_beams=4,
+             no_repeat_ngram_size=3,
+             do_sample=False,
+             truncation=True,
+         )[0]["summary_text"]
+         return clean_text(final_summary)
+     except Exception:
+         logger.exception("Final summarization merge failed")
+         return merged
+
+
+ def get_summarizer():
+     global summarizer
+     if summarizer is None:
+         logger.info("Loading summarization model: %s", MODEL_NAME)
+         summarizer = pipeline(
+             "summarization",
+             model=MODEL_NAME,
+             tokenizer=MODEL_NAME,
+             device=-1,  # CPU
+         )
+         logger.info("Summarization model loaded")
+     return summarizer
+
+
+ def get_whisper():
+     global whisper_model
+     if whisper_model is None:
+         logger.info("Loading Whisper model: base")
+         import whisper  # type: ignore
+
+         whisper_model = whisper.load_model("base", device="cpu")
+         logger.info("Whisper model loaded")
+     return whisper_model
+
+
+ def temp_audio_path() -> str:
+     directory = tempfile.mkdtemp(prefix="yt_audio_")
+     return os.path.join(directory, "audio.%(ext)s")
+
+
+ def find_first_wav(path: str) -> str:
+     if os.path.isfile(path) and path.lower().endswith(".wav"):
+         return path
+     if os.path.isdir(path):
+         for entry in os.listdir(path):
+             candidate = os.path.join(path, entry)
+             if os.path.isfile(candidate) and candidate.lower().endswith(".wav"):
+                 return candidate
+     return ""
+
+
+ def download_youtube_audio(url: str) -> str:
+     output_template = temp_audio_path()
+     temp_dir = os.path.dirname(output_template)
+     ydl_opts = {
+         "format": "bestaudio/best",
+         "outtmpl": output_template,
+         "postprocessors": [
+             {
+                 "key": "FFmpegExtractAudio",
+                 "preferredcodec": "wav",
+                 "preferredquality": "192",
+             }
+         ],
+         "quiet": True,
+         "no_warnings": True,
+     }
+     with YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+     wav_path = find_first_wav(temp_dir)
+     if not wav_path:
+         raise ValueError("Failed to download or convert YouTube audio.")
+     return wav_path
+
+
+ app = FastAPI(title="News and Video Summarizer", version="1.0.0")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ @app.get("/")
+ async def root() -> Dict[str, str]:
+     return {"status": "ok", "message": "API is running"}
+
+
+ @app.get("/health")
+ async def health() -> Dict[str, str]:
+     return {"status": "healthy"}
+
+
+ @app.post("/summarize-news")
+ async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
+     logger.info("Received news summarization request for %s", payload.url)
+     if not is_valid_news_url(str(payload.url)):
+         raise HTTPException(status_code=400, detail="Unsupported news domain.")
+     try:
+         model = get_summarizer()
+     except Exception as exc:
+         logger.exception("Failed to load summarizer")
+         return {"error": f"Model load failed: {exc}"}
+
+     article_text = extract_article_content(str(payload.url))
+     if not article_text or len(article_text.split()) < 40:
+         raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
+     summary = summarize_text(article_text, model)
+     if not summary:
+         raise HTTPException(status_code=500, detail="Summarization failed.")
+     return {"summary": summary}
+
+
+ @app.post("/summarize-video")
+ async def summarize_video(payload: SummarizeVideoRequest) -> Dict[str, Any]:
+     logger.info("Received video summarization request for %s", payload.url)
+     if not any(host in str(payload.url) for host in ["youtube.com", "youtu.be"]):
+         raise HTTPException(status_code=400, detail="Only YouTube links are supported.")
+     try:
+         model = get_summarizer()
+     except Exception as exc:
+         logger.exception("Failed to load summarizer")
+         return {"error": f"Model load failed: {exc}"}
+
+     audio_path = ""
+     temp_dir = ""
+     try:
+         whisper = get_whisper()
+         audio_path = download_youtube_audio(str(payload.url))
+         temp_dir = os.path.dirname(audio_path)
+         transcript = whisper.transcribe(audio_path, language="en")
+         transcript_text = clean_text(transcript.get("text", ""))
+         if not transcript_text:
+             raise HTTPException(status_code=500, detail="No transcript text could be produced from the audio.")
+         summary = summarize_text(transcript_text, model)
+         if not summary:
+             raise HTTPException(status_code=500, detail="Summarization failed.")
+         return {"summary": summary}
+     except HTTPException:
+         raise
+     except Exception as exc:
+         logger.exception("Unexpected error during video summarization")
+         return {"error": f"Video summarization failed: {exc}"}
+     finally:
+         try:
+             if audio_path and os.path.exists(audio_path):
+                 os.remove(audio_path)
+             if temp_dir:
+                 shutil.rmtree(temp_dir, ignore_errors=True)
+         except Exception:
+             logger.exception("Failed to clean up temporary audio files")
+
+
+ if __name__ == "__main__":
+     uvicorn.run("app:app", host="0.0.0.0", port=7860, workers=1)
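
Because the models load lazily, the lightweight routes can be exercised in-process with FastAPI's TestClient (this assumes httpx is installed, which TestClient depends on); a minimal sketch:

```python
from fastapi.testclient import TestClient

from app import app  # models are loaded lazily, so importing app is cheap

client = TestClient(app)

assert client.get("/health").json() == {"status": "healthy"}

# A non-allowlisted domain is rejected with 400 before any model loads
resp = client.post("/summarize-news", json={"url": "https://example.com/story"})
assert resp.status_code == 400
```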
news.py ADDED
@@ -0,0 +1,19 @@
+ import logging
+
+ from utils import extract_article_content, is_valid_news_url, summarize_text
+
+
+ def summarize_news_article(url: str, summarizer) -> str:
+     if not is_valid_news_url(url):
+         raise ValueError("Unsupported news domain. Only CNN, NBC, or BBC links are allowed.")
+
+     article_text = extract_article_content(url)
+     if not article_text or len(article_text.split()) < 40:
+         raise ValueError("Could not extract enough article text to summarize.")
+
+     logging.info("Generating summary for news article")
+     summary = summarize_text(article_text, summarizer)
+     if not summary:
+         raise ValueError("Summarization failed for the provided article.")
+
+     return summary
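
A hypothetical caller, wiring summarize_news_article to the same checkpoint app.py loads (the article URL is a placeholder):

```python
from transformers import pipeline

from news import summarize_news_article

# Same checkpoint app.py uses; downloaded on first run
summarizer = pipeline("summarization", model="brotoo/BART-NewsSummarizer", device=-1)

try:
    summary = summarize_news_article("https://www.cnn.com/2024/01/01/example", summarizer)
    print(summary)
except ValueError as err:
    print(f"Could not summarize: {err}")
```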
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ numpy<2
+ transformers==4.46.1
+ torch==2.2.0+cpu
+ fastapi>=0.115.0
+ uvicorn[standard]>=0.30.0
+ openai-whisper==20231117
+ yt_dlp>=2023.11.16
+ readability-lxml>=0.8.1
+ beautifulsoup4>=4.12.2
+ requests>=2.31.0
+ pydantic>=1.10.15
+ lxml>=4.9.3
+ newspaper3k>=0.2.8  # provides `newspaper`, imported by utils.py
utils.py ADDED
@@ -0,0 +1,145 @@
+ import logging
+ import os
+ import re
+ import tempfile
+ from typing import List
+ from urllib.parse import urlparse
+
+ import requests
+ from bs4 import BeautifulSoup
+ from readability import Document
+ from newspaper import Article
+
+
+ ALLOWED_DOMAINS = {
+     "cnn.com",
+     "www.cnn.com",
+     "edition.cnn.com",
+     "nbcnews.com",
+     "www.nbcnews.com",
+     "bbc.com",
+     "www.bbc.com",
+     "bbc.co.uk",
+     "www.bbc.co.uk",
+ }
+
+
+ def is_valid_news_url(url: str) -> bool:
+     try:
+         parsed = urlparse(url)
+         return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
+     except Exception:
+         logging.exception("URL validation failed for %s", url)
+         return False
+
+
+ def clean_html(raw_html: str) -> str:
+     soup = BeautifulSoup(raw_html or "", "html.parser")
+     for tag in soup(["script", "style", "noscript"]):
+         tag.extract()
+     text = soup.get_text(" ", strip=True)
+     return clean_text(text)
+
+
+ def clean_text(text: str) -> str:
+     if not text:
+         return ""
+     text = re.sub(r"\s+", " ", text)
+     return text.strip()
+
+
+ def extract_article_content(url: str) -> str:
+     article_text = ""
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+         article_text = clean_text(article.text)
+         logging.info("Article scraped via newspaper3k")
+     except Exception:
+         logging.exception("Primary article scrape failed, falling back to readability/BeautifulSoup")
+
+     if article_text:
+         return article_text
+
+     try:
+         headers = {"User-Agent": "Mozilla/5.0"}
+         response = requests.get(url, timeout=12, headers=headers)
+         response.raise_for_status()
+         html = response.text
+         document = Document(html)
+         article_text = clean_html(document.summary())
+         if not article_text:
+             soup = BeautifulSoup(html, "html.parser")
+             paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
+             article_text = clean_text(" ".join(paragraphs))
+     except Exception:
+         logging.exception("Fallback scraping failed")
+
+     return article_text
+
+
+ def chunk_text(text: str, max_words: int = 800) -> List[str]:
+     words = text.split()
+     if not words:
+         return []
+     chunks: List[str] = []
+     for i in range(0, len(words), max_words):
+         chunks.append(" ".join(words[i : i + max_words]))
+     return chunks
+
+
+ def summarize_text(text: str, summarizer) -> str:
+     chunks = chunk_text(text)
+     if not chunks:
+         return ""
+
+     partial_summaries: List[str] = []
+     for chunk in chunks:
+         try:
+             summary = summarizer(
+                 chunk,
+                 max_length=300,
+                 min_length=120,
+                 do_sample=False,
+                 truncation=True,
+             )[0]["summary_text"]
+             partial_summaries.append(clean_text(summary))
+         except Exception:
+             logging.exception("Summarization failed for chunk")
+
+     merged = clean_text(" ".join(partial_summaries))
+     if not merged:
+         return ""
+
+     if len(partial_summaries) == 1:
+         return merged
+
+     try:
+         final_summary = summarizer(
+             merged,
+             max_length=300,
+             min_length=120,
+             do_sample=False,
+             truncation=True,
+         )[0]["summary_text"]
+         return clean_text(final_summary)
+     except Exception:
+         logging.exception("Final summarization merge failed")
+         return merged
+
+
+ def find_first_wav(path: str) -> str:
+     if os.path.isfile(path) and path.lower().endswith(".wav"):
+         return path
+     if os.path.isdir(path):
+         for entry in os.listdir(path):
+             candidate = os.path.join(path, entry)
+             if os.path.isfile(candidate) and candidate.lower().endswith(".wav"):
+                 return candidate
+     return ""
+
+
+ def temp_audio_path() -> str:
+     directory = tempfile.mkdtemp(prefix="yt_audio_")
+     return os.path.join(directory, "audio.%(ext)s")
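
To make the word-based chunking concrete, a quick check of chunk_text and clean_text on synthetic input:

```python
from utils import chunk_text, clean_text

text = " ".join(str(i) for i in range(1700))  # 1700 whitespace-separated tokens
sizes = [len(chunk.split()) for chunk in chunk_text(text, max_words=800)]
print(sizes)  # [800, 800, 100]

print(clean_text("  lots\n\tof   whitespace  "))  # "lots of whitespace"
```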
video.py ADDED
@@ -0,0 +1,63 @@
+ import logging
+ import os
+ import shutil
+ import tempfile
+
+ from yt_dlp import YoutubeDL
+
+ from utils import clean_text, find_first_wav, summarize_text, temp_audio_path
+
+
+ def _download_youtube_audio(url: str) -> str:
+     output_template = temp_audio_path()
+     temp_dir = os.path.dirname(output_template)
+     ydl_opts = {
+         "format": "bestaudio/best",
+         "outtmpl": output_template,
+         "postprocessors": [
+             {
+                 "key": "FFmpegExtractAudio",
+                 "preferredcodec": "wav",
+                 "preferredquality": "192",
+             }
+         ],
+         "quiet": True,
+         "no_warnings": True,
+     }
+     with YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+
+     wav_path = find_first_wav(temp_dir)
+     if not wav_path:
+         raise ValueError("Failed to download or convert YouTube audio.")
+     return wav_path
+
+
+ def summarize_video_url(url: str, summarizer, whisper_model) -> str:
+     if not any(host in url for host in ["youtube.com", "youtu.be"]):
+         raise ValueError("Only YouTube links are supported.")
+
+     audio_path = ""
+     temp_dir = ""
+     try:
+         audio_path = _download_youtube_audio(url)
+         temp_dir = os.path.dirname(audio_path)
+         logging.info("Transcribing audio with Whisper")
+         transcript = whisper_model.transcribe(audio_path, language="en")
+         transcript_text = clean_text(transcript.get("text", ""))
+         if not transcript_text:
+             raise ValueError("No transcript text could be produced from the audio.")
+
+         logging.info("Generating summary for video transcript")
+         summary = summarize_text(transcript_text, summarizer)
+         if not summary:
+             raise ValueError("Summarization failed for the provided video.")
+         return summary
+     finally:
+         try:
+             if audio_path and os.path.exists(audio_path):
+                 os.remove(audio_path)
+             if temp_dir:
+                 shutil.rmtree(temp_dir, ignore_errors=True)
+         except Exception:
+             logging.exception("Failed to clean up temporary audio files")
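
End-to-end, video.py can be driven the same way (a sketch assuming ffmpeg is on PATH and that both models download on first use; the video URL is a placeholder):

```python
import whisper
from transformers import pipeline

from video import summarize_video_url

summarizer = pipeline("summarization", model="brotoo/BART-NewsSummarizer", device=-1)
whisper_model = whisper.load_model("base", device="cpu")  # same size app.py uses

print(summarize_video_url(
    "https://www.youtube.com/watch?v=EXAMPLE",  # placeholder URL
    summarizer,
    whisper_model,
))
```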