brotoo committed on
Commit
d51a2c0
·
verified ·
1 Parent(s): e43aa6e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +86 -142
  2. requirements.txt +9 -8
app.py CHANGED
@@ -1,25 +1,19 @@
1
  import logging
2
  import os
3
  import re
 
4
  from typing import Any, Dict, List
5
- from urllib.parse import parse_qs, urlparse
6
 
7
  import requests
8
  import uvicorn
9
  from bs4 import BeautifulSoup
10
- from fastapi import FastAPI, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, HttpUrl
13
  from readability import Document
14
  from transformers import pipeline
15
- from youtube_transcript_api import (
16
- NoTranscriptFound,
17
- TranscriptsDisabled,
18
- VideoUnavailable,
19
- YouTubeTranscriptApi,
20
- )
21
 
22
- # Optional cache dir to avoid re-downloading models on restarts
23
  os.environ.setdefault("HF_HOME", "/data/hf_cache")
24
 
25
  logging.basicConfig(
@@ -28,40 +22,22 @@ logging.basicConfig(
28
  )
29
  logger = logging.getLogger("app")
30
 
31
- # Globals for lazy loading
32
  summarizer = None
 
33
 
34
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
35
 
36
  ALLOWED_DOMAINS = {
37
- "cnn.com",
38
- "www.cnn.com",
39
- "edition.cnn.com",
40
- "nbcnews.com",
41
- "www.nbcnews.com",
42
- "bbc.com",
43
- "www.bbc.com",
44
- "bbc.co.uk",
45
- "www.bbc.co.uk",
46
  }
47
 
48
-
49
  class SummarizeNewsRequest(BaseModel):
50
  url: HttpUrl
51
 
52
 
53
- class SummarizeVideoRequest(BaseModel):
54
- url: HttpUrl
55
-
56
-
57
- def is_valid_news_url(url: str) -> bool:
58
- try:
59
- parsed = urlparse(url)
60
- return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
61
- except Exception:
62
- logger.exception("URL validation failed for %s", url)
63
- return False
64
-
65
 
66
  def clean_text(text: str) -> str:
67
  if not text:
@@ -77,20 +53,22 @@ def clean_html(raw_html: str) -> str:
77
  return clean_text(soup.get_text(" ", strip=True))
78
 
79
 
 
 
80
  def extract_article_content(url: str) -> str:
81
  article_text = ""
82
  try:
83
  headers = {"User-Agent": "Mozilla/5.0"}
84
- response = requests.get(url, timeout=12, headers=headers)
85
- response.raise_for_status()
86
- html = response.text
87
  document = Document(html)
88
  article_text = clean_html(document.summary())
89
  if not article_text:
 
90
  soup = BeautifulSoup(html, "html.parser")
91
  paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
92
  article_text = clean_text(" ".join(paragraphs))
93
- logger.info("Article scraped with readability/BeautifulSoup")
94
  except Exception:
95
  logger.exception("Article scraping failed")
96
  return article_text
@@ -100,14 +78,12 @@ def chunk_text(text: str, max_words: int = 800) -> List[str]:
100
  words = text.split()
101
  if not words:
102
  return []
103
- return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
104
 
105
 
106
  def summarize_text(text: str, model_pipeline) -> str:
107
  chunks = chunk_text(text)
108
- if not chunks:
109
- return ""
110
- partials: List[str] = []
111
  for chunk in chunks:
112
  try:
113
  summary = model_pipeline(
@@ -122,13 +98,13 @@ def summarize_text(text: str, model_pipeline) -> str:
122
  partials.append(clean_text(summary))
123
  except Exception:
124
  logger.exception("Summarization failed for chunk")
 
125
  merged = clean_text(" ".join(partials))
126
- if not merged:
127
- return ""
128
- if len(partials) == 1:
129
  return merged
 
130
  try:
131
- final_summary = model_pipeline(
132
  merged,
133
  max_length=300,
134
  min_length=120,
@@ -137,77 +113,45 @@ def summarize_text(text: str, model_pipeline) -> str:
137
  do_sample=False,
138
  truncation=True,
139
  )[0]["summary_text"]
140
- return clean_text(final_summary)
141
  except Exception:
142
- logger.exception("Final summarization merge failed")
143
  return merged
144
 
145
 
146
  def get_summarizer():
147
  global summarizer
148
  if summarizer is None:
149
- logger.info("Loading summarization model: %s", MODEL_NAME)
150
  summarizer = pipeline(
151
  "summarization",
152
  model=MODEL_NAME,
153
  tokenizer=MODEL_NAME,
154
- device=-1, # CPU
155
  )
156
- logger.info("Summarization model loaded")
157
  return summarizer
158
 
159
 
160
- def extract_youtube_video_id(url: str) -> str:
161
- parsed = urlparse(url)
162
- host = (parsed.hostname or "").lower()
163
- if host in {"youtu.be"}:
164
- return parsed.path.lstrip("/").split("/")[0]
165
- if "youtube.com" in host:
166
- query_params = parse_qs(parsed.query)
167
- video_id = query_params.get("v", [""])[0]
168
- if not video_id and parsed.path.startswith("/shorts/"):
169
- video_id = parsed.path.split("/shorts/", 1)[-1].split("/")[0]
170
- return video_id
171
- return ""
172
-
173
-
174
- def extract_youtube_transcript(url: str) -> str:
175
- import socket
176
- socket.gethostbyname("www.youtube.com")
177
- video_id = extract_youtube_video_id(url)
178
- if not video_id:
179
- raise HTTPException(status_code=400, detail="Invalid YouTube URL.")
180
- try:
181
- # Support multiple youtube-transcript-api versions.
182
- if hasattr(YouTubeTranscriptApi, "get_transcript"):
183
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "id"])
184
- elif hasattr(YouTubeTranscriptApi, "list_transcripts"):
185
- transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
186
- transcript = transcripts.find_transcript(["en", "id"]).fetch()
187
- else:
188
- raise HTTPException(
189
- status_code=500,
190
- detail="youtube-transcript-api version is too old; please upgrade to >=0.6.0.",
191
- )
192
- text = " ".join(segment.get("text", "") for segment in transcript)
193
- cleaned = clean_text(text)
194
- if not cleaned:
195
- raise HTTPException(status_code=500, detail="Transcript was empty.")
196
- return cleaned
197
- except (NoTranscriptFound, TranscriptsDisabled) as exc:
198
- logger.exception("Transcript unavailable for video %s", video_id)
199
- raise HTTPException(status_code=404, detail=f"Transcript not available: {exc}") from exc
200
- except VideoUnavailable as exc:
201
- logger.exception("Video unavailable: %s", video_id)
202
- raise HTTPException(status_code=404, detail=f"Video unavailable: {exc}") from exc
203
- except HTTPException:
204
- raise
205
- except Exception as exc:
206
- logger.exception("Failed to fetch YouTube transcript")
207
- raise HTTPException(status_code=500, detail=f"Could not fetch transcript: {exc}")
208
-
209
-
210
- app = FastAPI(title="News and Video Summarizer", version="1.0.0")
211
  app.add_middleware(
212
  CORSMiddleware,
213
  allow_origins=["*"],
@@ -217,59 +161,59 @@ app.add_middleware(
217
  )
218
 
219
 
220
- @app.get("/")
221
- async def root() -> Dict[str, str]:
222
- return {"status": "ok", "message": "API is running"}
 
 
 
 
 
223
 
 
 
224
 
225
- @app.get("/health")
226
- async def health() -> Dict[str, str]:
227
- return {"status": "healthy"}
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  @app.post("/summarize-news")
231
  async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
232
- logger.info("Received news summarization request for %s", payload.url)
233
- if not is_valid_news_url(str(payload.url)):
 
 
 
 
234
  raise HTTPException(status_code=400, detail="Unsupported news domain.")
235
- try:
236
- model = get_summarizer()
237
- except Exception as exc:
238
- logger.exception("Failed to load summarizer")
239
- return {"error": f"Model load failed: {exc}"}
240
 
241
- article_text = extract_article_content(str(payload.url))
 
 
 
 
242
  if not article_text or len(article_text.split()) < 40:
243
  raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
 
 
244
  summary = summarize_text(article_text, model)
245
  if not summary:
246
  raise HTTPException(status_code=500, detail="Summarization failed.")
247
- return {"summary": summary}
248
-
249
-
250
- @app.post("/summarize-video")
251
- async def summarize_video(payload: SummarizeVideoRequest) -> Dict[str, Any]:
252
- logger.info("Received video summarization request for %s", payload.url)
253
- if not any(host in str(payload.url) for host in ["youtube.com", "youtu.be"]):
254
- raise HTTPException(status_code=400, detail="Only YouTube links are supported.")
255
- try:
256
- model = get_summarizer()
257
- except Exception as exc:
258
- logger.exception("Failed to load summarizer")
259
- return {"error": f"Model load failed: {exc}"}
260
-
261
- try:
262
- transcript_text = extract_youtube_transcript(str(payload.url))
263
- summary = summarize_text(transcript_text, model)
264
- if not summary:
265
- raise HTTPException(status_code=500, detail="Summarization failed.")
266
- return {"summary": summary}
267
- except HTTPException:
268
- raise
269
- except Exception as exc:
270
- logger.exception("Unexpected error during video summarization")
271
- return {"error": f"Video summarization failed: {exc}"}
272
-
273
 
274
- if __name__ == "__main__":
275
- uvicorn.run("app:app", host="0.0.0.0", port=7860, workers=1)
 
1
  import logging
2
  import os
3
  import re
4
+ import tempfile
5
  from typing import Any, Dict, List
 
6
 
7
  import requests
8
  import uvicorn
9
  from bs4 import BeautifulSoup
10
+ from fastapi import FastAPI, HTTPException, UploadFile, File
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, HttpUrl
13
  from readability import Document
14
  from transformers import pipeline
15
+ import whisper
 
 
 
 
 
16
 
 
17
  os.environ.setdefault("HF_HOME", "/data/hf_cache")
18
 
19
  logging.basicConfig(
 
22
  )
23
  logger = logging.getLogger("app")
24
 
 
25
  summarizer = None
26
+ whisper_model = None
27
 
28
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
29
 
30
  ALLOWED_DOMAINS = {
31
+ "cnn.com", "www.cnn.com", "edition.cnn.com",
32
+ "nbcnews.com", "www.nbcnews.com",
33
+ "bbc.com", "www.bbc.com", "bbc.co.uk", "www.bbc.co.uk",
 
 
 
 
 
 
34
  }
35
 
 
36
  class SummarizeNewsRequest(BaseModel):
37
  url: HttpUrl
38
 
39
 
40
+ # === utility clean text ===
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def clean_text(text: str) -> str:
43
  if not text:
 
53
  return clean_text(soup.get_text(" ", strip=True))
54
 
55
 
56
+ # === NEWS HANDLER ===
57
+
58
def extract_article_content(url: str) -> str:
    """Fetch *url* and return the cleaned main-article text.

    Tries readability-lxml first to isolate the article body; if that
    yields nothing, falls back to joining the text of every <p> tag.

    Args:
        url: Fully-qualified http(s) URL of the news article.

    Returns:
        Cleaned article text, or an empty string on any failure
        (network error, HTTP error status, parse failure).
    """
    article_text = ""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, timeout=12, headers=headers)
        res.raise_for_status()
        html = res.text
        document = Document(html)
        article_text = clean_html(document.summary())
        if not article_text:
            # Fallback path. NOTE: the redundant local
            # `from bs4 import BeautifulSoup` was removed — BeautifulSoup
            # is already imported at module level.
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            article_text = clean_text(" ".join(paragraphs))
    except Exception:
        # Best-effort scraper: log the failure and return whatever was
        # extracted so far (possibly ""); the caller decides how to react.
        logger.exception("Article scraping failed")
    return article_text
 
78
  words = text.split()
79
  if not words:
80
  return []
81
+ return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
82
 
83
 
84
  def summarize_text(text: str, model_pipeline) -> str:
85
  chunks = chunk_text(text)
86
+ partials = []
 
 
87
  for chunk in chunks:
88
  try:
89
  summary = model_pipeline(
 
98
  partials.append(clean_text(summary))
99
  except Exception:
100
  logger.exception("Summarization failed for chunk")
101
+
102
  merged = clean_text(" ".join(partials))
103
+ if len(partials) <= 1:
 
 
104
  return merged
105
+
106
  try:
107
+ final = model_pipeline(
108
  merged,
109
  max_length=300,
110
  min_length=120,
 
113
  do_sample=False,
114
  truncation=True,
115
  )[0]["summary_text"]
116
+ return clean_text(final)
117
  except Exception:
 
118
  return merged
119
 
120
 
121
def get_summarizer():
    """Return the shared BART summarization pipeline, creating it lazily.

    The first call builds the transformers pipeline on CPU (device=-1)
    and stores it in the module-level ``summarizer`` global; every later
    call reuses that cached instance.
    """
    global summarizer
    if summarizer is not None:
        return summarizer
    logger.info("Loading summarization model...")
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=-1,
    )
    logger.info("Summarizer ready")
    return summarizer
133
 
134
 
135
+ # === WHISPER TRANSCRIPTION FOR DIRECT FILE UPLOAD ===
136
+
137
def transcribe_uploaded_video(file_path: str) -> str:
    """Transcribe a local audio/video file with Whisper and return clean text.

    The Whisper model is loaded lazily on first use (name taken from the
    ``WHISPER_MODEL`` env var, defaulting to "small") and cached in the
    module-level ``whisper_model`` global.

    Raises:
        HTTPException: 500 when transcription yields no text at all.
    """
    global whisper_model
    if whisper_model is None:
        logger.info("Loading Whisper model...")
        # fp16 is disabled at transcription time below, so a CPU load is fine.
        whisper_model = whisper.load_model(os.getenv("WHISPER_MODEL", "small"))

    result = whisper_model.transcribe(file_path, fp16=False)
    transcript = clean_text(result.get("text", ""))
    if transcript:
        return transcript
    raise HTTPException(status_code=500, detail="Whisper transcription failed (empty text).")
149
+
150
+
151
+ # === FASTAPI APP ===
152
+
153
+ app = FastAPI(title="News and Video Summarizer", version="2.0")
154
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  app.add_middleware(
156
  CORSMiddleware,
157
  allow_origins=["*"],
 
161
  )
162
 
163
 
164
@app.post("/summarize-upload-video")
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Upload video directly (mp4/mov/mkv/m4a/wav),
    transcribe with Whisper → summarize with BART.

    Raises 400 for a missing/unsupported filename, 500 when
    transcription or summarization produces nothing.
    """
    # UploadFile.filename is client-controlled: it may be None (the old
    # code crashed with AttributeError → 500) and may contain path
    # separators, so take only the basename before touching the filesystem.
    filename = os.path.basename(file.filename or "")
    if not filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
        raise HTTPException(status_code=400, detail="Only video/audio formats are accepted.")

    tmp_dir = tempfile.mkdtemp()
    temp_path = os.path.join(tmp_dir, filename)

    try:
        # Stream the upload to disk in 1 MiB chunks instead of buffering
        # the whole (potentially large) video in memory at once.
        with open(temp_path, "wb") as f:
            while chunk := await file.read(1024 * 1024):
                f.write(chunk)

        transcript = transcribe_uploaded_video(temp_path)
        model = get_summarizer()

        summary = summarize_text(transcript, model)
        if not summary:
            raise HTTPException(status_code=500, detail="Summarization failed.")
        return {"summary": summary}

    finally:
        # Best-effort cleanup of the scratch dir; never mask the response
        # (or the HTTPException) with a cleanup error.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
            os.rmdir(tmp_dir)
        except Exception:
            pass
195
 
196
@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    """Summarize a news article from an allow-listed domain.

    Raises:
        HTTPException: 400 when the domain is not allowed or too little
            article text could be extracted; 500 when summarization fails.
    """
    # Use the documented stdlib parser rather than requests.utils.urlparse,
    # which is just an undocumented re-export of the same function.
    from urllib.parse import urlparse

    url = str(payload.url)
    logger.info("Received news summarization request for %s", url)

    # Domain validation — lowercase the host and require http(s), matching
    # the checks the previous is_valid_news_url helper performed.
    parsed = urlparse(url)
    host = (parsed.netloc or "").lower()
    if parsed.scheme not in {"http", "https"} or host not in ALLOWED_DOMAINS:
        raise HTTPException(status_code=400, detail="Unsupported news domain.")

    # Load model (lazy; cached after the first request)
    model = get_summarizer()

    # Extract article text
    article_text = extract_article_content(url)
    if not article_text or len(article_text.split()) < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")

    # Summarize
    summary = summarize_text(article_text, model)
    if not summary:
        raise HTTPException(status_code=500, detail="Summarization failed.")

    return {"summary": summary}
 
requirements.txt CHANGED
@@ -2,11 +2,12 @@ numpy<2
2
  transformers==4.46.1
3
  torch==2.2.0+cpu
4
  --extra-index-url https://download.pytorch.org/whl/cpu
5
- fastapi>=0.115.0
6
- uvicorn[standard]>=0.30.0
7
- readability-lxml>=0.8.1
8
- beautifulsoup4>=4.12.2
9
- requests>=2.31.0
10
- pydantic>=1.10.15
11
- lxml>=4.9.3
12
- youtube-transcript-api>=0.6.2
 
 
2
  transformers==4.46.1
3
  torch==2.2.0+cpu
4
  --extra-index-url https://download.pytorch.org/whl/cpu
5
+ fastapi
6
+ uvicorn[standard]
7
+ python-multipart
8
+ readability-lxml
9
+ beautifulsoup4
10
+ requests
11
+ pydantic
12
+ lxml
13
+ openai-whisper