brotoo committed on
Commit
b7782db
·
verified ·
1 Parent(s): d51a2c0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -16
app.py CHANGED
@@ -27,11 +27,6 @@ whisper_model = None
27
 
28
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
29
 
30
- ALLOWED_DOMAINS = {
31
- "cnn.com", "www.cnn.com", "edition.cnn.com",
32
- "nbcnews.com", "www.nbcnews.com",
33
- "bbc.com", "www.bbc.com", "bbc.co.uk", "www.bbc.co.uk",
34
- }
35
 
36
  class SummarizeNewsRequest(BaseModel):
37
  url: HttpUrl
@@ -65,7 +60,6 @@ def extract_article_content(url: str) -> str:
65
  document = Document(html)
66
  article_text = clean_html(document.summary())
67
  if not article_text:
68
- from bs4 import BeautifulSoup
69
  soup = BeautifulSoup(html, "html.parser")
70
  paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
71
  article_text = clean_text(" ".join(paragraphs))
@@ -78,7 +72,7 @@ def chunk_text(text: str, max_words: int = 800) -> List[str]:
78
  words = text.split()
79
  if not words:
80
  return []
81
- return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
82
 
83
 
84
  def summarize_text(text: str, model_pipeline) -> str:
@@ -139,7 +133,7 @@ def transcribe_uploaded_video(file_path: str) -> str:
139
  if whisper_model is None:
140
  model_name = os.getenv("WHISPER_MODEL", "small")
141
  logger.info("Loading Whisper model...")
142
- whisper_model = whisper.load_model(model_name) # CPU
143
 
144
  result = whisper_model.transcribe(file_path, fp16=False)
145
  text = clean_text(result.get("text", ""))
@@ -164,7 +158,7 @@ app.add_middleware(
164
  @app.post("/summarize-upload-video")
165
  async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
166
  """
167
- Upload video directly (mp4/mov/mkv/m4a/wav),
168
  transcribe with Whisper → summarize with BART.
169
  """
170
  if not file.filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
@@ -193,25 +187,20 @@ async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]
193
  except Exception:
194
  pass
195
 
 
196
  @app.post("/summarize-news")
197
  async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
198
  url = str(payload.url)
199
  logger.info("Received news summarization request for %s", url)
200
 
201
- # Validasi domain
202
- parsed = requests.utils.urlparse(url)
203
- if parsed.netloc not in ALLOWED_DOMAINS:
204
- raise HTTPException(status_code=400, detail="Unsupported news domain.")
205
 
206
- # Load model
207
  model = get_summarizer()
208
 
209
- # Ekstrak artikel
210
  article_text = extract_article_content(url)
211
  if not article_text or len(article_text.split()) < 40:
212
  raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
213
 
214
- # Summarize
215
  summary = summarize_text(article_text, model)
216
  if not summary:
217
  raise HTTPException(status_code=500, detail="Summarization failed.")
 
27
 
28
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
29
 
 
 
 
 
 
30
 
31
  class SummarizeNewsRequest(BaseModel):
32
  url: HttpUrl
 
60
  document = Document(html)
61
  article_text = clean_html(document.summary())
62
  if not article_text:
 
63
  soup = BeautifulSoup(html, "html.parser")
64
  paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
65
  article_text = clean_text(" ".join(paragraphs))
 
72
  words = text.split()
73
  if not words:
74
  return []
75
+ return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
76
 
77
 
78
  def summarize_text(text: str, model_pipeline) -> str:
 
133
  if whisper_model is None:
134
  model_name = os.getenv("WHISPER_MODEL", "small")
135
  logger.info("Loading Whisper model...")
136
+ whisper_model = whisper.load_model(model_name)
137
 
138
  result = whisper_model.transcribe(file_path, fp16=False)
139
  text = clean_text(result.get("text", ""))
 
158
  @app.post("/summarize-upload-video")
159
  async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
160
  """
161
+ Upload video/audio,
162
  transcribe with Whisper → summarize with BART.
163
  """
164
  if not file.filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
 
187
  except Exception:
188
  pass
189
 
190
+
191
  @app.post("/summarize-news")
192
  async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
193
  url = str(payload.url)
194
  logger.info("Received news summarization request for %s", url)
195
 
196
+ # ⛔️ DOMAIN CHECK REMOVED — now accepts any domain
 
 
 
197
 
 
198
  model = get_summarizer()
199
 
 
200
  article_text = extract_article_content(url)
201
  if not article_text or len(article_text.split()) < 40:
202
  raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
203
 
 
204
  summary = summarize_text(article_text, model)
205
  if not summary:
206
  raise HTTPException(status_code=500, detail="Summarization failed.")