Upload app.py
Browse files
app.py
CHANGED
|
@@ -27,11 +27,6 @@ whisper_model = None
|
|
| 27 |
|
| 28 |
MODEL_NAME = "brotoo/BART-NewsSummarizer"
|
| 29 |
|
| 30 |
-
ALLOWED_DOMAINS = {
|
| 31 |
-
"cnn.com", "www.cnn.com", "edition.cnn.com",
|
| 32 |
-
"nbcnews.com", "www.nbcnews.com",
|
| 33 |
-
"bbc.com", "www.bbc.com", "bbc.co.uk", "www.bbc.co.uk",
|
| 34 |
-
}
|
| 35 |
|
| 36 |
class SummarizeNewsRequest(BaseModel):
|
| 37 |
url: HttpUrl
|
|
@@ -65,7 +60,6 @@ def extract_article_content(url: str) -> str:
|
|
| 65 |
document = Document(html)
|
| 66 |
article_text = clean_html(document.summary())
|
| 67 |
if not article_text:
|
| 68 |
-
from bs4 import BeautifulSoup
|
| 69 |
soup = BeautifulSoup(html, "html.parser")
|
| 70 |
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
|
| 71 |
article_text = clean_text(" ".join(paragraphs))
|
|
@@ -78,7 +72,7 @@ def chunk_text(text: str, max_words: int = 800) -> List[str]:
|
|
| 78 |
words = text.split()
|
| 79 |
if not words:
|
| 80 |
return []
|
| 81 |
-
return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
|
| 82 |
|
| 83 |
|
| 84 |
def summarize_text(text: str, model_pipeline) -> str:
|
|
@@ -139,7 +133,7 @@ def transcribe_uploaded_video(file_path: str) -> str:
|
|
| 139 |
if whisper_model is None:
|
| 140 |
model_name = os.getenv("WHISPER_MODEL", "small")
|
| 141 |
logger.info("Loading Whisper model...")
|
| 142 |
-
whisper_model = whisper.load_model(model_name)
|
| 143 |
|
| 144 |
result = whisper_model.transcribe(file_path, fp16=False)
|
| 145 |
text = clean_text(result.get("text", ""))
|
|
@@ -164,7 +158,7 @@ app.add_middleware(
|
|
| 164 |
@app.post("/summarize-upload-video")
|
| 165 |
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
|
| 166 |
"""
|
| 167 |
-
Upload video
|
| 168 |
transcribe with Whisper → summarize with BART.
|
| 169 |
"""
|
| 170 |
if not file.filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
|
|
@@ -193,25 +187,20 @@ async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]
|
|
| 193 |
except Exception:
|
| 194 |
pass
|
| 195 |
|
|
|
|
| 196 |
@app.post("/summarize-news")
|
| 197 |
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
|
| 198 |
url = str(payload.url)
|
| 199 |
logger.info("Received news summarization request for %s", url)
|
| 200 |
|
| 201 |
-
#
|
| 202 |
-
parsed = requests.utils.urlparse(url)
|
| 203 |
-
if parsed.netloc not in ALLOWED_DOMAINS:
|
| 204 |
-
raise HTTPException(status_code=400, detail="Unsupported news domain.")
|
| 205 |
|
| 206 |
-
# Load model
|
| 207 |
model = get_summarizer()
|
| 208 |
|
| 209 |
-
# Ekstrak artikel
|
| 210 |
article_text = extract_article_content(url)
|
| 211 |
if not article_text or len(article_text.split()) < 40:
|
| 212 |
raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
|
| 213 |
|
| 214 |
-
# Summarize
|
| 215 |
summary = summarize_text(article_text, model)
|
| 216 |
if not summary:
|
| 217 |
raise HTTPException(status_code=500, detail="Summarization failed.")
|
|
|
|
| 27 |
|
| 28 |
MODEL_NAME = "brotoo/BART-NewsSummarizer"
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
class SummarizeNewsRequest(BaseModel):
|
| 32 |
url: HttpUrl
|
|
|
|
| 60 |
document = Document(html)
|
| 61 |
article_text = clean_html(document.summary())
|
| 62 |
if not article_text:
|
|
|
|
| 63 |
soup = BeautifulSoup(html, "html.parser")
|
| 64 |
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
|
| 65 |
article_text = clean_text(" ".join(paragraphs))
|
|
|
|
| 72 |
words = text.split()
|
| 73 |
if not words:
|
| 74 |
return []
|
| 75 |
+
return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
|
| 76 |
|
| 77 |
|
| 78 |
def summarize_text(text: str, model_pipeline) -> str:
|
|
|
|
| 133 |
if whisper_model is None:
|
| 134 |
model_name = os.getenv("WHISPER_MODEL", "small")
|
| 135 |
logger.info("Loading Whisper model...")
|
| 136 |
+
whisper_model = whisper.load_model(model_name)
|
| 137 |
|
| 138 |
result = whisper_model.transcribe(file_path, fp16=False)
|
| 139 |
text = clean_text(result.get("text", ""))
|
|
|
|
| 158 |
@app.post("/summarize-upload-video")
|
| 159 |
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
|
| 160 |
"""
|
| 161 |
+
Upload video/audio,
|
| 162 |
transcribe with Whisper → summarize with BART.
|
| 163 |
"""
|
| 164 |
if not file.filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
|
|
|
|
| 187 |
except Exception:
|
| 188 |
pass
|
| 189 |
|
| 190 |
+
|
| 191 |
@app.post("/summarize-news")
|
| 192 |
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
|
| 193 |
url = str(payload.url)
|
| 194 |
logger.info("Received news summarization request for %s", url)
|
| 195 |
|
| 196 |
+
# ⛔️ DOMAIN CHECK REMOVED — now accepts any domain
|
|
|
|
|
|
|
|
|
|
| 197 |
|
|
|
|
| 198 |
model = get_summarizer()
|
| 199 |
|
|
|
|
| 200 |
article_text = extract_article_content(url)
|
| 201 |
if not article_text or len(article_text.split()) < 40:
|
| 202 |
raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
|
| 203 |
|
|
|
|
| 204 |
summary = summarize_text(article_text, model)
|
| 205 |
if not summary:
|
| 206 |
raise HTTPException(status_code=500, detail="Summarization failed.")
|