import logging
import os
import re
import tempfile
from typing import Any, Dict, List
import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl
from readability import Document
from transformers import pipeline
import whisper
# Default the Hugging Face cache directory unless the environment already sets it.
# NOTE(review): this runs after `transformers` has been imported above; confirm
# the cache location is still picked up when set this late.
os.environ.setdefault("HF_HOME", "/data/hf_cache")

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("app")

# Model handles start empty and are populated on first use by the loader
# functions further down in this module.
summarizer = None
whisper_model = None

# Model identifier passed to the summarization pipeline.
MODEL_NAME = "brotoo/BART-NewsSummarizer"
class SummarizeNewsRequest(BaseModel):
    """Request body carrying the URL of the article to summarize."""

    # Validated by pydantic as an HTTP(S) URL.
    url: HttpUrl
# === utility clean text ===
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    return re.sub(r"\s+", " ", text).strip() if text else ""
def clean_html(raw_html: str) -> str:
    """Return the text of an HTML fragment with whitespace normalised.

    ``script``, ``style`` and ``noscript`` elements are removed before the
    text is extracted. Falsy input is parsed as an empty document.
    """
    parsed = BeautifulSoup(raw_html or "", "html.parser")
    for node in parsed.find_all(["script", "style", "noscript"]):
        node.extract()
    visible = parsed.get_text(" ", strip=True)
    return clean_text(visible)
# === NEWS HANDLER ===
def extract_article_content(url: str) -> str:
    """Download *url* and return its main article text.

    The page is first run through readability's ``Document``; if that yields
    no text, the text of every ``<p>`` element is joined instead. Any failure
    (network error, bad status, parsing error) is logged and results in an
    empty string rather than an exception.
    """
    try:
        response = requests.get(
            url,
            timeout=12,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        response.raise_for_status()
        page = response.text

        extracted = clean_html(Document(page).summary())
        if extracted:
            return extracted

        # Fallback: readability produced nothing, so gather plain paragraphs.
        parsed = BeautifulSoup(page, "html.parser")
        return clean_text(
            " ".join(p.get_text(" ", strip=True) for p in parsed.find_all("p"))
        )
    except Exception:
        logger.exception("Article scraping failed")
        return ""
def chunk_text(text: str, max_words: int = 800) -> List[str]:
    """Split *text* into consecutive pieces of at most *max_words* words.

    Words are whitespace-separated tokens; each piece is re-joined with single
    spaces. Text containing no words yields an empty list.
    """
    tokens = text.split()
    if not tokens:
        return []

    pieces: List[str] = []
    for start in range(0, len(tokens), max_words):
        pieces.append(" ".join(tokens[start:start + max_words]))
    return pieces
def summarize_text(text: str, model_pipeline) -> str:
    """Summarize *text* with *model_pipeline*, chunking long input.

    The text is split with ``chunk_text`` and each chunk is summarized
    independently; a chunk that fails is logged and skipped. When more than
    one partial summary is produced, the partials are joined and summarized
    once more to give a single result.

    Args:
        text: The text to summarize.
        model_pipeline: Callable invoked as ``model_pipeline(text, **kwargs)``
            whose result is indexed as ``[0]["summary_text"]``.

    Returns:
        The cleaned summary. This is an empty string when no chunk could be
        summarized, and the joined partial summaries when the final pass fails.
    """
    # One set of generation settings shared by the per-chunk and final passes.
    generation_kwargs = {
        "max_length": 300,
        "min_length": 120,
        "num_beams": 4,
        "no_repeat_ngram_size": 3,
        "do_sample": False,
        "truncation": True,
    }

    def _summarize(passage: str) -> str:
        # Run one pipeline call and return the cleaned summary text.
        output = model_pipeline(passage, **generation_kwargs)
        return clean_text(output[0]["summary_text"])

    partials = []
    for chunk in chunk_text(text):
        try:
            partials.append(_summarize(chunk))
        except Exception:
            # Best effort: one bad chunk must not discard the others.
            logger.exception("Summarization failed for chunk")

    merged = clean_text(" ".join(partials))
    if len(partials) <= 1:
        return merged

    try:
        return _summarize(merged)
    except Exception:
        # Still fall back to the merged partials, but record why the final
        # pass failed instead of hiding the error.
        logger.exception(
            "Final summarization pass failed; returning merged partial summaries"
        )
        return merged
def get_summarizer():
    """Return the shared summarization pipeline, creating it on first call.

    The pipeline is stored in the module-level ``summarizer`` so later calls
    reuse the same object.
    """
    global summarizer
    if summarizer is not None:
        return summarizer

    logger.info("Loading summarization model...")
    # device=-1: presumably CPU inference; confirm against the transformers docs.
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=-1,
    )
    logger.info("Summarizer ready")
    return summarizer
# === WHISPER TRANSCRIPTION FOR DIRECT FILE UPLOAD ===
def transcribe_uploaded_video(file_path: str) -> str:
    """Transcribe the media file at *file_path* with Whisper.

    The Whisper model is loaded on first use (its name is read from the
    ``WHISPER_MODEL`` environment variable, default ``"small"``) and kept in
    the module-level ``whisper_model``.

    Raises:
        HTTPException: status 500 when the transcription text is empty.
    """
    global whisper_model
    if whisper_model is None:
        logger.info("Loading Whisper model...")
        whisper_model = whisper.load_model(os.getenv("WHISPER_MODEL", "small"))

    output = whisper_model.transcribe(file_path, fp16=False)
    transcript = clean_text(output.get("text", ""))
    if transcript:
        return transcript
    raise HTTPException(status_code=500, detail="Whisper transcription failed (empty text).")
# === FASTAPI APP ===
app = FastAPI(title="News and Video Summarizer", version="2.0")

# CORS is fully open: any origin, method and header is allowed.
# NOTE(review): wildcard origins are combined with allow_credentials=True here;
# FastAPI's CORS documentation advises against that pairing. Confirm it is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.post("/summarize-upload-video")
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Upload video/audio, transcribe it with Whisper, then summarize with BART.

    The upload is written to a private temporary directory under a
    server-chosen file name, transcribed, summarized, and the temporary
    files are removed afterwards.

    Returns:
        ``{"summary": <text>}``.

    Raises:
        HTTPException: 400 when the file name is missing or does not end in
            an accepted extension; 500 when summarization yields no text.
    """
    allowed_suffixes = (".mp4", ".mov", ".mkv", ".m4a", ".wav")

    # The client may omit the file name entirely; treat that as unsupported
    # instead of failing on None.
    client_name = (file.filename or "").lower()
    suffix = next((s for s in allowed_suffixes if client_name.endswith(s)), None)
    if suffix is None:
        raise HTTPException(status_code=400, detail="Only video/audio formats are accepted.")

    tmp_dir = tempfile.mkdtemp()
    # Do not build the path from the client-supplied name: an absolute or
    # "../" name would make os.path.join escape tmp_dir. Only the validated
    # extension is kept.
    temp_path = os.path.join(tmp_dir, "upload" + suffix)
    try:
        with open(temp_path, "wb") as f:
            f.write(await file.read())
        transcript = transcribe_uploaded_video(temp_path)
        model = get_summarizer()
        summary = summarize_text(transcript, model)
        if not summary:
            raise HTTPException(status_code=500, detail="Summarization failed.")
        return {"summary": summary}
    finally:
        # Cleanup stays best-effort, but a failure is logged rather than hidden.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
            os.rmdir(tmp_dir)
        except OSError:
            logger.warning(
                "Could not remove temporary upload directory %s", tmp_dir, exc_info=True
            )
@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    """Fetch the article at ``payload.url`` and return its summary.

    Returns:
        ``{"summary": <text>}``.

    Raises:
        HTTPException: 400 when fewer than 40 words of article text could be
            extracted; 500 when summarization yields no text.
    """
    target_url = str(payload.url)
    logger.info("Received news summarization request for %s", target_url)

    # No domain allow-list is applied here; any URL that validates is fetched.
    model = get_summarizer()
    body = extract_article_content(target_url)

    word_count = len(body.split()) if body else 0
    if word_count < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")

    result = summarize_text(body, model)
    if result:
        return {"summary": result}
    raise HTTPException(status_code=500, detail="Summarization failed.")