Upload 7 files
- Dockerfile +17 -0
- README.md +12 -10
- app.py +290 -0
- news.py +19 -0
- requirements.txt +13 -0
- utils.py +145 -0
- video.py +63 -0
Dockerfile
ADDED
@@ -0,0 +1,17 @@
FROM python:3.10-slim

WORKDIR /app

# System deps for readability/yt_dlp/whisper
RUN apt-get update && \
    apt-get install -y ffmpeg libxml2 libxslt1.1 libffi-dev && \
    rm -rf /var/lib/apt/lists/*

COPY requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py ./app.py

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
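Once the image is built and running (for example, `docker build -t suma . && docker run -p 7860:7860 suma` — the image name is an assumption, not taken from the repo), a minimal smoke test against the exposed port could look like this sketch:

```python
# Smoke-test sketch: assumes a container from this Dockerfile is running
# locally with the port mapped as `docker run -p 7860:7860 ...`.
import requests

resp = requests.get("http://localhost:7860/health", timeout=5)
print(resp.status_code, resp.json())  # expected: 200 {'status': 'healthy'}
```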
README.md
CHANGED
@@ -1,11 +1,13 @@
----
-title: SUMA
-emoji: 💻
-colorFrom: pink
-colorTo: blue
-sdk: docker
-pinned: false
-license: mit
----
+# News and Video Summarizer (FastAPI on Hugging Face Spaces)
 
-
+## Features
+- POST `/summarize-news` with `{"url": "<cnn/bbc/nbc link>"}` → JSON summary.
+- POST `/summarize-video` with `{"url": "<youtube link>"}` → transcribe (Whisper base) then summarize.
+- GET `/` returns basic status; GET `/health` returns healthy.
+- CORS open to all origins.
+
+## Run locally
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+python app.py  # or uvicorn app:app --host 0.0.0.0 --port 7860
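For reference, a client call against the running server could look like the following sketch; the article URL is a placeholder, not a tested link:

```python
# Client sketch: POST a supported news link to the local server.
import requests

payload = {"url": "https://www.bbc.com/news/some-article"}  # placeholder URL
resp = requests.post("http://localhost:7860/summarize-news", json=payload, timeout=120)
print(resp.json())  # {'summary': '...'} on success
```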
app.py
ADDED
@@ -0,0 +1,290 @@
import logging
import os
import re
import shutil
import tempfile
from typing import Any, Dict, List
from urllib.parse import urlparse

import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl
from readability import Document
from transformers import pipeline
from yt_dlp import YoutubeDL

# Optional cache dir to avoid re-downloading models on restarts
os.environ.setdefault("HF_HOME", "/data/hf_cache")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger("app")

# Globals for lazy loading
summarizer = None
whisper_model = None

MODEL_NAME = "brotoo/BART-NewsSummarizer"

ALLOWED_DOMAINS = {
    "cnn.com",
    "www.cnn.com",
    "edition.cnn.com",
    "nbcnews.com",
    "www.nbcnews.com",
    "bbc.com",
    "www.bbc.com",
    "bbc.co.uk",
    "www.bbc.co.uk",
}


class SummarizeNewsRequest(BaseModel):
    url: HttpUrl


class SummarizeVideoRequest(BaseModel):
    url: HttpUrl


def is_valid_news_url(url: str) -> bool:
    try:
        parsed = urlparse(url)
        return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
    except Exception:
        logger.exception("URL validation failed for %s", url)
        return False


def clean_text(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def clean_html(raw_html: str) -> str:
    soup = BeautifulSoup(raw_html or "", "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return clean_text(soup.get_text(" ", strip=True))


def extract_article_content(url: str) -> str:
    article_text = ""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, timeout=12, headers=headers)
        response.raise_for_status()
        html = response.text
        document = Document(html)
        article_text = clean_html(document.summary())
        if not article_text:
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            article_text = clean_text(" ".join(paragraphs))
        logger.info("Article scraped with readability/BeautifulSoup")
    except Exception:
        logger.exception("Article scraping failed")
    return article_text


def chunk_text(text: str, max_words: int = 800) -> List[str]:
    words = text.split()
    if not words:
        return []
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]


def summarize_text(text: str, model_pipeline) -> str:
    chunks = chunk_text(text)
    if not chunks:
        return ""
    partials: List[str] = []
    for chunk in chunks:
        try:
            summary = model_pipeline(
                chunk,
                max_length=300,
                min_length=120,
                num_beams=4,
                no_repeat_ngram_size=3,
                do_sample=False,
                truncation=True,
            )[0]["summary_text"]
            partials.append(clean_text(summary))
        except Exception:
            logger.exception("Summarization failed for chunk")
    merged = clean_text(" ".join(partials))
    if not merged:
        return ""
    if len(partials) == 1:
        return merged
    try:
        final_summary = model_pipeline(
            merged,
            max_length=300,
            min_length=120,
            num_beams=4,
            no_repeat_ngram_size=3,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        return clean_text(final_summary)
    except Exception:
        logger.exception("Final summarization merge failed")
        return merged


def get_summarizer():
    global summarizer
    if summarizer is None:
        logger.info("Loading summarization model: %s", MODEL_NAME)
        summarizer = pipeline(
            "summarization",
            model=MODEL_NAME,
            tokenizer=MODEL_NAME,
            device=-1,  # CPU
        )
        logger.info("Summarization model loaded")
    return summarizer


def get_whisper():
    global whisper_model
    if whisper_model is None:
        logger.info("Loading Whisper model: base")
        import whisper  # type: ignore

        whisper_model = whisper.load_model("base", device="cpu")
        logger.info("Whisper model loaded")
    return whisper_model


def temp_audio_path() -> str:
    directory = tempfile.mkdtemp(prefix="yt_audio_")
    return os.path.join(directory, "audio.%(ext)s")


def find_first_wav(path: str) -> str:
    if os.path.isfile(path) and path.lower().endswith(".wav"):
        return path
    if os.path.isdir(path):
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            if os.path.isfile(candidate) and candidate.lower().endswith(".wav"):
                return candidate
    return ""


def download_youtube_audio(url: str) -> str:
    output_template = temp_audio_path()
    temp_dir = os.path.dirname(output_template)
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": output_template,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
        "quiet": True,
        "no_warnings": True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    wav_path = find_first_wav(temp_dir)
    if not wav_path:
        raise ValueError("Failed to download or convert YouTube audio.")
    return wav_path


app = FastAPI(title="News and Video Summarizer", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def root() -> Dict[str, str]:
    return {"status": "ok", "message": "API is running"}


@app.get("/health")
async def health() -> Dict[str, str]:
    return {"status": "healthy"}


@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    logger.info("Received news summarization request for %s", payload.url)
    if not is_valid_news_url(str(payload.url)):
        raise HTTPException(status_code=400, detail="Unsupported news domain.")
    try:
        model = get_summarizer()
    except Exception as exc:
        logger.exception("Failed to load summarizer")
        return {"error": f"Model load failed: {exc}"}

    article_text = extract_article_content(str(payload.url))
    if not article_text or len(article_text.split()) < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
    summary = summarize_text(article_text, model)
    if not summary:
        raise HTTPException(status_code=500, detail="Summarization failed.")
    return {"summary": summary}


@app.post("/summarize-video")
async def summarize_video(payload: SummarizeVideoRequest) -> Dict[str, Any]:
    logger.info("Received video summarization request for %s", payload.url)
    if not any(host in str(payload.url) for host in ["youtube.com", "youtu.be"]):
        raise HTTPException(status_code=400, detail="Only YouTube links are supported.")
    try:
        model = get_summarizer()
    except Exception as exc:
        logger.exception("Failed to load summarizer")
        return {"error": f"Model load failed: {exc}"}

    audio_path = ""
    temp_dir = ""
    try:
        whisper = get_whisper()
        audio_path = download_youtube_audio(str(payload.url))
        temp_dir = os.path.dirname(audio_path)
        transcript = whisper.transcribe(audio_path, language="en")
        transcript_text = clean_text(transcript.get("text", ""))
        if not transcript_text:
            raise HTTPException(status_code=500, detail="No transcript text could be produced from the audio.")
        summary = summarize_text(transcript_text, model)
        if not summary:
            raise HTTPException(status_code=500, detail="Summarization failed.")
        return {"summary": summary}
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("Unexpected error during video summarization")
        return {"error": f"Video summarization failed: {exc}"}
    finally:
        try:
            if audio_path and os.path.exists(audio_path):
                os.remove(audio_path)
            if temp_dir:
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception:
            logger.exception("Failed to clean up temporary audio files")


if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, workers=1)
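A lightweight way to exercise these routes without a running server is FastAPI's TestClient. A sketch, assuming `httpx` is available (TestClient depends on it) and the requirements above are installed:

```python
# In-process check (a sketch) using FastAPI's TestClient; no server needed.
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)
print(client.get("/health").json())  # {'status': 'healthy'}
# A URL outside ALLOWED_DOMAINS is rejected before any model loads:
resp = client.post("/summarize-news", json={"url": "https://example.com/x"})
print(resp.status_code)  # 400: unsupported news domain
```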
news.py
ADDED
@@ -0,0 +1,19 @@
import logging

from utils import extract_article_content, is_valid_news_url, summarize_text


def summarize_news_article(url: str, summarizer) -> str:
    if not is_valid_news_url(url):
        raise ValueError("Unsupported news domain. Only CNN, NBC, or BBC links are allowed.")

    article_text = extract_article_content(url)
    if not article_text or len(article_text.split()) < 40:
        raise ValueError("Could not extract enough article text to summarize.")

    logging.info("Generating summary for news article")
    summary = summarize_text(article_text, summarizer)
    if not summary:
        raise ValueError("Summarization failed for the provided article.")

    return summary
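news.py leaves model loading to the caller. A minimal wiring sketch, assuming the same model name used in app.py and a placeholder article URL:

```python
# Wiring sketch: inject a transformers pipeline into summarize_news_article.
from transformers import pipeline

from news import summarize_news_article

summarizer = pipeline("summarization", model="brotoo/BART-NewsSummarizer", device=-1)
print(summarize_news_article("https://www.cnn.com/some-article", summarizer))  # placeholder URL
```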
requirements.txt
ADDED
@@ -0,0 +1,13 @@
numpy<2
transformers==4.46.1
torch==2.2.0+cpu
--extra-index-url https://download.pytorch.org/whl/cpu
fastapi>=0.115.0
uvicorn[standard]>=0.30.0
openai-whisper==20231117
yt_dlp>=2023.11.16
readability-lxml>=0.8.1
beautifulsoup4>=4.12.2
requests>=2.31.0
pydantic>=1.10.15
lxml>=4.9.3
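The `--extra-index-url` line lets pip resolve the `+cpu` local-version wheel from the PyTorch CPU index. A quick check after `pip install -r requirements.txt` (a sketch):

```python
# Sanity check that the CPU-only torch wheel resolved.
import torch

print(torch.__version__)          # expect something like "2.2.0+cpu"
print(torch.cuda.is_available())  # False for the +cpu build
```

Note that utils.py below imports newspaper3k (`from newspaper import Article`), which is not pinned here; only app.py, which is self-contained, is copied into the Docker image.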
utils.py
ADDED
@@ -0,0 +1,145 @@
import logging
import os
import re
import tempfile
from typing import List
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from readability import Document
from newspaper import Article


ALLOWED_DOMAINS = {
    "cnn.com",
    "www.cnn.com",
    "edition.cnn.com",
    "nbcnews.com",
    "www.nbcnews.com",
    "bbc.com",
    "www.bbc.com",
    "bbc.co.uk",
    "www.bbc.co.uk",
}


def is_valid_news_url(url: str) -> bool:
    try:
        parsed = urlparse(url)
        return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
    except Exception:
        logging.exception("URL validation failed for %s", url)
        return False


def clean_html(raw_html: str) -> str:
    soup = BeautifulSoup(raw_html or "", "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    text = soup.get_text(" ", strip=True)
    return clean_text(text)


def clean_text(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def extract_article_content(url: str) -> str:
    article_text = ""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article_text = clean_text(article.text)
        logging.info("Article scraped via newspaper3k")
    except Exception:
        logging.exception("Primary article scrape failed, falling back to readability/BeautifulSoup")

    if article_text:
        return article_text

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, timeout=12, headers=headers)
        response.raise_for_status()
        html = response.text
        document = Document(html)
        article_text = clean_html(document.summary())
        if not article_text:
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            article_text = clean_text(" ".join(paragraphs))
    except Exception:
        logging.exception("Fallback scraping failed")

    return article_text


def chunk_text(text: str, max_words: int = 800) -> List[str]:
    words = text.split()
    if not words:
        return []
    chunks: List[str] = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i : i + max_words]))
    return chunks


def summarize_text(text: str, summarizer) -> str:
    chunks = chunk_text(text)
    if not chunks:
        return ""

    partial_summaries: List[str] = []
    for chunk in chunks:
        try:
            summary = summarizer(
                chunk,
                max_length=300,
                min_length=120,
                do_sample=False,
                truncation=True,
            )[0]["summary_text"]
            partial_summaries.append(clean_text(summary))
        except Exception:
            logging.exception("Summarization failed for chunk")

    merged = clean_text(" ".join(partial_summaries))
    if not merged:
        return ""

    if len(partial_summaries) == 1:
        return merged

    try:
        final_summary = summarizer(
            merged,
            max_length=300,
            min_length=120,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        return clean_text(final_summary)
    except Exception:
        logging.exception("Final summarization merge failed")
        return merged


def find_first_wav(path: str) -> str:
    if os.path.isfile(path) and path.lower().endswith(".wav"):
        return path
    if os.path.isdir(path):
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            if os.path.isfile(candidate) and candidate.lower().endswith(".wav"):
                return candidate
    return ""


def temp_audio_path() -> str:
    directory = tempfile.mkdtemp(prefix="yt_audio_")
    return os.path.join(directory, "audio.%(ext)s")
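The word-based chunking is easiest to see on a toy input; the split is on whitespace and the last chunk may be shorter than `max_words`:

```python
# Behavior sketch for chunk_text on a tiny input.
from utils import chunk_text

text = " ".join(f"w{i}" for i in range(5))
print(chunk_text(text, max_words=2))  # ['w0 w1', 'w2 w3', 'w4']
```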
video.py
ADDED
@@ -0,0 +1,63 @@
import logging
import os
import shutil
import tempfile

from yt_dlp import YoutubeDL

from utils import clean_text, find_first_wav, summarize_text, temp_audio_path


def _download_youtube_audio(url: str) -> str:
    output_template = temp_audio_path()
    temp_dir = os.path.dirname(output_template)
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": output_template,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
        "quiet": True,
        "no_warnings": True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    wav_path = find_first_wav(temp_dir)
    if not wav_path:
        raise ValueError("Failed to download or convert YouTube audio.")
    return wav_path


def summarize_video_url(url: str, summarizer, whisper_model) -> str:
    if not any(host in url for host in ["youtube.com", "youtu.be"]):
        raise ValueError("Only YouTube links are supported.")

    audio_path = ""
    temp_dir = ""
    try:
        audio_path = _download_youtube_audio(url)
        temp_dir = os.path.dirname(audio_path)
        logging.info("Transcribing audio with Whisper")
        transcript = whisper_model.transcribe(audio_path, language="en")
        transcript_text = clean_text(transcript.get("text", ""))
        if not transcript_text:
            raise ValueError("No transcript text could be produced from the audio.")

        logging.info("Generating summary for video transcript")
        summary = summarize_text(transcript_text, summarizer)
        if not summary:
            raise ValueError("Summarization failed for the provided video.")
        return summary
    finally:
        try:
            if audio_path and os.path.exists(audio_path):
                os.remove(audio_path)
            if temp_dir:
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception:
            logging.exception("Failed to clean up temporary audio files")
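summarize_video_url takes both models as arguments. A caller sketch mirroring the lazy loaders in app.py, with a placeholder video ID:

```python
# Caller sketch for video.py: the summarizer and Whisper model are injected.
import whisper
from transformers import pipeline

from video import summarize_video_url

summarizer = pipeline("summarization", model="brotoo/BART-NewsSummarizer", device=-1)
whisper_model = whisper.load_model("base", device="cpu")
print(summarize_video_url("https://youtu.be/VIDEO_ID", summarizer, whisper_model))  # placeholder ID
```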