brotoo committed on
Commit
d51a2c0
·
verified ·
1 Parent(s): e43aa6e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +86 -142
  2. requirements.txt +9 -8
app.py CHANGED
@@ -1,25 +1,19 @@
1
  import logging
2
  import os
3
  import re
 
4
  from typing import Any, Dict, List
5
- from urllib.parse import parse_qs, urlparse
6
 
7
  import requests
8
  import uvicorn
9
  from bs4 import BeautifulSoup
10
- from fastapi import FastAPI, HTTPException
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, HttpUrl
13
  from readability import Document
14
  from transformers import pipeline
15
- from youtube_transcript_api import (
16
- NoTranscriptFound,
17
- TranscriptsDisabled,
18
- VideoUnavailable,
19
- YouTubeTranscriptApi,
20
- )
21
 
22
- # Optional cache dir to avoid re-downloading models on restarts
23
  os.environ.setdefault("HF_HOME", "/data/hf_cache")
24
 
25
  logging.basicConfig(
@@ -28,40 +22,22 @@ logging.basicConfig(
28
  )
29
  logger = logging.getLogger("app")
30
 
31
- # Globals for lazy loading
32
  summarizer = None
 
33
 
34
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
35
 
36
  ALLOWED_DOMAINS = {
37
- "cnn.com",
38
- "www.cnn.com",
39
- "edition.cnn.com",
40
- "nbcnews.com",
41
- "www.nbcnews.com",
42
- "bbc.com",
43
- "www.bbc.com",
44
- "bbc.co.uk",
45
- "www.bbc.co.uk",
46
  }
47
 
48
-
49
  class SummarizeNewsRequest(BaseModel):
50
  url: HttpUrl
51
 
52
 
53
- class SummarizeVideoRequest(BaseModel):
54
- url: HttpUrl
55
-
56
-
57
- def is_valid_news_url(url: str) -> bool:
58
- try:
59
- parsed = urlparse(url)
60
- return parsed.scheme in {"http", "https"} and parsed.netloc.lower() in ALLOWED_DOMAINS
61
- except Exception:
62
- logger.exception("URL validation failed for %s", url)
63
- return False
64
-
65
 
66
  def clean_text(text: str) -> str:
67
  if not text:
@@ -77,20 +53,22 @@ def clean_html(raw_html: str) -> str:
77
  return clean_text(soup.get_text(" ", strip=True))
78
 
79
 
 
 
80
  def extract_article_content(url: str) -> str:
81
  article_text = ""
82
  try:
83
  headers = {"User-Agent": "Mozilla/5.0"}
84
- response = requests.get(url, timeout=12, headers=headers)
85
- response.raise_for_status()
86
- html = response.text
87
  document = Document(html)
88
  article_text = clean_html(document.summary())
89
  if not article_text:
 
90
  soup = BeautifulSoup(html, "html.parser")
91
  paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
92
  article_text = clean_text(" ".join(paragraphs))
93
- logger.info("Article scraped with readability/BeautifulSoup")
94
  except Exception:
95
  logger.exception("Article scraping failed")
96
  return article_text
@@ -100,14 +78,12 @@ def chunk_text(text: str, max_words: int = 800) -> List[str]:
100
  words = text.split()
101
  if not words:
102
  return []
103
- return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
104
 
105
 
106
  def summarize_text(text: str, model_pipeline) -> str:
107
  chunks = chunk_text(text)
108
- if not chunks:
109
- return ""
110
- partials: List[str] = []
111
  for chunk in chunks:
112
  try:
113
  summary = model_pipeline(
@@ -122,13 +98,13 @@ def summarize_text(text: str, model_pipeline) -> str:
122
  partials.append(clean_text(summary))
123
  except Exception:
124
  logger.exception("Summarization failed for chunk")
 
125
  merged = clean_text(" ".join(partials))
126
- if not merged:
127
- return ""
128
- if len(partials) == 1:
129
  return merged
 
130
  try:
131
- final_summary = model_pipeline(
132
  merged,
133
  max_length=300,
134
  min_length=120,
@@ -137,77 +113,45 @@ def summarize_text(text: str, model_pipeline) -> str:
137
  do_sample=False,
138
  truncation=True,
139
  )[0]["summary_text"]
140
- return clean_text(final_summary)
141
  except Exception:
142
- logger.exception("Final summarization merge failed")
143
  return merged
144
 
145
 
146
  def get_summarizer():
147
  global summarizer
148
  if summarizer is None:
149
- logger.info("Loading summarization model: %s", MODEL_NAME)
150
  summarizer = pipeline(
151
  "summarization",
152
  model=MODEL_NAME,
153
  tokenizer=MODEL_NAME,
154
- device=-1, # CPU
155
  )
156
- logger.info("Summarization model loaded")
157
  return summarizer
158
 
159
 
160
- def extract_youtube_video_id(url: str) -> str:
161
- parsed = urlparse(url)
162
- host = (parsed.hostname or "").lower()
163
- if host in {"youtu.be"}:
164
- return parsed.path.lstrip("/").split("/")[0]
165
- if "youtube.com" in host:
166
- query_params = parse_qs(parsed.query)
167
- video_id = query_params.get("v", [""])[0]
168
- if not video_id and parsed.path.startswith("/shorts/"):
169
- video_id = parsed.path.split("/shorts/", 1)[-1].split("/")[0]
170
- return video_id
171
- return ""
172
-
173
-
174
- def extract_youtube_transcript(url: str) -> str:
175
- import socket
176
- socket.gethostbyname("www.youtube.com")
177
- video_id = extract_youtube_video_id(url)
178
- if not video_id:
179
- raise HTTPException(status_code=400, detail="Invalid YouTube URL.")
180
- try:
181
- # Support multiple youtube-transcript-api versions.
182
- if hasattr(YouTubeTranscriptApi, "get_transcript"):
183
- transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "id"])
184
- elif hasattr(YouTubeTranscriptApi, "list_transcripts"):
185
- transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
186
- transcript = transcripts.find_transcript(["en", "id"]).fetch()
187
- else:
188
- raise HTTPException(
189
- status_code=500,
190
- detail="youtube-transcript-api version is too old; please upgrade to >=0.6.0.",
191
- )
192
- text = " ".join(segment.get("text", "") for segment in transcript)
193
- cleaned = clean_text(text)
194
- if not cleaned:
195
- raise HTTPException(status_code=500, detail="Transcript was empty.")
196
- return cleaned
197
- except (NoTranscriptFound, TranscriptsDisabled) as exc:
198
- logger.exception("Transcript unavailable for video %s", video_id)
199
- raise HTTPException(status_code=404, detail=f"Transcript not available: {exc}") from exc
200
- except VideoUnavailable as exc:
201
- logger.exception("Video unavailable: %s", video_id)
202
- raise HTTPException(status_code=404, detail=f"Video unavailable: {exc}") from exc
203
- except HTTPException:
204
- raise
205
- except Exception as exc:
206
- logger.exception("Failed to fetch YouTube transcript")
207
- raise HTTPException(status_code=500, detail=f"Could not fetch transcript: {exc}")
208
-
209
-
210
- app = FastAPI(title="News and Video Summarizer", version="1.0.0")
211
  app.add_middleware(
212
  CORSMiddleware,
213
  allow_origins=["*"],
@@ -217,59 +161,59 @@ app.add_middleware(
217
  )
218
 
219
 
220
- @app.get("/")
221
- async def root() -> Dict[str, str]:
222
- return {"status": "ok", "message": "API is running"}
 
 
 
 
 
223
 
 
 
224
 
225
- @app.get("/health")
226
- async def health() -> Dict[str, str]:
227
- return {"status": "healthy"}
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  @app.post("/summarize-news")
231
  async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
232
- logger.info("Received news summarization request for %s", payload.url)
233
- if not is_valid_news_url(str(payload.url)):
 
 
 
 
234
  raise HTTPException(status_code=400, detail="Unsupported news domain.")
235
- try:
236
- model = get_summarizer()
237
- except Exception as exc:
238
- logger.exception("Failed to load summarizer")
239
- return {"error": f"Model load failed: {exc}"}
240
 
241
- article_text = extract_article_content(str(payload.url))
 
 
 
 
242
  if not article_text or len(article_text.split()) < 40:
243
  raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")
 
 
244
  summary = summarize_text(article_text, model)
245
  if not summary:
246
  raise HTTPException(status_code=500, detail="Summarization failed.")
247
- return {"summary": summary}
248
-
249
-
250
- @app.post("/summarize-video")
251
- async def summarize_video(payload: SummarizeVideoRequest) -> Dict[str, Any]:
252
- logger.info("Received video summarization request for %s", payload.url)
253
- if not any(host in str(payload.url) for host in ["youtube.com", "youtu.be"]):
254
- raise HTTPException(status_code=400, detail="Only YouTube links are supported.")
255
- try:
256
- model = get_summarizer()
257
- except Exception as exc:
258
- logger.exception("Failed to load summarizer")
259
- return {"error": f"Model load failed: {exc}"}
260
-
261
- try:
262
- transcript_text = extract_youtube_transcript(str(payload.url))
263
- summary = summarize_text(transcript_text, model)
264
- if not summary:
265
- raise HTTPException(status_code=500, detail="Summarization failed.")
266
- return {"summary": summary}
267
- except HTTPException:
268
- raise
269
- except Exception as exc:
270
- logger.exception("Unexpected error during video summarization")
271
- return {"error": f"Video summarization failed: {exc}"}
272
-
273
 
274
- if __name__ == "__main__":
275
- uvicorn.run("app:app", host="0.0.0.0", port=7860, workers=1)
 
1
  import logging
2
  import os
3
  import re
4
+ import tempfile
5
  from typing import Any, Dict, List
 
6
 
7
  import requests
8
  import uvicorn
9
  from bs4 import BeautifulSoup
10
+ from fastapi import FastAPI, HTTPException, UploadFile, File
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from pydantic import BaseModel, HttpUrl
13
  from readability import Document
14
  from transformers import pipeline
15
+ import whisper
 
 
 
 
 
16
 
 
17
  os.environ.setdefault("HF_HOME", "/data/hf_cache")
18
 
19
  logging.basicConfig(
 
22
  )
23
  logger = logging.getLogger("app")
24
 
 
25
  summarizer = None
26
+ whisper_model = None
27
 
28
  MODEL_NAME = "brotoo/BART-NewsSummarizer"
29
 
30
  ALLOWED_DOMAINS = {
31
+ "cnn.com", "www.cnn.com", "edition.cnn.com",
32
+ "nbcnews.com", "www.nbcnews.com",
33
+ "bbc.com", "www.bbc.com", "bbc.co.uk", "www.bbc.co.uk",
 
 
 
 
 
 
34
  }
35
 
 
36
  class SummarizeNewsRequest(BaseModel):
37
  url: HttpUrl
38
 
39
 
40
+ # === utility clean text ===
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def clean_text(text: str) -> str:
43
  if not text:
 
53
  return clean_text(soup.get_text(" ", strip=True))
54
 
55
 
56
+ # === NEWS HANDLER ===
57
+
58
def extract_article_content(url: str) -> str:
    """Fetch *url* and return the cleaned main-article text.

    Tries readability-lxml first to isolate the article body; if that
    yields nothing, falls back to joining the text of every <p> tag.

    Args:
        url: Fully-qualified http(s) URL of the news article.

    Returns:
        Cleaned article text, or an empty string on any failure
        (network error, HTTP error status, parse failure).
    """
    article_text = ""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, timeout=12, headers=headers)
        res.raise_for_status()
        html = res.text
        document = Document(html)
        article_text = clean_html(document.summary())
        if not article_text:
            # Fallback path. NOTE: the redundant local
            # `from bs4 import BeautifulSoup` was removed — BeautifulSoup
            # is already imported at module level.
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            article_text = clean_text(" ".join(paragraphs))
    except Exception:
        # Best-effort scraper: log the failure and return whatever was
        # extracted so far (possibly ""); the caller decides how to react.
        logger.exception("Article scraping failed")
    return article_text
 
78
  words = text.split()
79
  if not words:
80
  return []
81
+ return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
82
 
83
 
84
  def summarize_text(text: str, model_pipeline) -> str:
85
  chunks = chunk_text(text)
86
+ partials = []
 
 
87
  for chunk in chunks:
88
  try:
89
  summary = model_pipeline(
 
98
  partials.append(clean_text(summary))
99
  except Exception:
100
  logger.exception("Summarization failed for chunk")
101
+
102
  merged = clean_text(" ".join(partials))
103
+ if len(partials) <= 1:
 
 
104
  return merged
105
+
106
  try:
107
+ final = model_pipeline(
108
  merged,
109
  max_length=300,
110
  min_length=120,
 
113
  do_sample=False,
114
  truncation=True,
115
  )[0]["summary_text"]
116
+ return clean_text(final)
117
  except Exception:
 
118
  return merged
119
 
120
 
121
def get_summarizer():
    """Return the shared BART summarization pipeline, creating it lazily.

    The first call builds the transformers pipeline on CPU (device=-1)
    and stores it in the module-level ``summarizer`` global; every later
    call reuses that cached instance.
    """
    global summarizer
    if summarizer is not None:
        return summarizer
    logger.info("Loading summarization model...")
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=-1,
    )
    logger.info("Summarizer ready")
    return summarizer
133
 
134
 
135
+ # === WHISPER TRANSCRIPTION FOR DIRECT FILE UPLOAD ===
136
+
137
def transcribe_uploaded_video(file_path: str) -> str:
    """Transcribe a local audio/video file with Whisper and return clean text.

    The Whisper model is loaded lazily on first use (name taken from the
    ``WHISPER_MODEL`` env var, defaulting to "small") and cached in the
    module-level ``whisper_model`` global.

    Raises:
        HTTPException: 500 when transcription yields no text at all.
    """
    global whisper_model
    if whisper_model is None:
        logger.info("Loading Whisper model...")
        # fp16 is disabled at transcription time below, so a CPU load is fine.
        whisper_model = whisper.load_model(os.getenv("WHISPER_MODEL", "small"))

    result = whisper_model.transcribe(file_path, fp16=False)
    transcript = clean_text(result.get("text", ""))
    if transcript:
        return transcript
    raise HTTPException(status_code=500, detail="Whisper transcription failed (empty text).")
149
+
150
+
151
+ # === FASTAPI APP ===
152
+
153
+ app = FastAPI(title="News and Video Summarizer", version="2.0")
154
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  app.add_middleware(
156
  CORSMiddleware,
157
  allow_origins=["*"],
 
161
  )
162
 
163
 
164
@app.post("/summarize-upload-video")
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Upload video directly (mp4/mov/mkv/m4a/wav),
    transcribe with Whisper → summarize with BART.

    Raises 400 for a missing/unsupported filename, 500 when
    transcription or summarization produces nothing.
    """
    # UploadFile.filename is client-controlled: it may be None (the old
    # code crashed with AttributeError → 500) and may contain path
    # separators, so take only the basename before touching the filesystem.
    filename = os.path.basename(file.filename or "")
    if not filename.lower().endswith((".mp4", ".mov", ".mkv", ".m4a", ".wav")):
        raise HTTPException(status_code=400, detail="Only video/audio formats are accepted.")

    tmp_dir = tempfile.mkdtemp()
    temp_path = os.path.join(tmp_dir, filename)

    try:
        # Stream the upload to disk in 1 MiB chunks instead of buffering
        # the whole (potentially large) video in memory at once.
        with open(temp_path, "wb") as f:
            while chunk := await file.read(1024 * 1024):
                f.write(chunk)

        transcript = transcribe_uploaded_video(temp_path)
        model = get_summarizer()

        summary = summarize_text(transcript, model)
        if not summary:
            raise HTTPException(status_code=500, detail="Summarization failed.")
        return {"summary": summary}

    finally:
        # Best-effort cleanup of the scratch dir; never mask the response
        # (or the HTTPException) with a cleanup error.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
            os.rmdir(tmp_dir)
        except Exception:
            pass
195
 
196
@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    """Summarize a news article from an allow-listed domain.

    Raises:
        HTTPException: 400 when the domain is not allowed or too little
            article text could be extracted; 500 when summarization fails.
    """
    # Use the documented stdlib parser rather than requests.utils.urlparse,
    # which is just an undocumented re-export of the same function.
    from urllib.parse import urlparse

    url = str(payload.url)
    logger.info("Received news summarization request for %s", url)

    # Domain validation — lowercase the host and require http(s), matching
    # the checks the previous is_valid_news_url helper performed.
    parsed = urlparse(url)
    host = (parsed.netloc or "").lower()
    if parsed.scheme not in {"http", "https"} or host not in ALLOWED_DOMAINS:
        raise HTTPException(status_code=400, detail="Unsupported news domain.")

    # Load model (lazy; cached after the first request)
    model = get_summarizer()

    # Extract article text
    article_text = extract_article_content(url)
    if not article_text or len(article_text.split()) < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")

    # Summarize
    summary = summarize_text(article_text, model)
    if not summary:
        raise HTTPException(status_code=500, detail="Summarization failed.")

    return {"summary": summary}
 
requirements.txt CHANGED
@@ -2,11 +2,12 @@ numpy<2
2
  transformers==4.46.1
3
  torch==2.2.0+cpu
4
  --extra-index-url https://download.pytorch.org/whl/cpu
5
- fastapi>=0.115.0
6
- uvicorn[standard]>=0.30.0
7
- readability-lxml>=0.8.1
8
- beautifulsoup4>=4.12.2
9
- requests>=2.31.0
10
- pydantic>=1.10.15
11
- lxml>=4.9.3
12
- youtube-transcript-api>=0.6.2
 
 
2
  transformers==4.46.1
3
  torch==2.2.0+cpu
4
  --extra-index-url https://download.pytorch.org/whl/cpu
5
+ fastapi
6
+ uvicorn[standard]
7
+ python-multipart
8
+ readability-lxml
9
+ beautifulsoup4
10
+ requests
11
+ pydantic
12
+ lxml
13
+ openai-whisper