Spaces:

brotoo
/

SUMA

Sleeping

File size: 5,881 Bytes

d7f53b3
 
 
d51a2c0
d7f53b3
 
 
 
 
d51a2c0
d7f53b3
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d51a2c0
 
d7f53b3
 
 
 
d51a2c0
 
 
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7782db
d7f53b3
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
d51a2c0
d7f53b3
d51a2c0
d7f53b3
d51a2c0
d7f53b3
d51a2c0
d7f53b3
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
d51a2c0
d7f53b3
d51a2c0
d7f53b3
 
 
d51a2c0
 
 
 
 
 
 
b7782db
d51a2c0
 
 
 
 
 
 
 
 
 
 
 
d7f53b3
 
 
 
 
 
 
 
 
d51a2c0
 
 
b7782db
d51a2c0
 
 
 
d7f53b3
d51a2c0
 
d7f53b3
d51a2c0
 
 
d7f53b3
d51a2c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7f53b3
b7782db
d7f53b3
 
d51a2c0
 
 
b7782db
d7f53b3
d51a2c0
 
 
d7f53b3
 
d51a2c0
d7f53b3
 
 
 
d51a2c0

import logging
import os
import re
import tempfile
from typing import Any, Dict, List

import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl
from readability import Document
from transformers import pipeline
import whisper

# Default the Hugging Face cache directory to /data/hf_cache unless HF_HOME is
# already present in the environment.
# NOTE(review): this runs after `transformers` has been imported above. The
# Hugging Face libraries appear to read HF_HOME at import time, so this default
# may come too late to take effect -- confirm, and consider moving it above the
# imports.
os.environ.setdefault("HF_HOME", "/data/hf_cache")

# Process-wide logging: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger("app")

# Lazily initialised model singletons. They stay None until first use and are
# populated by get_summarizer() and transcribe_uploaded_video() respectively.
summarizer = None
whisper_model = None

# Model identifier used for both the model and the tokenizer of the
# summarization pipeline.
MODEL_NAME = "brotoo/BART-NewsSummarizer"


class SummarizeNewsRequest(BaseModel):
    """Request body for the /summarize-news endpoint."""

    # URL of the article to fetch and summarize; validated by pydantic's HttpUrl.
    url: HttpUrl


# === TEXT CLEANING UTILITIES ===

def clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space and trim.

    Falsy input (``None`` or the empty string) yields the empty string.
    """
    collapsed = re.sub(r"\s+", " ", text) if text else ""
    return collapsed.strip()


def clean_html(raw_html: str) -> str:
    """Return the visible text of an HTML fragment as one normalized string.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is extracted; whitespace is then normalized with ``clean_text``.
    Falsy input is treated as an empty document.
    """
    markup = raw_html if raw_html else ""
    parsed = BeautifulSoup(markup, "html.parser")
    # Drop elements whose content is never rendered as readable text.
    for node in parsed.find_all(["script", "style", "noscript"]):
        node.extract()
    visible = parsed.get_text(" ", strip=True)
    return clean_text(visible)


# === NEWS HANDLER ===

def extract_article_content(url: str) -> str:
    """Download *url* and return the article's readable text.

    The page is fetched with a 12-second timeout and a browser-like
    User-Agent. The main content is extracted with readability; if that
    raises or yields no text, the text of all ``<p>`` elements is used
    instead.

    This function is best-effort: it never raises. Any failure is logged and
    results in an empty string (or whatever text was extracted so far).

    Args:
        url: Address of the page to fetch.

    Returns:
        Whitespace-normalized article text, or ``""`` when nothing could be
        fetched or extracted.
    """
    # NOTE(review): `url` originates from the request payload and is fetched
    # without any host restriction, so this endpoint can be pointed at
    # internal addresses (SSRF). Consider an allow-list or blocking private
    # IP ranges.
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, timeout=12, headers=headers)
        res.raise_for_status()
        html = res.text
    except Exception:
        logger.exception("Article scraping failed")
        return ""

    article_text = ""
    # Keep readability in its own try block: if it raises, the HTML we already
    # downloaded can still be mined via the <p>-tag fallback below.
    try:
        article_text = clean_html(Document(html).summary())
    except Exception:
        logger.exception("Readability extraction failed; falling back to <p> tags")

    if not article_text:
        try:
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            article_text = clean_text(" ".join(paragraphs))
        except Exception:
            logger.exception("Article scraping failed")
    return article_text


def chunk_text(text: str, max_words: int = 800) -> List[str]:
    """Split *text* into consecutive pieces of at most *max_words* words.

    Words are whitespace-delimited; each piece is re-joined with single
    spaces. Text containing no words produces an empty list.
    """
    tokens = text.split()
    if not tokens:
        return []

    pieces: List[str] = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces


def _run_summarizer(model_pipeline, text: str) -> str:
    """Run one summarization pass over *text* and return the cleaned summary."""
    output = model_pipeline(
        text,
        max_length=300,
        min_length=120,
        num_beams=4,
        no_repeat_ngram_size=3,
        do_sample=False,
        truncation=True,
    )
    return clean_text(output[0]["summary_text"])


def summarize_text(text: str, model_pipeline) -> str:
    """Summarize *text*, working chunk by chunk for long inputs.

    The text is split with ``chunk_text`` and every chunk is summarized
    independently. A chunk whose summarization raises is logged and skipped.
    When more than one partial summary was produced, the partials are joined
    and summarized once more to obtain a single final summary; if that last
    pass fails, the joined partials are returned instead.

    Args:
        text: The text to summarize.
        model_pipeline: Callable invoked as ``model_pipeline(text, **kwargs)``
            that returns a sequence whose first item has a ``"summary_text"``
            key.

    Returns:
        The summary, or ``""`` when no chunk could be summarized.
    """
    partials = []
    for chunk in chunk_text(text):
        try:
            partials.append(_run_summarizer(model_pipeline, chunk))
        except Exception:
            logger.exception("Summarization failed for chunk")

    merged = clean_text(" ".join(partials))
    if len(partials) <= 1:
        # Zero or one partial: nothing to condense further.
        return merged

    try:
        return _run_summarizer(model_pipeline, merged)
    except Exception:
        # Log instead of failing silently, then fall back to the joined
        # partial summaries so the caller still gets a usable result.
        logger.exception("Final summarization pass failed; returning merged partial summaries")
        return merged


def get_summarizer():
    """Return the shared summarization pipeline, creating it on first call.

    The pipeline is built from ``MODEL_NAME`` (used for both model and
    tokenizer) with ``device=-1`` and cached in the module-level
    ``summarizer`` global, so later calls return the same object.
    """
    global summarizer
    if summarizer is not None:
        return summarizer

    logger.info("Loading summarization model...")
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=-1,
    )
    logger.info("Summarizer ready")
    return summarizer


# === WHISPER TRANSCRIPTION FOR DIRECT FILE UPLOAD ===

def transcribe_uploaded_video(file_path: str) -> str:
    """Transcribe the media file at *file_path* with Whisper.

    The Whisper model is loaded on first use and cached in the module-level
    ``whisper_model`` global. Its size comes from the ``WHISPER_MODEL``
    environment variable and defaults to ``"small"``.

    Args:
        file_path: Path of the audio/video file to transcribe.

    Returns:
        The whitespace-normalized transcript.

    Raises:
        HTTPException: status 500 when the transcript is empty.
    """
    global whisper_model
    if whisper_model is None:
        logger.info("Loading Whisper model...")
        whisper_model = whisper.load_model(os.getenv("WHISPER_MODEL", "small"))

    transcription = whisper_model.transcribe(file_path, fp16=False)
    transcript = clean_text(transcription.get("text", ""))
    if transcript:
        return transcript
    raise HTTPException(status_code=500, detail="Whisper transcription failed (empty text).")


# === FASTAPI APP ===

app = FastAPI(title="News and Video Summarizer", version="2.0")

# CORS: the API is open to every origin, method and header.
# Credentials stay disabled: a wildcard origin must not be combined with
# allow_credentials=True (the CORS middleware would otherwise reflect any
# requesting origin for credentialed requests), and no endpoint in this app
# relies on cookies or authentication headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/summarize-upload-video")
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Transcribe an uploaded video/audio file with Whisper and summarize it.

    Only files whose name ends in .mp4, .mov, .mkv, .m4a or .wav
    (case-insensitive) are accepted. The upload is copied to a private
    temporary directory that is removed once transcription has finished.

    Args:
        file: The uploaded media file.

    Returns:
        ``{"summary": <text>}``.

    Raises:
        HTTPException: 400 for a missing or unsupported file name; 500 when
            transcription or summarization yields no text.
    """
    allowed_suffixes = (".mp4", ".mov", ".mkv", ".m4a", ".wav")

    # The file name is client-supplied and may be absent.
    lowered_name = (file.filename or "").lower()
    suffix = next((ext for ext in allowed_suffixes if lowered_name.endswith(ext)), None)
    if suffix is None:
        raise HTTPException(status_code=400, detail="Only video/audio formats are accepted.")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Never build the path from the client's file name: a name such as
        # "../../x.mp4" or "/abs/x.mp4" would escape the temporary directory.
        # A fixed name carrying only the validated extension is sufficient.
        temp_path = os.path.join(tmp_dir, "upload" + suffix)

        # Copy in 1 MiB chunks so large uploads are not held in memory at once.
        with open(temp_path, "wb") as out:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                out.write(chunk)

        transcript = transcribe_uploaded_video(temp_path)

    model = get_summarizer()
    summary = summarize_text(transcript, model)
    if not summary:
        raise HTTPException(status_code=500, detail="Summarization failed.")
    return {"summary": summary}


@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    """Fetch the article at ``payload.url`` and return its summary.

    Returns ``{"summary": <text>}``. Responds with 400 when fewer than 40
    words of article text could be extracted, and with 500 when the
    summarizer produces no text.
    """
    url = str(payload.url)
    logger.info("Received news summarization request for %s", url)

    # No domain allow-list is applied: URLs from any domain are accepted.

    model = get_summarizer()

    article_text = extract_article_content(url)
    word_count = len(article_text.split()) if article_text else 0
    if word_count < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")

    summary = summarize_text(article_text, model)
    if summary:
        return {"summary": summary}
    raise HTTPException(status_code=500, detail="Summarization failed.")