File size: 5,881 Bytes
d7f53b3
 
 
d51a2c0
d7f53b3
 
 
 
 
d51a2c0
d7f53b3
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d51a2c0
 
d7f53b3
 
 
 
d51a2c0
 
 
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7782db
d7f53b3
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
d51a2c0
d7f53b3
d51a2c0
d7f53b3
d51a2c0
d7f53b3
d51a2c0
d7f53b3
 
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
 
 
 
d51a2c0
d7f53b3
 
 
 
d51a2c0
d7f53b3
d51a2c0
d7f53b3
 
 
d51a2c0
 
 
 
 
 
 
b7782db
d51a2c0
 
 
 
 
 
 
 
 
 
 
 
d7f53b3
 
 
 
 
 
 
 
 
d51a2c0
 
 
b7782db
d51a2c0
 
 
 
d7f53b3
d51a2c0
 
d7f53b3
d51a2c0
 
 
d7f53b3
d51a2c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7f53b3
b7782db
d7f53b3
 
d51a2c0
 
 
b7782db
d7f53b3
d51a2c0
 
 
d7f53b3
 
d51a2c0
d7f53b3
 
 
 
d51a2c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import logging
import os
import re
import tempfile
from typing import Any, Dict, List

import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl
from readability import Document
from transformers import pipeline
import whisper

# Default the Hugging Face cache directory to /data/hf_cache unless HF_HOME is
# already set in the environment.
# NOTE(review): this runs after `transformers` has been imported above; confirm
# the library still honours HF_HOME when it is set post-import, otherwise move
# this line above the third-party imports.
os.environ.setdefault("HF_HOME", "/data/hf_cache")

# Process-wide logging: INFO and above, timestamped, with the logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger("app")

# Lazily-initialised model singletons. `summarizer` is populated by
# get_summarizer() and `whisper_model` by transcribe_uploaded_video() on first
# use, so importing this module does not load any model weights.
summarizer = None
whisper_model = None

# Model identifier passed to transformers.pipeline() for both model and tokenizer.
MODEL_NAME = "brotoo/BART-NewsSummarizer"


class SummarizeNewsRequest(BaseModel):
    """Request body accepted by the POST /summarize-news endpoint."""

    # Address of the article to fetch; declared as HttpUrl so pydantic
    # validates it before the handler runs.
    url: HttpUrl


# === TEXT-CLEANING UTILITIES ===

def clean_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    collapsed = re.sub(r"\s+", " ", text) if text else ""
    return collapsed.strip()


def clean_html(raw_html: str) -> str:
    """Return the visible text of an HTML fragment as a single clean line.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is extracted; the result is whitespace-normalised by
    ``clean_text``. ``None`` is treated as an empty document.
    """
    markup = raw_html or ""
    soup = BeautifulSoup(markup, "html.parser")

    # Drop elements whose contents are never shown to a reader.
    for node in soup.find_all(["script", "style", "noscript"]):
        node.extract()

    visible = soup.get_text(" ", strip=True)
    return clean_text(visible)


# === NEWS HANDLER ===

def extract_article_content(url: str) -> str:
    """Download *url* and return the article body as plain text.

    Extraction is best-effort and never raises: every failure is logged and
    results in an empty string so the caller can answer with a client error.

    The page is first run through readability's ``Document.summary()``. If
    that produces no text, or raises, the text of all ``<p>`` elements is
    used instead. Each stage has its own error handling so that a
    readability failure no longer prevents the ``<p>`` fallback from running.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned article text, or ``""`` when nothing could be extracted.
    """
    # Stage 1: fetch the page. A browser-like User-Agent is sent with the request.
    try:
        response = requests.get(
            url,
            timeout=12,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        response.raise_for_status()
        html = response.text
    except Exception:
        logger.exception("Article scraping failed")
        return ""

    # Stage 2: readability-based main-content extraction.
    article_text = ""
    try:
        article_text = clean_html(Document(html).summary())
    except Exception:
        logger.exception("Readability extraction failed; falling back to <p> tags")

    if article_text:
        return article_text

    # Stage 3: fallback - concatenate the text of every paragraph element.
    try:
        soup = BeautifulSoup(html, "html.parser")
        paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
        return clean_text(" ".join(paragraphs))
    except Exception:
        logger.exception("Article scraping failed")
        return ""


def chunk_text(text: str, max_words: int = 800) -> List[str]:
    """Split *text* into consecutive pieces of at most *max_words* words.

    Words are whitespace-delimited and re-joined with single spaces. Text
    that contains no words yields an empty list.
    """
    tokens = text.split()
    if not tokens:
        return []

    pieces: List[str] = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces


def _summarize_once(text: str, model_pipeline) -> str:
    """Run a single summarization pass over *text* and return the cleaned summary."""
    output = model_pipeline(
        text,
        max_length=300,
        min_length=120,
        num_beams=4,
        no_repeat_ngram_size=3,
        do_sample=False,
        truncation=True,
    )
    return clean_text(output[0]["summary_text"])


def summarize_text(text: str, model_pipeline) -> str:
    """Summarize *text*, chunking it first so long inputs are fully covered.

    The text is split with ``chunk_text`` and each chunk is summarized
    independently. A chunk whose summarization raises is logged and skipped.
    When more than one partial summary was produced, the partials are joined
    and summarized once more to obtain a single summary.

    Args:
        text: The text to summarize.
        model_pipeline: Callable invoked as ``model_pipeline(text, **kwargs)``
            whose result is indexed as ``result[0]["summary_text"]``.

    Returns:
        The final summary. This is ``""`` when no chunk could be summarized,
        and the joined partial summaries when the final pass fails.
    """
    partials = []
    for chunk in chunk_text(text):
        try:
            partials.append(_summarize_once(chunk, model_pipeline))
        except Exception:
            logger.exception("Summarization failed for chunk")

    merged = clean_text(" ".join(partials))
    if len(partials) <= 1:
        # Zero or one partial: nothing to condense further.
        return merged

    try:
        return _summarize_once(merged, model_pipeline)
    except Exception:
        # Degrade gracefully, but leave a trace like the per-chunk path does.
        logger.exception(
            "Final summarization pass failed; returning merged partial summaries"
        )
        return merged


def get_summarizer():
    """Return the shared summarization pipeline, creating it on first call.

    The pipeline is stored in the module-level ``summarizer`` global so later
    calls reuse the already-loaded model.
    """
    global summarizer
    if summarizer is not None:
        return summarizer

    logger.info("Loading summarization model...")
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=-1
    )
    logger.info("Summarizer ready")
    return summarizer


# === WHISPER TRANSCRIPTION FOR DIRECT FILE UPLOAD ===

def transcribe_uploaded_video(file_path: str) -> str:
    """Transcribe the media file at *file_path* with Whisper.

    The Whisper model is loaded on first use and kept in the module-level
    ``whisper_model`` global. Its size is taken from the ``WHISPER_MODEL``
    environment variable, defaulting to ``"small"``.

    Returns:
        The whitespace-normalised transcript.

    Raises:
        HTTPException: status 500 when the transcript is empty.
    """
    global whisper_model
    if whisper_model is None:
        model_name = os.environ.get("WHISPER_MODEL", "small")
        logger.info("Loading Whisper model...")
        whisper_model = whisper.load_model(model_name)

    transcription = whisper_model.transcribe(file_path, fp16=False)
    transcript = clean_text(transcription.get("text", ""))
    if transcript:
        return transcript
    raise HTTPException(status_code=500, detail="Whisper transcription failed (empty text).")


# === FASTAPI APP ===

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(title="News and Video Summarizer", version="2.0")

# CORS is fully open: every origin, method and header is allowed.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# discouraged by the FastAPI/Starlette CORS documentation. Confirm whether
# credentialed cross-origin requests are actually needed; if not, consider
# allow_credentials=False or an explicit origin list.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/summarize-upload-video")
async def summarize_upload_video(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Transcribe an uploaded video/audio file with Whisper and summarize it.

    The upload is written to a private temporary directory, transcribed,
    and the transcript is summarized with the shared pipeline. The temporary
    files are removed afterwards whether or not processing succeeded.

    Args:
        file: Multipart upload. Its filename must end (case-insensitively)
            in ``.mp4``, ``.mov``, ``.mkv``, ``.m4a`` or ``.wav``.

    Returns:
        ``{"summary": <summary text>}``.

    Raises:
        HTTPException: 400 when the filename is missing or has an unsupported
            extension; 500 when transcription or summarization yields no text.
    """
    allowed_suffixes = (".mp4", ".mov", ".mkv", ".m4a", ".wav")

    # The filename is client-controlled and may be absent; treat it as data only.
    lowered_name = (file.filename or "").lower()
    suffix = next(
        (ext for ext in allowed_suffixes if lowered_name.endswith(ext)),
        None,
    )
    if suffix is None:
        raise HTTPException(status_code=400, detail="Only video/audio formats are accepted.")

    tmp_dir = tempfile.mkdtemp()
    # Never join the client-supplied name into the path: an absolute or
    # "../"-style filename would escape tmp_dir. Only the vetted extension
    # is reused for the temporary file name.
    temp_path = os.path.join(tmp_dir, "upload" + suffix)

    try:
        # Copy the upload in 1 MiB chunks instead of holding it all in memory.
        with open(temp_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                f.write(chunk)

        transcript = transcribe_uploaded_video(temp_path)
        model = get_summarizer()

        summary = summarize_text(transcript, model)
        if not summary:
            raise HTTPException(status_code=500, detail="Summarization failed.")
        return {"summary": summary}

    finally:
        # Best-effort cleanup: a failure here must not mask the real outcome,
        # but it is logged so leaked temp files can be noticed.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
            os.rmdir(tmp_dir)
        except OSError:
            logger.warning(
                "Could not remove temporary upload files in %s",
                tmp_dir,
                exc_info=True,
            )


@app.post("/summarize-news")
async def summarize_news(payload: SummarizeNewsRequest) -> Dict[str, Any]:
    """Fetch the article at ``payload.url`` and return its summary.

    Returns:
        ``{"summary": <summary text>}``.

    Raises:
        HTTPException: 400 when fewer than 40 words of article text could be
            extracted; 500 when summarization produces no text.
    """
    url = str(payload.url)
    logger.info("Received news summarization request for %s", url)

    # There is no domain allow-list: any URL that passes request validation
    # is fetched.
    # NOTE(review): the server will request arbitrary caller-supplied
    # addresses - confirm this is acceptable for the deployment.
    model = get_summarizer()

    article_text = extract_article_content(url)
    word_count = len(article_text.split()) if article_text else 0
    if word_count < 40:
        raise HTTPException(status_code=400, detail="Could not extract enough article text to summarize.")

    summary = summarize_text(article_text, model)
    if summary:
        return {"summary": summary}
    raise HTTPException(status_code=500, detail="Summarization failed.")