File size: 3,964 Bytes
d7f53b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import logging
import os
import re
import tempfile
from typing import List
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from readability import Document
from newspaper import Article


# Whitelist of news-site hosts accepted by is_valid_news_url(); URLs on any
# other host are rejected. Each outlet is listed with its common host
# variants (bare domain, www., and regional/edition subdomains).
ALLOWED_DOMAINS = {
    "cnn.com",
    "www.cnn.com",
    "edition.cnn.com",
    "nbcnews.com",
    "www.nbcnews.com",
    "bbc.com",
    "www.bbc.com",
    "bbc.co.uk",
    "www.bbc.co.uk",
}


def is_valid_news_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL on a whitelisted news host.

    Uses ``urlparse(...).hostname`` rather than ``netloc``: ``netloc`` keeps
    the port and userinfo, so valid URLs such as ``https://cnn.com:443/x``
    would be rejected. ``hostname`` is already lowercased and stripped of
    port/userinfo by ``urlparse``.

    Returns False (and logs) for malformed input that ``urlparse`` rejects.
    """
    try:
        parsed = urlparse(url)
    except Exception:
        # Best-effort validation: any parse failure means the URL is unusable.
        logging.exception("URL validation failed for %s", url)
        return False
    host = parsed.hostname or ""
    return parsed.scheme in {"http", "https"} and host in ALLOWED_DOMAINS


def clean_html(raw_html: str) -> str:
    """Render *raw_html* to plain text, dropping script/style/noscript tags,
    then normalize whitespace via clean_text()."""
    soup = BeautifulSoup(raw_html or "", "html.parser")
    # Remove non-content elements before extracting text.
    for removable in soup.find_all(["script", "style", "noscript"]):
        removable.extract()
    return clean_text(soup.get_text(" ", strip=True))


def clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space and trim
    the ends; falsy input yields an empty string."""
    if not text:
        return ""
    # split() with no argument splits on arbitrary whitespace runs and
    # discards leading/trailing whitespace, so join gives the normalized form.
    return " ".join(text.split())


def extract_article_content(url: str) -> str:
    """Fetch *url* and extract the readable article body as plain text.

    Strategy, best first:
      1. newspaper3k's Article pipeline.
      2. readability-lxml's Document summary over the raw HTML.
      3. Joining the text of every <p> tag as a last resort.

    Returns "" when every strategy fails; failures are logged, never raised.
    """
    text = ""
    try:
        scraped = Article(url)
        scraped.download()
        scraped.parse()
        text = clean_text(scraped.text)
        logging.info("Article scraped via newspaper3k")
    except Exception:
        logging.exception("Primary article scrape failed, falling back to readability/BeautifulSoup")

    if not text:
        try:
            response = requests.get(url, timeout=12, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
            page_html = response.text
            text = clean_html(Document(page_html).summary())
            if not text:
                # Readability produced nothing useful: join all paragraphs.
                fallback_soup = BeautifulSoup(page_html, "html.parser")
                paragraph_texts = [p.get_text(" ", strip=True) for p in fallback_soup.find_all("p")]
                text = clean_text(" ".join(paragraph_texts))
        except Exception:
            logging.exception("Fallback scraping failed")

    return text


def chunk_text(text: str, max_words: int = 800) -> List[str]:
    """Split *text* into chunks of at most *max_words* whitespace-separated
    words; empty or whitespace-only input yields an empty list."""
    words = text.split()
    # Step through the word list in max_words-sized windows.
    return [
        " ".join(words[start : start + max_words])
        for start in range(0, len(words), max_words)
    ]


def summarize_text(text: str, summarizer) -> str:
    """Summarize *text* with the callable *summarizer* (a transformers-style
    pipeline returning ``[{"summary_text": ...}]``).

    Long input is chunked; each chunk is summarized independently, and when
    more than one chunk exists the concatenated partial summaries are passed
    through the summarizer once more to produce a single coherent summary.
    Per-chunk failures are logged and skipped. Returns "" when nothing could
    be summarized.
    """
    pieces = chunk_text(text)
    if not pieces:
        return ""

    partials: List[str] = []
    for piece in pieces:
        try:
            result = summarizer(
                piece,
                max_length=300,
                min_length=120,
                do_sample=False,
                truncation=True,
            )
            partials.append(clean_text(result[0]["summary_text"]))
        except Exception:
            logging.exception("Summarization failed for chunk")

    combined = clean_text(" ".join(partials))
    if not combined:
        return ""

    # A single chunk needs no second summarization pass.
    if len(partials) == 1:
        return combined

    try:
        merged_result = summarizer(
            combined,
            max_length=300,
            min_length=120,
            do_sample=False,
            truncation=True,
        )
        return clean_text(merged_result[0]["summary_text"])
    except Exception:
        # Best effort: fall back to the raw concatenation of partials.
        logging.exception("Final summarization merge failed")
        return combined


def find_first_wav(path: str) -> str:
    """Return the path of a .wav file at or under *path*.

    If *path* is itself a .wav file, return it unchanged. If *path* is a
    directory, return the alphabetically first .wav file directly inside it
    (entries are sorted because ``os.listdir`` order is filesystem-dependent,
    which previously made "first" nondeterministic). Subdirectories are not
    searched. Returns "" when nothing matches.
    """
    if os.path.isfile(path) and path.lower().endswith(".wav"):
        return path
    if os.path.isdir(path):
        for entry in sorted(os.listdir(path)):
            candidate = os.path.join(path, entry)
            if os.path.isfile(candidate) and candidate.lower().endswith(".wav"):
                return candidate
    return ""


def temp_audio_path() -> str:
    """Create a fresh temporary directory and return a yt-dlp style output
    template path inside it ("%(ext)s" is substituted by the downloader)."""
    staging_dir = tempfile.mkdtemp(prefix="yt_audio_")
    template = os.path.join(staging_dir, "audio.%(ext)s")
    return template