Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ==============================================
|
| 2 |
-
#
|
| 3 |
# ==============================================
|
| 4 |
|
| 5 |
import gradio as gr
|
|
@@ -12,54 +12,37 @@ from typing import Dict, Any
|
|
| 12 |
from fastapi import FastAPI, Request
|
| 13 |
import uvicorn
|
| 14 |
import traceback
|
|
|
|
| 15 |
|
| 16 |
# ==============================================
|
| 17 |
-
#
|
| 18 |
# ==============================================
|
| 19 |
|
| 20 |
-
class
|
| 21 |
-
"""Content extractor
|
| 22 |
|
| 23 |
def __init__(self):
|
| 24 |
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
| 25 |
|
| 26 |
def extract_content(self, url: str) -> Dict[str, Any]:
|
| 27 |
-
"""Extract content with
|
| 28 |
start_time = time.time()
|
| 29 |
|
| 30 |
-
print(f"
|
| 31 |
|
| 32 |
# Ensure URL has protocol
|
| 33 |
if not url.startswith(('http://', 'https://')):
|
| 34 |
url = 'https://' + url
|
| 35 |
|
| 36 |
-
# Clean URL - remove any problematic characters
|
| 37 |
-
try:
|
| 38 |
-
from urllib.parse import quote, urlparse, urlunparse
|
| 39 |
-
parsed = urlparse(url)
|
| 40 |
-
# Only encode the path and query
|
| 41 |
-
encoded_path = quote(parsed.path, safe='/')
|
| 42 |
-
encoded_query = quote(parsed.query, safe='=&')
|
| 43 |
-
url = urlunparse((
|
| 44 |
-
parsed.scheme,
|
| 45 |
-
parsed.netloc,
|
| 46 |
-
encoded_path,
|
| 47 |
-
parsed.params,
|
| 48 |
-
encoded_query,
|
| 49 |
-
parsed.fragment
|
| 50 |
-
))
|
| 51 |
-
except:
|
| 52 |
-
pass # Keep original if encoding fails
|
| 53 |
-
|
| 54 |
# Try multiple strategies
|
| 55 |
strategies = [
|
| 56 |
-
self.
|
| 57 |
-
self.
|
| 58 |
-
self.
|
| 59 |
-
self._try_fallback_request, # Fallback with different settings
|
| 60 |
]
|
| 61 |
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
for i, strategy in enumerate(strategies):
|
| 65 |
try:
|
|
@@ -67,334 +50,383 @@ class RobustContentExtractor:
|
|
| 67 |
result = strategy(url)
|
| 68 |
|
| 69 |
if result.get("success"):
|
| 70 |
-
result
|
| 71 |
-
result
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
except Exception as e:
|
| 76 |
-
last_error = str(e)
|
| 77 |
print(f" Strategy {i+1} failed: {e}")
|
| 78 |
-
time.sleep(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
# All failed
|
| 81 |
return {
|
| 82 |
"success": False,
|
| 83 |
"url": url,
|
| 84 |
-
"error":
|
| 85 |
"execution_time": round(time.time() - start_time, 2),
|
| 86 |
-
"suggestion": "Website
|
| 87 |
}
|
| 88 |
|
| 89 |
-
def
|
| 90 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
try:
|
| 92 |
-
# Use encoded URL for Jina
|
| 93 |
jina_url = f"https://r.jina.ai/{url}"
|
| 94 |
|
| 95 |
-
# Try with very short timeout first
|
| 96 |
response = requests.get(
|
| 97 |
jina_url,
|
| 98 |
-
headers={
|
| 99 |
-
|
| 100 |
-
"User-Agent": self.user_agent
|
| 101 |
-
},
|
| 102 |
-
timeout=12 # Reduced from 15s
|
| 103 |
)
|
| 104 |
|
| 105 |
if response.status_code == 200:
|
| 106 |
content = response.text
|
| 107 |
|
| 108 |
-
#
|
| 109 |
-
|
| 110 |
-
data = json.loads(content)
|
| 111 |
-
if isinstance(data, dict):
|
| 112 |
-
if "content" in data:
|
| 113 |
-
content = data["content"]
|
| 114 |
-
elif "data" in data:
|
| 115 |
-
content = str(data["data"])
|
| 116 |
-
except:
|
| 117 |
-
pass # Keep as plain text
|
| 118 |
-
|
| 119 |
-
# Extract title
|
| 120 |
-
title = self._extract_title_from_text(content)
|
| 121 |
|
| 122 |
-
#
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
return {
|
| 126 |
"success": True,
|
| 127 |
"url": url,
|
| 128 |
-
"title": title[:
|
| 129 |
-
"main_content": cleaned
|
| 130 |
"content_length": len(cleaned),
|
| 131 |
-
"content_preview": cleaned[:
|
| 132 |
"source": "jina_reader",
|
| 133 |
"status": response.status_code
|
| 134 |
}
|
| 135 |
|
| 136 |
return {"success": False, "error": f"Jina status: {response.status_code}"}
|
| 137 |
|
| 138 |
-
except requests.exceptions.Timeout:
|
| 139 |
-
print(f" Jina timeout after 12s, trying next strategy...")
|
| 140 |
-
return {"success": False, "error": "Jina Reader timed out"}
|
| 141 |
except Exception as e:
|
| 142 |
-
print(f" Jina error: {e}")
|
| 143 |
return {"success": False, "error": f"Jina error: {str(e)}"}
|
| 144 |
|
| 145 |
-
def
|
| 146 |
-
"""
|
| 147 |
-
headers_list = [
|
| 148 |
-
{
|
| 149 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
| 150 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
| 151 |
-
"Accept-Language": "en-US,en;q=0.9",
|
| 152 |
-
"Accept-Encoding": "gzip, deflate, br",
|
| 153 |
-
"DNT": "1",
|
| 154 |
-
"Connection": "keep-alive",
|
| 155 |
-
"Upgrade-Insecure-Requests": "1",
|
| 156 |
-
"Sec-Fetch-Dest": "document",
|
| 157 |
-
"Sec-Fetch-Mode": "navigate",
|
| 158 |
-
"Sec-Fetch-Site": "none",
|
| 159 |
-
"Sec-Fetch-User": "?1",
|
| 160 |
-
"Cache-Control": "max-age=0",
|
| 161 |
-
},
|
| 162 |
-
{
|
| 163 |
-
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
|
| 164 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
|
| 168 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
| 169 |
-
},
|
| 170 |
-
]
|
| 171 |
-
|
| 172 |
-
for i, headers in enumerate(headers_list):
|
| 173 |
-
try:
|
| 174 |
-
print(f" Direct attempt {i+1}...")
|
| 175 |
-
response = requests.get(
|
| 176 |
-
url,
|
| 177 |
-
headers=headers,
|
| 178 |
-
timeout=10,
|
| 179 |
-
allow_redirects=True,
|
| 180 |
-
verify=False # Try without SSL verification
|
| 181 |
-
)
|
| 182 |
-
|
| 183 |
-
print(f" Status: {response.status_code}")
|
| 184 |
-
|
| 185 |
-
if response.status_code == 200:
|
| 186 |
-
html_content = response.text
|
| 187 |
-
|
| 188 |
-
# Extract content
|
| 189 |
-
text_content = self._extract_from_html(html_content)
|
| 190 |
-
cleaned = self._clean_content(text_content)
|
| 191 |
-
|
| 192 |
-
# Extract title
|
| 193 |
-
title = self._extract_title_from_html(html_content)
|
| 194 |
-
|
| 195 |
-
if len(cleaned) > 100:
|
| 196 |
-
return {
|
| 197 |
-
"success": True,
|
| 198 |
-
"url": url,
|
| 199 |
-
"title": title[:300] if title else "Direct extraction",
|
| 200 |
-
"main_content": cleaned[:20000],
|
| 201 |
-
"content_length": len(cleaned),
|
| 202 |
-
"source": f"direct_request_{i+1}",
|
| 203 |
-
"status": response.status_code
|
| 204 |
-
}
|
| 205 |
-
|
| 206 |
-
except requests.exceptions.Timeout:
|
| 207 |
-
print(f" Direct request {i+1} timed out")
|
| 208 |
-
continue
|
| 209 |
-
except Exception as e:
|
| 210 |
-
print(f" Direct request {i+1} error: {e}")
|
| 211 |
-
continue
|
| 212 |
-
|
| 213 |
-
return {"success": False, "error": "All direct attempts failed"}
|
| 214 |
-
|
| 215 |
-
def _try_simple_request(self, url: str) -> Dict[str, Any]:
|
| 216 |
-
"""Simple request with minimal headers"""
|
| 217 |
try:
|
| 218 |
-
|
| 219 |
-
response = requests.get(
|
| 220 |
-
url,
|
| 221 |
-
headers={"User-Agent": "Mozilla/5.0"},
|
| 222 |
-
timeout=8,
|
| 223 |
-
allow_redirects=True,
|
| 224 |
-
verify=False
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
print(f" Simple status: {response.status_code}")
|
| 228 |
|
| 229 |
if response.status_code == 200:
|
| 230 |
-
|
| 231 |
-
text_content = self._extract_from_html(html_content)
|
| 232 |
-
cleaned = self._clean_content(text_content)
|
| 233 |
-
title = self._extract_title_from_html(html_content)
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
"content_length": len(cleaned),
|
| 242 |
-
"source": "simple_request"
|
| 243 |
-
}
|
| 244 |
-
|
| 245 |
-
return {"success": False, "error": f"Status: {response.status_code}"}
|
| 246 |
-
|
| 247 |
-
except Exception as e:
|
| 248 |
-
return {"success": False, "error": str(e)}
|
| 249 |
-
|
| 250 |
-
def _try_fallback_request(self, url: str) -> Dict[str, Any]:
|
| 251 |
-
"""Fallback using alternative methods"""
|
| 252 |
-
try:
|
| 253 |
-
print(" Fallback attempt...")
|
| 254 |
-
|
| 255 |
-
# Try with requests session
|
| 256 |
-
session = requests.Session()
|
| 257 |
-
session.headers.update({
|
| 258 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
| 259 |
-
"Accept": "text/html",
|
| 260 |
-
})
|
| 261 |
-
|
| 262 |
-
response = session.get(url, timeout=15, allow_redirects=True, verify=False)
|
| 263 |
-
|
| 264 |
-
if response.status_code == 200:
|
| 265 |
-
html_content = response.text
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
return {
|
| 272 |
"success": True,
|
| 273 |
"url": url,
|
| 274 |
-
"title":
|
| 275 |
-
"main_content":
|
| 276 |
-
"content_length": len(
|
| 277 |
-
"source": "
|
| 278 |
-
"status": response.status_code
|
| 279 |
}
|
| 280 |
|
| 281 |
-
return {"success": False, "error":
|
| 282 |
|
| 283 |
except Exception as e:
|
| 284 |
-
return {"success": False, "error":
|
| 285 |
-
|
| 286 |
-
def _simple_text_extraction(self, html_content: str) -> str:
|
| 287 |
-
"""Very simple text extraction"""
|
| 288 |
-
# Remove scripts and styles
|
| 289 |
-
html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 290 |
-
html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 291 |
-
|
| 292 |
-
# Extract text between tags
|
| 293 |
-
text = re.sub(r'<[^>]+>', ' ', html_content)
|
| 294 |
-
text = html.unescape(text)
|
| 295 |
-
text = re.sub(r'\s+', ' ', text)
|
| 296 |
-
|
| 297 |
-
return text.strip()
|
| 298 |
|
| 299 |
-
def
|
| 300 |
-
"""Extract
|
| 301 |
-
#
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 310 |
-
|
| 311 |
-
# Extract text
|
| 312 |
-
text = re.sub(r'<[^>]+>', ' ', html_content)
|
| 313 |
-
text = html.unescape(text)
|
| 314 |
-
|
| 315 |
-
# Clean up
|
| 316 |
-
text = re.sub(r'\s+', ' ', text)
|
| 317 |
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
if title_match:
|
| 325 |
-
title = title_match.group(1)
|
| 326 |
-
title = re.sub(r'\s+', ' ', title).strip()
|
| 327 |
-
title = html.unescape(title)
|
| 328 |
-
if title:
|
| 329 |
-
return title[:200]
|
| 330 |
-
|
| 331 |
-
# Try meta title
|
| 332 |
-
meta_match = re.search(r'<meta[^>]*property=["\']og:title["\'][^>]*content=["\'](.*?)["\']', html_content, re.IGNORECASE)
|
| 333 |
-
if meta_match:
|
| 334 |
-
title = meta_match.group(1)
|
| 335 |
-
title = html.unescape(title).strip()
|
| 336 |
-
if title:
|
| 337 |
-
return title[:200]
|
| 338 |
-
|
| 339 |
-
# Try h1
|
| 340 |
-
h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL)
|
| 341 |
-
if h1_match:
|
| 342 |
-
title = h1_match.group(1)
|
| 343 |
-
title = re.sub(r'<[^>]+>', '', title)
|
| 344 |
-
title = html.unescape(title).strip()
|
| 345 |
-
if title:
|
| 346 |
-
return title[:200]
|
| 347 |
|
| 348 |
return ""
|
| 349 |
|
| 350 |
-
def
|
| 351 |
-
"""
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
r'
|
| 355 |
-
r'
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
]
|
| 358 |
|
| 359 |
-
for
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
return ""
|
| 367 |
|
| 368 |
-
def
|
| 369 |
-
"""Clean and
|
| 370 |
-
if not
|
| 371 |
return ""
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
# Remove control characters
|
| 377 |
-
content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
|
| 378 |
|
| 379 |
-
# Remove
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
# ==============================================
|
| 385 |
# INITIALIZE
|
| 386 |
# ==============================================
|
| 387 |
|
| 388 |
-
extractor =
|
| 389 |
|
| 390 |
# ==============================================
|
| 391 |
# FASTAPI APP
|
| 392 |
# ==============================================
|
| 393 |
|
| 394 |
fastapi_app = FastAPI(
|
| 395 |
-
title="
|
| 396 |
-
description="Extracts content with
|
| 397 |
-
version="
|
| 398 |
)
|
| 399 |
|
| 400 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -411,23 +443,22 @@ fastapi_app.add_middleware(
|
|
| 411 |
@fastapi_app.get("/")
|
| 412 |
async def root():
|
| 413 |
return {
|
| 414 |
-
"service": "
|
| 415 |
-
"version": "
|
| 416 |
-
"description": "Extracts
|
| 417 |
"endpoints": {
|
| 418 |
"GET /": "This info",
|
| 419 |
-
"GET /health": "Health check
|
| 420 |
-
"POST /extract": "Extract content"
|
| 421 |
}
|
| 422 |
}
|
| 423 |
|
| 424 |
@fastapi_app.get("/health")
|
| 425 |
async def health():
|
| 426 |
-
"""Fast health check endpoint for wake-up calls"""
|
| 427 |
return {
|
| 428 |
"status": "healthy",
|
| 429 |
"timestamp": time.time(),
|
| 430 |
-
"service": "
|
| 431 |
}
|
| 432 |
|
| 433 |
@fastapi_app.post("/extract")
|
|
@@ -443,8 +474,8 @@ async def api_extract(request: Request):
|
|
| 443 |
content={"success": False, "error": "URL is required"}
|
| 444 |
)
|
| 445 |
|
| 446 |
-
print(f"
|
| 447 |
-
print(f" Starting
|
| 448 |
|
| 449 |
start_time = time.time()
|
| 450 |
result = extractor.extract_content(url)
|
|
@@ -452,6 +483,7 @@ async def api_extract(request: Request):
|
|
| 452 |
|
| 453 |
print(f" Extraction completed in {elapsed:.2f}s")
|
| 454 |
print(f" Success: {result.get('success')}")
|
|
|
|
| 455 |
|
| 456 |
return result
|
| 457 |
|
|
@@ -466,8 +498,7 @@ async def api_extract(request: Request):
|
|
| 466 |
status_code=500,
|
| 467 |
content={
|
| 468 |
"success": False,
|
| 469 |
-
"error": str(e)
|
| 470 |
-
"traceback": traceback.format_exc()[:500]
|
| 471 |
}
|
| 472 |
)
|
| 473 |
|
|
@@ -478,53 +509,50 @@ async def api_extract(request: Request):
|
|
| 478 |
def gradio_extract(url: str):
|
| 479 |
"""Gradio interface"""
|
| 480 |
if not url:
|
| 481 |
-
return "❌
|
| 482 |
|
| 483 |
result = extractor.extract_content(url)
|
| 484 |
|
| 485 |
if result["success"]:
|
| 486 |
content = result["main_content"]
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
preview = content[:500]
|
| 490 |
-
if len(content) > 500:
|
| 491 |
-
preview += "..."
|
| 492 |
|
| 493 |
output = f"""
|
| 494 |
-
## ✅
|
| 495 |
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
|
| 502 |
-
###
|
| 503 |
-
{
|
| 504 |
"""
|
| 505 |
return output, result
|
| 506 |
else:
|
| 507 |
-
error = result.get("error", "
|
| 508 |
-
return f"## ❌
|
| 509 |
|
| 510 |
# Create Gradio interface
|
| 511 |
gradio_interface = gr.Interface(
|
| 512 |
fn=gradio_extract,
|
| 513 |
inputs=gr.Textbox(
|
| 514 |
-
label="
|
| 515 |
-
placeholder="https://example.com",
|
| 516 |
-
value="https://
|
| 517 |
),
|
| 518 |
outputs=[
|
| 519 |
-
gr.Markdown(label="
|
| 520 |
-
gr.JSON(label="API
|
| 521 |
],
|
| 522 |
-
title="
|
| 523 |
-
description="
|
| 524 |
examples=[
|
| 525 |
-
["https://
|
| 526 |
-
["https://
|
| 527 |
-
["https://
|
| 528 |
]
|
| 529 |
)
|
| 530 |
|
|
@@ -540,16 +568,16 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
|
|
| 540 |
|
| 541 |
if __name__ == "__main__":
|
| 542 |
print("\n" + "="*60)
|
| 543 |
-
print("
|
| 544 |
print("="*60)
|
| 545 |
-
print("
|
| 546 |
-
print("•
|
| 547 |
-
print("•
|
| 548 |
-
print("•
|
| 549 |
print("="*60)
|
| 550 |
-
print("API
|
| 551 |
-
print("• GET /health -
|
| 552 |
-
print("• POST /extract -
|
| 553 |
print("="*60 + "\n")
|
| 554 |
|
| 555 |
uvicorn.run(
|
|
|
|
| 1 |
# ==============================================
|
| 2 |
+
# IMPROVED CONTENT EXTRACTOR FOR NEWS SITES
|
| 3 |
# ==============================================
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 12 |
from fastapi import FastAPI, Request
|
| 13 |
import uvicorn
|
| 14 |
import traceback
|
| 15 |
+
from bs4 import BeautifulSoup
|
| 16 |
|
| 17 |
# ==============================================
|
| 18 |
+
# NEWS-SPECIFIC CONTENT EXTRACTOR
|
| 19 |
# ==============================================
|
| 20 |
|
| 21 |
+
class NewsContentExtractor:
|
| 22 |
+
"""Content extractor specifically optimized for news websites"""
|
| 23 |
|
| 24 |
def __init__(self):
|
| 25 |
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
| 26 |
|
| 27 |
def extract_content(self, url: str) -> Dict[str, Any]:
|
| 28 |
+
"""Extract news content with article-focused extraction"""
|
| 29 |
start_time = time.time()
|
| 30 |
|
| 31 |
+
print(f"📰 Extracting news from: {url}")
|
| 32 |
|
| 33 |
# Ensure URL has protocol
|
| 34 |
if not url.startswith(('http://', 'https://')):
|
| 35 |
url = 'https://' + url
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# Try multiple strategies
|
| 38 |
strategies = [
|
| 39 |
+
self._try_direct_extract, # Direct extraction with BeautifulSoup
|
| 40 |
+
self._try_jina_reader, # Jina Reader
|
| 41 |
+
self._try_simple_extract, # Simple fallback
|
|
|
|
| 42 |
]
|
| 43 |
|
| 44 |
+
best_result = None
|
| 45 |
+
best_score = 0
|
| 46 |
|
| 47 |
for i, strategy in enumerate(strategies):
|
| 48 |
try:
|
|
|
|
| 50 |
result = strategy(url)
|
| 51 |
|
| 52 |
if result.get("success"):
|
| 53 |
+
# Score the result based on content quality
|
| 54 |
+
score = self._score_content(result.get("main_content", ""))
|
| 55 |
+
result["score"] = score
|
| 56 |
+
|
| 57 |
+
if score > best_score:
|
| 58 |
+
best_score = score
|
| 59 |
+
best_result = result
|
| 60 |
+
print(f" ✓ Strategy {i+1} score: {score}")
|
| 61 |
|
| 62 |
except Exception as e:
|
|
|
|
| 63 |
print(f" Strategy {i+1} failed: {e}")
|
| 64 |
+
time.sleep(0.5)
|
| 65 |
+
|
| 66 |
+
if best_result and best_score > 10: # Minimum score threshold
|
| 67 |
+
best_result["execution_time"] = round(time.time() - start_time, 2)
|
| 68 |
+
best_result["method"] = "best_extraction"
|
| 69 |
+
return best_result
|
| 70 |
|
| 71 |
+
# All failed or low quality
|
| 72 |
return {
|
| 73 |
"success": False,
|
| 74 |
"url": url,
|
| 75 |
+
"error": "Could not extract quality news content",
|
| 76 |
"execution_time": round(time.time() - start_time, 2),
|
| 77 |
+
"suggestion": "Website might have anti-scraping protection"
|
| 78 |
}
|
| 79 |
|
| 80 |
+
def _try_direct_extract(self, url: str) -> Dict[str, Any]:
|
| 81 |
+
"""Direct extraction with BeautifulSoup for better HTML parsing"""
|
| 82 |
+
try:
|
| 83 |
+
headers = {
|
| 84 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
| 85 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
| 86 |
+
"Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
|
| 87 |
+
"Accept-Encoding": "gzip, deflate",
|
| 88 |
+
"DNT": "1",
|
| 89 |
+
"Connection": "keep-alive",
|
| 90 |
+
"Upgrade-Insecure-Requests": "1",
|
| 91 |
+
"Sec-Fetch-Dest": "document",
|
| 92 |
+
"Sec-Fetch-Mode": "navigate",
|
| 93 |
+
"Sec-Fetch-Site": "none",
|
| 94 |
+
"Sec-Fetch-User": "?1",
|
| 95 |
+
"Cache-Control": "max-age=0",
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
response = requests.get(url, headers=headers, timeout=15, verify=False)
|
| 99 |
+
|
| 100 |
+
if response.status_code == 200:
|
| 101 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 102 |
+
|
| 103 |
+
# Remove unwanted elements
|
| 104 |
+
for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer',
|
| 105 |
+
'aside', 'form', 'iframe', 'button', 'svg',
|
| 106 |
+
'link', 'meta', 'noscript']):
|
| 107 |
+
unwanted.decompose()
|
| 108 |
+
|
| 109 |
+
# Try to find article content using multiple strategies
|
| 110 |
+
article_text = ""
|
| 111 |
+
|
| 112 |
+
# Strategy 1: Look for article-specific containers
|
| 113 |
+
article_selectors = [
|
| 114 |
+
'article', '.article-content', '.post-content', '.entry-content',
|
| 115 |
+
'.news-content', '.content-area', '.main-content',
|
| 116 |
+
'div[class*="article"]', 'div[class*="content"]',
|
| 117 |
+
'div[class*="post"]', 'div[class*="entry"]',
|
| 118 |
+
'div[itemprop="articleBody"]', 'div[class*="story"]'
|
| 119 |
+
]
|
| 120 |
+
|
| 121 |
+
for selector in article_selectors:
|
| 122 |
+
article = soup.select_one(selector)
|
| 123 |
+
if article:
|
| 124 |
+
article_text = article.get_text(separator='\n', strip=True)
|
| 125 |
+
if len(article_text) > 300: # Minimum content length
|
| 126 |
+
print(f" Found content with selector: {selector}")
|
| 127 |
+
break
|
| 128 |
+
|
| 129 |
+
# Strategy 2: Look for main content by paragraph density
|
| 130 |
+
if len(article_text) < 300:
|
| 131 |
+
all_paragraphs = soup.find_all('p')
|
| 132 |
+
if len(all_paragraphs) > 3:
|
| 133 |
+
article_text = '\n'.join([p.get_text(strip=True) for p in all_paragraphs])
|
| 134 |
+
|
| 135 |
+
# Strategy 3: Extract text from main divs
|
| 136 |
+
if len(article_text) < 300:
|
| 137 |
+
main_divs = soup.find_all(['div', 'section'])
|
| 138 |
+
for div in main_divs:
|
| 139 |
+
text = div.get_text(separator='\n', strip=True)
|
| 140 |
+
# Check if this looks like article content
|
| 141 |
+
if (len(text) > 500 and
|
| 142 |
+
text.count('\n') > 5 and
|
| 143 |
+
not any(word in text.lower() for word in ['cookie', 'privacy', 'copyright', 'advertisement'])):
|
| 144 |
+
article_text = text
|
| 145 |
+
break
|
| 146 |
+
|
| 147 |
+
# Clean and format the text
|
| 148 |
+
if article_text:
|
| 149 |
+
cleaned_text = self._clean_news_content(article_text)
|
| 150 |
+
|
| 151 |
+
# Extract title
|
| 152 |
+
title = self._extract_title(soup)
|
| 153 |
+
if not title:
|
| 154 |
+
title_match = soup.find('title')
|
| 155 |
+
title = title_match.get_text(strip=True) if title_match else "新闻标题"
|
| 156 |
+
|
| 157 |
+
# Extract date if available
|
| 158 |
+
date = self._extract_date(soup)
|
| 159 |
+
|
| 160 |
+
return {
|
| 161 |
+
"success": True,
|
| 162 |
+
"url": url,
|
| 163 |
+
"title": title[:200],
|
| 164 |
+
"date": date,
|
| 165 |
+
"main_content": cleaned_text,
|
| 166 |
+
"content_length": len(cleaned_text),
|
| 167 |
+
"content_preview": cleaned_text[:500] + ("..." if len(cleaned_text) > 500 else ""),
|
| 168 |
+
"source": "direct_extraction",
|
| 169 |
+
"status": response.status_code
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
return {"success": False, "error": f"Status: {response.status_code}"}
|
| 173 |
+
|
| 174 |
+
except Exception as e:
|
| 175 |
+
return {"success": False, "error": f"Direct extract error: {str(e)}"}
|
| 176 |
+
|
| 177 |
+
def _try_jina_reader(self, url: str) -> Dict[str, Any]:
|
| 178 |
+
"""Try Jina Reader"""
|
| 179 |
try:
|
|
|
|
| 180 |
jina_url = f"https://r.jina.ai/{url}"
|
| 181 |
|
|
|
|
| 182 |
response = requests.get(
|
| 183 |
jina_url,
|
| 184 |
+
headers={"Accept": "text/plain"},
|
| 185 |
+
timeout=20
|
|
|
|
|
|
|
|
|
|
| 186 |
)
|
| 187 |
|
| 188 |
if response.status_code == 200:
|
| 189 |
content = response.text
|
| 190 |
|
| 191 |
+
# Clean the content
|
| 192 |
+
cleaned = self._clean_news_content(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
# Extract title from Jina response
|
| 195 |
+
title = "Jina提取内容"
|
| 196 |
+
lines = content.split('\n')
|
| 197 |
+
for line in lines[:10]:
|
| 198 |
+
if line.startswith('Title:') or line.startswith('# '):
|
| 199 |
+
title = line.replace('Title:', '').replace('# ', '').strip()
|
| 200 |
+
break
|
| 201 |
|
| 202 |
return {
|
| 203 |
"success": True,
|
| 204 |
"url": url,
|
| 205 |
+
"title": title[:200],
|
| 206 |
+
"main_content": cleaned,
|
| 207 |
"content_length": len(cleaned),
|
| 208 |
+
"content_preview": cleaned[:500] + ("..." if len(cleaned) > 500 else ""),
|
| 209 |
"source": "jina_reader",
|
| 210 |
"status": response.status_code
|
| 211 |
}
|
| 212 |
|
| 213 |
return {"success": False, "error": f"Jina status: {response.status_code}"}
|
| 214 |
|
|
|
|
|
|
|
|
|
|
| 215 |
except Exception as e:
|
|
|
|
| 216 |
return {"success": False, "error": f"Jina error: {str(e)}"}
|
| 217 |
|
| 218 |
+
def _try_simple_extract(self, url: str) -> Dict[str, Any]:
|
| 219 |
+
"""Simple fallback extraction"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
try:
|
| 221 |
+
response = requests.get(url, timeout=10, verify=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
if response.status_code == 200:
|
| 224 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
+
# Get all text
|
| 227 |
+
all_text = soup.get_text(separator='\n', strip=True)
|
| 228 |
+
|
| 229 |
+
# Clean and extract meaningful parts
|
| 230 |
+
lines = all_text.split('\n')
|
| 231 |
+
meaningful_lines = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
+
for line in lines:
|
| 234 |
+
line = line.strip()
|
| 235 |
+
if (len(line) > 20 and
|
| 236 |
+
not any(word in line.lower() for word in ['cookie', 'privacy', 'copyright',
|
| 237 |
+
'advertisement', 'newsletter', 'subscribe',
|
| 238 |
+
'follow us', 'share this']) and
|
| 239 |
+
not re.match(r'^[0-9\.\-\s]+$', line)): # Skip number-only lines
|
| 240 |
+
meaningful_lines.append(line)
|
| 241 |
|
| 242 |
+
cleaned_text = '\n'.join(meaningful_lines[:100]) # Take top 100 lines
|
| 243 |
+
|
| 244 |
+
if len(cleaned_text) > 200:
|
| 245 |
+
title = soup.find('title')
|
| 246 |
+
title_text = title.get_text(strip=True) if title else "新闻内容"
|
| 247 |
+
|
| 248 |
return {
|
| 249 |
"success": True,
|
| 250 |
"url": url,
|
| 251 |
+
"title": title_text[:150],
|
| 252 |
+
"main_content": cleaned_text,
|
| 253 |
+
"content_length": len(cleaned_text),
|
| 254 |
+
"source": "simple_extract"
|
|
|
|
| 255 |
}
|
| 256 |
|
| 257 |
+
return {"success": False, "error": "Simple extraction failed"}
|
| 258 |
|
| 259 |
except Exception as e:
|
| 260 |
+
return {"success": False, "error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
+
def _extract_title(self, soup) -> str:
|
| 263 |
+
"""Extract title from BeautifulSoup object"""
|
| 264 |
+
# Try multiple title sources
|
| 265 |
+
title_sources = [
|
| 266 |
+
soup.find('title'),
|
| 267 |
+
soup.find('h1'),
|
| 268 |
+
soup.find('meta', property='og:title'),
|
| 269 |
+
soup.find('meta', attrs={'name': 'title'}),
|
| 270 |
+
soup.find('h2', class_=re.compile(r'title|heading')),
|
| 271 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
for source in title_sources:
|
| 274 |
+
if source:
|
| 275 |
+
if hasattr(source, 'get'):
|
| 276 |
+
content = source.get('content', '') if source.name == 'meta' else source.get_text(strip=True)
|
| 277 |
+
if content and len(content) > 5 and len(content) < 200:
|
| 278 |
+
return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
return ""
|
| 281 |
|
| 282 |
+
def _extract_date(self, soup) -> str:
|
| 283 |
+
"""Extract date from BeautifulSoup object"""
|
| 284 |
+
date_patterns = [
|
| 285 |
+
r'\d{4}[-/]\d{2}[-/]\d{2}',
|
| 286 |
+
r'\d{2}[-/]\d{2}[-/]\d{4}',
|
| 287 |
+
r'\d{1,2}\s+\w+\s+\d{4}',
|
| 288 |
+
]
|
| 289 |
+
|
| 290 |
+
# Look in common date locations
|
| 291 |
+
date_selectors = [
|
| 292 |
+
'time',
|
| 293 |
+
'.date',
|
| 294 |
+
'.published',
|
| 295 |
+
'.post-date',
|
| 296 |
+
'.article-date',
|
| 297 |
+
'meta[property="article:published_time"]',
|
| 298 |
+
'meta[name="pubdate"]',
|
| 299 |
+
'meta[name="date"]',
|
| 300 |
]
|
| 301 |
|
| 302 |
+
for selector in date_selectors:
|
| 303 |
+
elements = soup.select(selector)
|
| 304 |
+
for element in elements:
|
| 305 |
+
if element.name == 'meta':
|
| 306 |
+
date_str = element.get('content', '')
|
| 307 |
+
else:
|
| 308 |
+
date_str = element.get_text(strip=True) or element.get('datetime', '')
|
| 309 |
+
|
| 310 |
+
for pattern in date_patterns:
|
| 311 |
+
match = re.search(pattern, date_str)
|
| 312 |
+
if match:
|
| 313 |
+
return match.group()
|
| 314 |
|
| 315 |
return ""
|
| 316 |
|
| 317 |
+
def _clean_news_content(self, text: str) -> str:
|
| 318 |
+
"""Clean and format news content"""
|
| 319 |
+
if not text:
|
| 320 |
return ""
|
| 321 |
|
| 322 |
+
# Remove excessive whitespace
|
| 323 |
+
text = re.sub(r'\s+', ' ', text)
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
+
# Remove common unwanted patterns
|
| 326 |
+
unwanted_patterns = [
|
| 327 |
+
r'adsbygoogle.*?\[\]\]',
|
| 328 |
+
r'ADVERTISEMENT',
|
| 329 |
+
r'Sponsored Content',
|
| 330 |
+
r'Sign up for.*?newsletter',
|
| 331 |
+
r'Subscribe to.*?channel',
|
| 332 |
+
r'Follow us on.*',
|
| 333 |
+
r'Share this.*',
|
| 334 |
+
r'Like us on.*',
|
| 335 |
+
r'Read more.*',
|
| 336 |
+
r'Continue reading.*',
|
| 337 |
+
r'点击这里.*',
|
| 338 |
+
r'更多新闻.*',
|
| 339 |
+
r'相关新闻.*',
|
| 340 |
+
r'热门搜索.*',
|
| 341 |
+
r'大事件.*',
|
| 342 |
+
r'Copyright.*All rights reserved',
|
| 343 |
+
r'本网站.*Cookies',
|
| 344 |
+
r'了解更多.*',
|
| 345 |
+
r'接受.*',
|
| 346 |
+
r'简\s*繁',
|
| 347 |
+
r'登入.*',
|
| 348 |
+
r'下载APP.*',
|
| 349 |
+
r'首页.*最新.*头条.*',
|
| 350 |
+
r'[\*\-\=]{5,}', # Multiple special characters
|
| 351 |
+
]
|
| 352 |
|
| 353 |
+
for pattern in unwanted_patterns:
|
| 354 |
+
text = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
| 355 |
+
|
| 356 |
+
# Remove very short lines (likely navigation)
|
| 357 |
+
lines = text.split('\n')
|
| 358 |
+
cleaned_lines = []
|
| 359 |
+
for line in lines:
|
| 360 |
+
line = line.strip()
|
| 361 |
+
if (len(line) > 15 and
|
| 362 |
+
not line.startswith(('http://', 'https://', 'www.')) and
|
| 363 |
+
not re.match(r'^[\d\s\.\-]+$', line)):
|
| 364 |
+
cleaned_lines.append(line)
|
| 365 |
+
|
| 366 |
+
text = '\n'.join(cleaned_lines)
|
| 367 |
+
|
| 368 |
+
# Remove duplicate consecutive lines
|
| 369 |
+
lines = text.split('\n')
|
| 370 |
+
unique_lines = []
|
| 371 |
+
for i, line in enumerate(lines):
|
| 372 |
+
if i == 0 or line != lines[i-1]:
|
| 373 |
+
unique_lines.append(line)
|
| 374 |
+
|
| 375 |
+
return '\n'.join(unique_lines).strip()
|
| 376 |
+
|
| 377 |
+
def _score_content(self, text: str) -> int:
|
| 378 |
+
"""Score content quality based on various factors"""
|
| 379 |
+
if not text:
|
| 380 |
+
return 0
|
| 381 |
+
|
| 382 |
+
score = 0
|
| 383 |
+
|
| 384 |
+
# Length-based scoring
|
| 385 |
+
length = len(text)
|
| 386 |
+
if length > 1000:
|
| 387 |
+
score += 30
|
| 388 |
+
elif length > 500:
|
| 389 |
+
score += 20
|
| 390 |
+
elif length > 200:
|
| 391 |
+
score += 10
|
| 392 |
+
|
| 393 |
+
# Paragraph count (rough estimate)
|
| 394 |
+
paragraphs = text.count('\n\n') + 1
|
| 395 |
+
if paragraphs > 5:
|
| 396 |
+
score += 20
|
| 397 |
+
elif paragraphs > 3:
|
| 398 |
+
score += 10
|
| 399 |
+
|
| 400 |
+
# News indicators
|
| 401 |
+
news_keywords = ['报道', '新闻', '记者', '警方', '调查', '发生', '表示', '指出',
|
| 402 |
+
'据知', '据了解', '据悉', '事件', '事故', '案件']
|
| 403 |
+
|
| 404 |
+
for keyword in news_keywords:
|
| 405 |
+
if keyword in text:
|
| 406 |
+
score += 2
|
| 407 |
+
|
| 408 |
+
# Penalize for unwanted content
|
| 409 |
+
unwanted_terms = ['cookie', 'privacy', 'copyright', 'advertisement', 'newsletter']
|
| 410 |
+
for term in unwanted_terms:
|
| 411 |
+
if term.lower() in text.lower():
|
| 412 |
+
score -= 5
|
| 413 |
+
|
| 414 |
+
return max(0, score)
|
| 415 |
|
| 416 |
# ==============================================
# INITIALIZE
# ==============================================

# Module-level singleton: one extractor instance shared by the FastAPI
# endpoints and the Gradio UI defined below.
extractor = NewsContentExtractor()
|
| 421 |
|
| 422 |
# ==============================================
# FASTAPI APP
# ==============================================

fastapi_app = FastAPI(
    title="News Content Extractor",
    description="Extracts news article content with BeautifulSoup",
    version="3.0"
)

# CORS support for cross-origin API callers (middleware is added to
# fastapi_app below this import).
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 443 |
@fastapi_app.get("/")
|
| 444 |
async def root():
|
| 445 |
return {
|
| 446 |
+
"service": "News Content Extractor",
|
| 447 |
+
"version": "3.0",
|
| 448 |
+
"description": "Extracts news article content using BeautifulSoup",
|
| 449 |
"endpoints": {
|
| 450 |
"GET /": "This info",
|
| 451 |
+
"GET /health": "Health check",
|
| 452 |
+
"POST /extract": "Extract news content"
|
| 453 |
}
|
| 454 |
}
|
| 455 |
|
| 456 |
@fastapi_app.get("/health")
|
| 457 |
async def health():
|
|
|
|
| 458 |
return {
|
| 459 |
"status": "healthy",
|
| 460 |
"timestamp": time.time(),
|
| 461 |
+
"service": "news_extractor"
|
| 462 |
}
|
| 463 |
|
| 464 |
@fastapi_app.post("/extract")
|
|
|
|
| 474 |
content={"success": False, "error": "URL is required"}
|
| 475 |
)
|
| 476 |
|
| 477 |
+
print(f"📰 API Request for news: {url}")
|
| 478 |
+
print(f" Starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
| 479 |
|
| 480 |
start_time = time.time()
|
| 481 |
result = extractor.extract_content(url)
|
|
|
|
| 483 |
|
| 484 |
print(f" Extraction completed in {elapsed:.2f}s")
|
| 485 |
print(f" Success: {result.get('success')}")
|
| 486 |
+
print(f" Content length: {result.get('content_length', 0)}")
|
| 487 |
|
| 488 |
return result
|
| 489 |
|
|
|
|
| 498 |
status_code=500,
|
| 499 |
content={
|
| 500 |
"success": False,
|
| 501 |
+
"error": str(e)
|
|
|
|
| 502 |
}
|
| 503 |
)
|
| 504 |
|
|
|
|
| 509 |
def gradio_extract(url: str):
    """Drive the extractor from the Gradio UI.

    Returns a (markdown summary, raw result dict) pair, one value for
    each of the two Gradio output components.
    """
    if not url:
        return "❌ 请输入URL", {}

    result = extractor.extract_content(url)

    # Failure path first: surface the error as markdown.
    if not result["success"]:
        return "## ❌ 错误\n\n" + result.get("error", "未知错误"), result

    body = result["main_content"]
    preview = body[:800] + ("..." if len(body) > 800 else "")
    summary = f"""
## ✅ 提取成功!

**标题:** {result.get("title", "无标题")}
**日期:** {result.get("date", "") or "未提取到日期"}
**方法:** {result.get('method', '提取')}
**时间:** {result['execution_time']}s
**字符数:** {result['content_length']:,}

### 内容预览:
{preview}
"""
    return summary, result
|
| 537 |
|
| 538 |
# Create Gradio interface
# Single-field UI: a URL textbox in; a markdown summary and the raw
# JSON API response out. Backed by gradio_extract defined above.
gradio_interface = gr.Interface(
    fn=gradio_extract,
    inputs=gr.Textbox(
        label="新闻URL",
        placeholder="https://example.com/news",
        value="https://northern.sinchew.com.my/?p=7217886"
    ),
    outputs=[
        gr.Markdown(label="结果"),
        gr.JSON(label="API响应")
    ],
    title="📰 新闻内容提取器",
    description="使用BeautifulSoup提取新闻文章内容",
    examples=[
        ["https://northern.sinchew.com.my/?p=7217886"],
        ["https://www.sinchew.com.my/?p=7234965"],
        ["https://example.com"]
    ]
)
|
| 558 |
|
|
|
|
| 568 |
|
| 569 |
if __name__ == "__main__":
|
| 570 |
print("\n" + "="*60)
|
| 571 |
+
print("📰 新闻内容提取器 v3.0 启动")
|
| 572 |
print("="*60)
|
| 573 |
+
print("特性:")
|
| 574 |
+
print("• 使用BeautifulSoup进行HTML解析")
|
| 575 |
+
print("• 专门针对新闻网站优化")
|
| 576 |
+
print("• 智能内容评分系统")
|
| 577 |
print("="*60)
|
| 578 |
+
print("API端点:")
|
| 579 |
+
print("• GET /health - 健康检查")
|
| 580 |
+
print("• POST /extract - 提取新闻内容")
|
| 581 |
print("="*60 + "\n")
|
| 582 |
|
| 583 |
uvicorn.run(
|