| | """URL 文本提取 API""" |
| | import json |
| | import re |
| | from urllib.parse import urlparse |
| | import trafilatura |
| | import requests |
| | from backend.api.utils import handle_api_error |
| |
|

# Upper bound (in characters) on the text returned to the client;
# longer extractions are truncated by fetch_url and a notice is attached.
MAX_EXTRACTED_TEXT_LENGTH = 20000
| |
|
| |
|
| | def _is_valid_url(url: str) -> bool: |
| | """验证 URL 格式""" |
| | try: |
| | result = urlparse(url) |
| | return all([result.scheme in ['http', 'https'], result.netloc]) |
| | except Exception: |
| | return False |
| |
|
| |
|
| | def _is_local_or_private(url: str) -> bool: |
| | """检查是否为本地或私有网络地址(防止 SSRF 攻击)""" |
| | try: |
| | parsed = urlparse(url) |
| | hostname = parsed.hostname |
| | |
| | if not hostname: |
| | return True |
| | |
| | |
| | if hostname in ['localhost', '127.0.0.1', '::1']: |
| | return True |
| | |
| | |
| | private_patterns = [ |
| | r'^10\.', |
| | r'^172\.(1[6-9]|2[0-9]|3[0-1])\.', |
| | r'^192\.168\.', |
| | r'^169\.254\.', |
| | ] |
| | |
| | for pattern in private_patterns: |
| | if re.match(pattern, hostname): |
| | return True |
| | |
| | return False |
| | except Exception: |
| | return True |
| |
|
| |
|
| | def _format_article_text(metadata: dict) -> str: |
| | """ |
| | 将元数据和正文格式化为类似网页显示的纯文本 |
| | |
| | Args: |
| | metadata: trafilatura 提取的 JSON 数据(已解析为字典) |
| | |
| | Returns: |
| | 格式化后的文章文本 |
| | """ |
| | lines = [] |
| | |
| | |
| | if metadata.get('title'): |
| | lines.append(metadata['title']) |
| | lines.append('') |
| | |
| | |
| | meta_parts = [] |
| | if metadata.get('author'): |
| | meta_parts.append(metadata['author']) |
| | if metadata.get('date'): |
| | meta_parts.append(metadata['date']) |
| | |
| | |
| | if metadata.get('source-hostname'): |
| | meta_parts.append(metadata['source-hostname']) |
| | |
| | |
| |
|
| | if meta_parts: |
| | lines.append(' | '.join(meta_parts)) |
| | lines.append('') |
| | |
| | |
| | if metadata.get('text'): |
| | lines.append(metadata['text']) |
| | |
| | return '\n'.join(lines) |
| |
|
| |
|
def fetch_url(fetch_request):
    """
    Extract readable text content from a user-supplied URL.

    Args:
        fetch_request: dict carrying a 'url' field.

    Returns:
        (response dict, HTTP status code) tuple.  On success the dict has
        success=True, text, url, char_count and an optional truncation
        message; on failure it has success=False and a human-readable
        message.
    """
    url = fetch_request.get('url', '').strip()

    # ---- input validation ------------------------------------------------
    if not url:
        return {
            'success': False,
            'message': '缺少 URL 参数,请提供 url 字段'
        }, 400

    if not _is_valid_url(url):
        return {
            'success': False,
            'message': f'无效的 URL 格式: {url}'
        }, 400

    # SSRF guard: refuse local / private targets before fetching.
    if _is_local_or_private(url):
        return {
            'success': False,
            'message': '不允许访问本地或私有网络地址'
        }, 400

    try:
        from backend.access_log import log_fetch_url
        log_fetch_url(url)

        # Browser-like headers: some sites block obvious bot user agents.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()

        # With allow_redirects=True a public URL could redirect into the
        # internal network, bypassing the pre-fetch check above -- so
        # re-run the SSRF guard against the final URL as well.
        if response.url and _is_local_or_private(response.url):
            return {
                'success': False,
                'message': '不允许访问本地或私有网络地址'
            }, 400

        # Only HTML/XML pages are parseable.  The Accept header above also
        # requests application/xhtml+xml, so accept that (and the generic
        # application/xml) instead of rejecting what we asked for.
        content_type = response.headers.get('Content-Type', '').lower()
        allowed = ('text/html', 'text/xml', 'application/xhtml+xml', 'application/xml')
        if not any(t in content_type for t in allowed):
            return {
                'success': False,
                'message': f'不支持的内容类型: {content_type},仅支持 HTML/XML 页面'
            }, 400

        # Extract main content + metadata as a JSON string.
        result_json = trafilatura.extract(
            response.text,
            url=url,
            with_metadata=True,
            output_format='json'
        )

        if not result_json:
            print("⚠️ 无法提取页面内容")
            return {
                'success': False,
                'message': '无法从网页中提取文本内容,可能不是文章页面或页面需要验证'
            }, 400

        metadata = json.loads(result_json)

        # Metadata without a non-blank body is useless to the caller.
        if not metadata.get('text') or not metadata['text'].strip():
            print("⚠️ 提取到元数据但无正文内容")
            print("元数据:", json.dumps(metadata, ensure_ascii=False, indent=2))
            return {
                'success': False,
                'message': '无法从网页中提取正文内容'
            }, 400

        formatted_text = _format_article_text(metadata)
        original_char_count = len(formatted_text)

        # Truncate very long articles so responses stay bounded, and tell
        # the caller that truncation happened.
        message = None
        if original_char_count > MAX_EXTRACTED_TEXT_LENGTH:
            formatted_text = formatted_text[:MAX_EXTRACTED_TEXT_LENGTH]
            message = f'内容较长,已截断为前 {MAX_EXTRACTED_TEXT_LENGTH} 字符(原始长度: {original_char_count} 字符)'

        # (Removed: a dead `metadata_less` copy was built here but never
        # used or returned.)

        return {
            'success': True,
            'text': formatted_text,
            'url': url,
            'char_count': len(formatted_text),
            'message': message
        }, 200

    except requests.exceptions.Timeout:
        return {
            'success': False,
            'message': '请求超时,请检查网络连接或稍后重试'
        }, 400
    except requests.exceptions.RequestException as e:
        return {
            'success': False,
            'message': f'无法访问 URL: {str(e)}'
        }, 400
    except Exception as e:
        error_response = handle_api_error('URL 文本提取失败', e)
        return error_response, 500
| |
|