yukee1992 commited on
Commit
ba2f5fc
·
verified ·
1 Parent(s): 92bcfa2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -467
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # SMART CONTENT EXTRACTOR FOR CHINESE WEBSITES
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -8,481 +8,363 @@ import json
8
  import time
9
  import re
10
  import html
11
- import chardet
12
- from typing import Dict, Any, Optional
 
 
 
13
  from fastapi import FastAPI, Request
14
  import uvicorn
15
 
16
  # ==============================================
17
- # ENHANCED CONTENT EXTRACTOR FOR CHINESE
18
  # ==============================================
19
 
20
- class ChineseContentExtractor:
21
- """Enhanced content extractor optimized for Chinese websites"""
22
 
23
  def __init__(self):
24
- # Chinese browser user agents
25
- self.user_agents = [
26
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
27
- "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
28
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/120.0",
29
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 
 
 
 
 
 
 
 
30
  ]
31
 
32
- # Common Chinese website patterns
33
- self.chinese_site_patterns = [
34
- r'\.cn$',
35
- r'\.com\.cn$',
36
- r'baidu\.com',
37
- r'qq\.com',
38
- r'sina\.com\.cn',
39
- r'sohu\.com',
40
- r'163\.com',
41
- r'jd\.com',
42
- r'taobao\.com',
43
- r'alibaba\.com',
44
- r'zhihu\.com',
45
- r'bilibili\.com',
46
- r'weibo\.com',
47
- r'douyin\.com',
48
- r'douban\.com',
49
- r'ximalaya\.com',
50
  ]
51
 
52
- def is_chinese_website(self, url: str) -> bool:
53
- """Check if URL is a Chinese website"""
54
- for pattern in self.chinese_site_patterns:
55
- if re.search(pattern, url, re.IGNORECASE):
56
- return True
57
- return False
58
-
59
  def extract_content(self, url: str) -> Dict[str, Any]:
60
- """Extract content with Chinese website support"""
61
  start_time = time.time()
62
 
63
- print(f"🌐 Extracting content from: {url}")
64
 
65
  # Ensure URL has protocol
66
  if not url.startswith(('http://', 'https://')):
67
  url = 'https://' + url
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  try:
70
- # Determine if Chinese website
71
- is_chinese = self.is_chinese_website(url)
 
 
 
72
 
73
- # Fetch the page with appropriate settings
74
- response = self._fetch_with_encoding(url, is_chinese)
75
- response.raise_for_status()
76
 
77
- # Get correct encoding
78
- content, encoding = self._decode_content(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Extract main content
81
- main_content = self._extract_main_content(content, is_chinese)
82
 
83
- # Clean content (preserve Chinese characters)
84
- cleaned_content = self._clean_content(main_content, is_chinese)
 
 
 
 
 
 
85
 
86
- # Extract metadata
87
- title = self._extract_title(content, encoding)
88
- metadata = self._extract_metadata(content)
89
 
90
- # Calculate content stats
91
- chinese_char_count = self._count_chinese_characters(cleaned_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- return {
94
- "success": True,
95
- "url": url,
96
- "is_chinese_website": is_chinese,
97
- "title": title,
98
- "main_content": cleaned_content[:25000], # Increased limit for Chinese
99
- "content_length": len(cleaned_content),
100
- "chinese_char_count": chinese_char_count,
101
- "encoding_used": encoding,
102
- "content_preview": cleaned_content[:800] + ("..." if len(cleaned_content) > 800 else ""),
103
- "metadata": metadata,
104
- "status_code": response.status_code,
105
- "execution_time": round(time.time() - start_time, 2)
106
- }
107
 
108
- except Exception as e:
109
- return {
110
- "success": False,
111
- "url": url,
112
- "error": str(e),
113
- "execution_time": round(time.time() - start_time, 2)
114
- }
115
 
116
- def _fetch_with_encoding(self, url: str, is_chinese: bool) -> requests.Response:
117
- """Fetch webpage with proper encoding handling"""
118
- headers = {
119
- 'User-Agent': self.user_agents[0],
120
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
121
- 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8' if is_chinese else 'en-US,en;q=0.9',
122
- 'Accept-Encoding': 'gzip, deflate',
123
- }
124
 
125
- # Add Chinese-specific headers
126
- if is_chinese:
127
- headers.update({
128
- 'Accept-Language': 'zh-CN,zh;q=0.9',
129
- 'Cache-Control': 'no-cache',
130
- })
 
131
 
132
- response = requests.get(url, headers=headers, timeout=15)
133
- return response
134
 
135
- def _decode_content(self, response: requests.Response) -> tuple[str, str]:
136
- """Decode content with proper encoding detection"""
137
- # Try to detect encoding
138
- if response.encoding:
139
- encoding = response.encoding.lower()
140
- else:
141
- # Use chardet to detect encoding
142
- detected = chardet.detect(response.content)
143
- encoding = detected.get('encoding', 'utf-8').lower()
144
-
145
- # Handle common Chinese encodings
146
- if encoding in ['gb2312', 'gbk', 'gb18030']:
147
- encoding = 'gb18030' # Most comprehensive Chinese encoding
148
- elif encoding == 'big5':
149
- encoding = 'big5' # Traditional Chinese
150
- else:
151
- encoding = 'utf-8' # Default to UTF-8
152
 
153
- try:
154
- content = response.content.decode(encoding, errors='replace')
155
- except:
156
- # Fallback to UTF-8 with error replacement
157
- content = response.content.decode('utf-8', errors='replace')
158
- encoding = 'utf-8'
159
 
160
- return content, encoding
 
 
 
 
 
 
 
 
 
161
 
162
- def _extract_main_content(self, html_content: str, is_chinese: bool) -> str:
163
- """Extract main content with Chinese website optimizations"""
 
 
 
 
164
 
165
- # Remove unwanted sections
166
- html_content = self._remove_unwanted_sections(html_content, is_chinese)
167
 
168
- # Chinese websites often have specific content patterns
169
- content_patterns = [
170
- # Common Chinese content containers
171
- r'<div[^>]*class="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
172
- r'<div[^>]*id="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
173
 
174
- # Article tags
175
- r'<article[^>]*>(.*?)</article>',
176
-
177
- # Main content area
178
- r'<main[^>]*>(.*?)</main>',
179
-
180
- # Chinese specific patterns
181
- r'<div[^>]*class="[^"]*(detail|content-main|article-content)[^"]*"[^>]*>(.*?)</div>',
182
-
183
- # For news websites
184
- r'<div[^>]*class="[^"]*(news-content|news-body|news-article)[^"]*"[^>]*>(.*?)</div>',
185
- ]
186
 
187
- all_content = []
188
-
189
- for pattern in content_patterns:
190
- matches = re.findall(pattern, html_content, re.DOTALL | re.IGNORECASE)
191
- for match in matches:
192
- if isinstance(match, tuple):
193
- for group in match:
194
- if group and len(group.strip()) > 50:
195
- all_content.append(group)
196
- elif match and len(match.strip()) > 50:
197
- all_content.append(match)
198
-
199
- if all_content:
200
- # Combine all found content
201
- combined = ' '.join(all_content)
202
-
203
- # Remove any remaining HTML tags
204
- combined = re.sub(r'<[^>]+>', ' ', combined)
205
-
206
- # Decode HTML entities
207
- combined = html.unescape(combined)
208
 
209
- return combined
 
 
 
210
 
211
- # Fallback: extract all text and clean
212
- return self._extract_all_text(html_content, is_chinese)
213
 
214
- def _remove_unwanted_sections(self, html_content: str, is_chinese: bool) -> str:
215
- """Remove unwanted sections with Chinese-specific patterns"""
216
-
217
- # Base patterns
218
- unwanted_patterns = [
219
- # Navigation
220
- r'<nav[^>]*>.*?</nav>',
221
- r'<header[^>]*>.*?</header>',
222
 
223
- # Footers
224
- r'<footer[^>]*>.*?</footer>',
225
 
226
- # Sidebars
227
- r'<aside[^>]*>.*?</aside>',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
- # Ads
230
- r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
231
- r'<ins[^>]*>.*?</ins>',
232
 
233
- # Scripts and styles
234
- r'<script[^>]*>.*?</script>',
235
- r'<style[^>]*>.*?</style>',
236
- r'<!--.*?-->',
237
- ]
238
-
239
- # Chinese-specific unwanted patterns
240
- if is_chinese:
241
- chinese_patterns = [
242
- # Chinese navigation/menus (导航, 菜单)
243
- r'<div[^>]*class="[^"]*(导航|菜单|nav)[^"]*"[^>]*>.*?</div>',
244
- r'<ul[^>]*class="[^"]*(导航|菜单)[^"]*"[^>]*>.*?</ul>',
245
-
246
- # Sidebars (侧边栏)
247
- r'<div[^>]*class="[^"]*(侧边栏|sidebar)[^"]*"[^>]*>.*?</div>',
248
-
249
- # Comments (评论)
250
- r'<div[^>]*class="[^"]*(评论|comment)[^"]*"[^>]*>.*?</div>',
251
-
252
- # Related articles (相关文章)
253
- r'<div[^>]*class="[^"]*(相关|related)[^"]*"[^>]*>.*?</div>',
254
-
255
- # Hot posts (热门)
256
- r'<div[^>]*class="[^"]*(热门|hot)[^"]*"[^>]*>.*?</div>',
257
-
258
- # Recommendations (推荐)
259
- r'<div[^>]*class="[^"]*(推荐|recommend)[^"]*"[^>]*>.*?</div>',
260
-
261
- # Share buttons (分享)
262
- r'<div[^>]*class="[^"]*(分享|share)[^"]*"[^>]*>.*?</div>',
263
- ]
264
- unwanted_patterns.extend(chinese_patterns)
265
-
266
- cleaned_html = html_content
267
- for pattern in unwanted_patterns:
268
- cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
269
-
270
- return cleaned_html
271
 
272
- def _extract_all_text(self, html_content: str, is_chinese: bool) -> str:
273
- """Extract all text with Chinese character preservation"""
274
- # Remove scripts, styles, comments
275
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
276
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
277
- html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
278
 
279
  # Remove unwanted tags
280
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
281
  for tag in unwanted_tags:
282
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
283
 
284
- # Remove HTML tags but preserve text
285
  text = re.sub(r'<[^>]+>', ' ', html_content)
286
-
287
- # Decode HTML entities
288
  text = html.unescape(text)
289
 
290
- # Chinese-specific cleaning
291
- if is_chinese:
292
- # Keep Chinese text blocks
293
- lines = text.split('\n')
294
- filtered_lines = []
295
- for line in lines:
296
- line = line.strip()
297
- # Keep lines with significant Chinese content
298
- chinese_chars = self._count_chinese_characters(line)
299
- if chinese_chars > 5 or len(line) > 50:
300
- filtered_lines.append(line)
301
- text = '\n\n'.join(filtered_lines)
302
-
303
  return text
304
 
305
- def _clean_content(self, content: str, is_chinese: bool) -> str:
306
- """Clean content while preserving Chinese characters"""
307
  if not content:
308
  return ""
309
 
310
- # Replace multiple whitespace with single space
311
  content = re.sub(r'\s+', ' ', content)
312
 
313
- # Remove control characters but preserve Chinese/Unicode
314
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
315
 
316
- # Remove unwanted phrases (both English and Chinese)
317
- unwanted_phrases = [
318
- # English
319
- r'sign up for our newsletter',
320
- r'subscribe to our newsletter',
321
- r'follow us on',
322
- r'share this article',
323
- r'read more',
324
- r'continue reading',
325
- r'advertisement',
326
- r'click here',
327
- r'learn more',
328
-
329
- # Chinese
330
- r'订阅我们的新闻',
331
- r'关注我们',
332
- r'分享这篇文章',
333
- r'阅读更多',
334
- r'继续阅读',
335
- r'广告',
336
- r'点击这里',
337
- r'了解更多',
338
- r'相关文章',
339
- r'热门推荐',
340
- r'猜你喜欢',
341
- ]
342
-
343
- for phrase in unwanted_phrases:
344
- content = re.sub(phrase, '', content, flags=re.IGNORECASE)
345
-
346
- # Remove email addresses and URLs
347
- content = re.sub(r'\S+@\S+\.\S+', '', content)
348
- content = re.sub(r'https?://\S+', '', content)
349
-
350
- # For Chinese content, clean differently
351
- if is_chinese:
352
- # Remove excessive punctuation but preserve Chinese punctuation
353
- content = re.sub(r'[。!?]{3,}', '。', content)
354
- content = re.sub(r'[\.,!?]{3,}', '.', content)
355
-
356
- # Normalize Chinese punctuation spacing
357
- content = re.sub(r'\s+([。,!?;:])', r'\1', content)
358
- content = re.sub(r'([。,!?;:])\s+', r'\1', content)
359
- else:
360
- # Normalize English punctuation spacing
361
- content = re.sub(r'\s+([.,!?;:])', r'\1', content)
362
- content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
363
-
364
- # Split and filter paragraphs
365
- if is_chinese:
366
- # Split by Chinese sentence endings
367
- sentences = re.split(r'[。!?]', content)
368
- else:
369
- # Split by English sentence endings
370
- sentences = re.split(r'[.!?]', content)
371
-
372
- clean_sentences = []
373
- for sentence in sentences:
374
- sentence = sentence.strip()
375
- if not sentence:
376
- continue
377
-
378
- # Keep sentences with meaningful content
379
- if is_chinese:
380
- chinese_chars = self._count_chinese_characters(sentence)
381
- if chinese_chars > 3 or len(sentence) > 20:
382
- clean_sentences.append(sentence)
383
- else:
384
- if len(sentence) > 20:
385
- clean_sentences.append(sentence)
386
-
387
- # Join back with appropriate punctuation
388
- if is_chinese:
389
- content = '。'.join(clean_sentences) + ('。' if clean_sentences else '')
390
- else:
391
- content = '. '.join(clean_sentences) + ('.' if clean_sentences else '')
392
 
393
  return content.strip()
394
-
395
- def _extract_title(self, html_content: str, encoding: str) -> str:
396
- """Extract page title with encoding support"""
397
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
398
- if title_match:
399
- title = title_match.group(1)
400
- title = re.sub(r'\s+', ' ', title).strip()
401
- # Ensure title is properly decoded
402
- try:
403
- title = html.unescape(title)
404
- except:
405
- pass
406
- return title[:300]
407
- return "未找到标题" if 'gb' in encoding or 'big5' in encoding else "No title found"
408
-
409
- def _extract_metadata(self, html_content: str) -> Dict[str, str]:
410
- """Extract metadata including Chinese meta tags"""
411
- metadata = {}
412
-
413
- # Meta description (supports both English and Chinese)
414
- desc_patterns = [
415
- r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
416
- r'<meta[^>]*property=["\']og:description["\'][^>]*content=["\'](.*?)["\']',
417
- ]
418
-
419
- for pattern in desc_patterns:
420
- match = re.search(pattern, html_content, re.IGNORECASE)
421
- if match:
422
- metadata['description'] = html.unescape(match.group(1))[:500]
423
- break
424
-
425
- # Keywords
426
- keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
427
- html_content, re.IGNORECASE)
428
- if keywords_match:
429
- metadata['keywords'] = html.unescape(keywords_match.group(1))[:500]
430
-
431
- # Author
432
- author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
433
- html_content, re.IGNORECASE)
434
- if author_match:
435
- metadata['author'] = html.unescape(author_match.group(1))[:200]
436
-
437
- # Charset
438
- charset_match = re.search(r'<meta[^>]*charset=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
439
- if charset_match:
440
- metadata['charset'] = charset_match.group(1)
441
-
442
- return metadata
443
-
444
- def _count_chinese_characters(self, text: str) -> int:
445
- """Count Chinese characters in text"""
446
- # Chinese character ranges in Unicode
447
- chinese_ranges = [
448
- (0x4E00, 0x9FFF), # CJK Unified Ideographs
449
- (0x3400, 0x4DBF), # CJK Unified Ideographs Extension A
450
- (0x20000, 0x2A6DF), # CJK Unified Ideographs Extension B
451
- (0x2A700, 0x2B73F), # CJK Unified Ideographs Extension C
452
- (0x2B740, 0x2B81F), # CJK Unified Ideographs Extension D
453
- (0x2B820, 0x2CEAF), # CJK Unified Ideographs Extension E
454
- (0xF900, 0xFAFF), # CJK Compatibility Ideographs
455
- (0x2F800, 0x2FA1F), # CJK Compatibility Ideographs Supplement
456
- ]
457
-
458
- count = 0
459
- for char in text:
460
- char_code = ord(char)
461
- for start, end in chinese_ranges:
462
- if start <= char_code <= end:
463
- count += 1
464
- break
465
-
466
- return count
467
 
468
  # ==============================================
469
  # INITIALIZE
470
  # ==============================================
471
 
472
- extractor = ChineseContentExtractor()
473
 
474
  # ==============================================
475
  # FASTAPI APP
476
  # ==============================================
477
 
478
- # Create FastAPI app
479
  fastapi_app = FastAPI(
480
- title="智能内容提取器 (中文网站优化)",
481
- description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容",
482
- version="2.0"
483
  )
484
 
485
- # Add CORS middleware
486
  from fastapi.middleware.cors import CORSMiddleware
487
  from fastapi.responses import JSONResponse
488
 
@@ -497,30 +379,29 @@ fastapi_app.add_middleware(
497
  @fastapi_app.get("/")
498
  async def root():
499
  return {
500
- "service": "智能内容提取器",
501
- "version": "2.0",
502
- "description": "专门优化中文网站的内容提取器",
503
  "endpoints": {
504
- "GET /": "API信息",
505
- "GET /health": "健康检查",
506
- "POST /extract": "提取主要内容 (n8n专用)"
507
  },
508
- "examples": {
509
- "中文网站": "https://zhihu.com",
510
- "英文网站": "https://example.com"
511
- }
 
 
512
  }
513
 
514
  @fastapi_app.get("/health")
515
  async def health():
516
- return {
517
- "status": "healthy",
518
- "timestamp": time.time()
519
- }
520
 
521
  @fastapi_app.post("/extract")
522
  async def api_extract(request: Request):
523
- """API endpoint for n8n - optimized for Chinese websites"""
524
  try:
525
  body = await request.json()
526
  url = body.get("url", "").strip()
@@ -528,23 +409,23 @@ async def api_extract(request: Request):
528
  if not url:
529
  return JSONResponse(
530
  status_code=400,
531
- content={"success": False, "error": "URL参数是必需的"}
532
  )
533
 
534
- print(f"📨 内容提取请求: {url}")
535
- result = extractor.extract_content(url)
536
 
537
  return result
538
 
539
  except json.JSONDecodeError:
540
  return JSONResponse(
541
  status_code=400,
542
- content={"success": False, "error": "无效的JSON数据"}
543
  )
544
  except Exception as e:
545
  return JSONResponse(
546
  status_code=500,
547
- content={"success": False, "error": f"内部错误: {str(e)}"}
548
  )
549
 
550
  # ==============================================
@@ -552,79 +433,63 @@ async def api_extract(request: Request):
552
  # ==============================================
553
 
554
  def gradio_extract(url: str):
555
- """Gradio interface function"""
556
  if not url:
557
- return "❌ 请输入URL", {}
558
 
559
- result = extractor.extract_content(url)
560
 
561
  if result["success"]:
562
  content = result["main_content"]
563
  content_length = result["content_length"]
564
- is_chinese = result.get("is_chinese_website", False)
565
-
566
- # Create preview
567
- if is_chinese:
568
- # For Chinese, show first 600 characters
569
- preview = content[:600]
570
- if len(content) > 600:
571
- preview += "..."
572
- else:
573
- # For English, show first 500 characters
574
- preview = content[:500]
575
- if len(content) > 500:
576
- preview += "..."
577
-
578
- if is_chinese:
579
- output = f"""
580
- ## ✅ 内容提取成功!
581
-
582
- **网址:** {result['url']}
583
- **标题:** {result.get('title', '无标题')}
584
- **时间:** {result['execution_time']}秒
585
- **内容长度:** {content_length:,} 字符
586
- **中文字符数:** {result.get('chinese_char_count', 0):,}
587
-
588
- ### 内容预览:
589
- {preview}
590
- """
591
- else:
592
- output = f"""
593
- ## ✅ Content Extracted Successfully!
594
 
595
  **URL:** {result['url']}
596
- **Title:** {result.get('title', 'No title')}
597
  **Time:** {result['execution_time']}s
598
  **Content Length:** {content_length:,} characters
599
 
600
- ### Content Preview:
601
  {preview}
 
 
602
  """
603
  return output, result
604
  else:
605
- error_msg = result.get("error", "未知错误")
606
- return f"## ❌ 错误\n\n{error_msg}", result
 
 
 
 
 
 
607
 
608
  # Create Gradio interface
609
  gradio_interface = gr.Interface(
610
  fn=gradio_extract,
611
  inputs=gr.Textbox(
612
- label="网站网址 / Website URL",
613
- placeholder="请输入网址 (如: https://zhihu.com)",
614
- value="https://zhihu.com"
615
  ),
616
  outputs=[
617
- gr.Markdown(label="结果 / Result"),
618
- gr.JSON(label="API响应 / API Response")
619
  ],
620
- title="🧠 智能内容提取器 (中文优化) / Smart Content Extractor (Chinese Optimized)",
621
- description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容 / Optimized for Chinese websites, removes navigation, ads, footers, etc.",
622
  examples=[
623
- ["https://zhihu.com"],
624
- ["https://baidu.com"],
625
- ["https://news.sina.com.cn"],
626
  ["https://example.com"],
627
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
 
628
  ]
629
  )
630
 
@@ -632,7 +497,6 @@ gradio_interface = gr.Interface(
632
  # MOUNT GRADIO TO FASTAPI
633
  # ==============================================
634
 
635
- # Mount Gradio app to FastAPI
636
  app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
637
 
638
  # ==============================================
@@ -641,11 +505,14 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
641
 
642
  if __name__ == "__main__":
643
  print("\n" + "="*60)
644
- print("🧠 智能内容提取器启动中...")
645
- print("Smart Content Extractor Starting...")
 
 
 
 
646
  print("="*60)
647
- print("API端点 / API Endpoint: POST /extract")
648
- print("网页界面 / Web Interface: GET /")
649
  print("="*60 + "\n")
650
 
651
  uvicorn.run(
 
1
  # ==============================================
2
+ # FREE SCREENSHOT SCRAPER FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
 
8
  import time
9
  import re
10
  import html
11
+ import base64
12
+ from io import BytesIO
13
+ from PIL import Image
14
+ import pytesseract # Free OCR
15
+ from typing import Dict, Any
16
  from fastapi import FastAPI, Request
17
  import uvicorn
18
 
19
  # ==============================================
20
+ # FREE SCREENSHOT SCRAPER
21
  # ==============================================
22
 
23
+ class FreeScreenshotScraper:
24
+ """Free scraper using screenshot APIs + fallback"""
25
 
26
  def __init__(self):
27
+ # Free screenshot APIs (no API key needed)
28
+ self.screenshot_apis = [
29
+ {
30
+ "url": lambda u: f"https://s0.wp.com/mshots/v1/{u}?w=1024",
31
+ "name": "wordpress_mshots"
32
+ },
33
+ {
34
+ "url": lambda u: f"https://render-tron.appspot.com/screenshot/{u}?width=1024&height=768",
35
+ "name": "render_tron"
36
+ },
37
+ {
38
+ "url": lambda u: f"https://image.thum.io/get/width/1024/crop/768/noanimate/{u}",
39
+ "name": "thumio"
40
+ },
41
  ]
42
 
43
+ # Free HTML content APIs
44
+ self.html_apis = [
45
+ {
46
+ "url": lambda u: f"https://r.jina.ai/{u}",
47
+ "name": "jina_reader",
48
+ "headers": {"Accept": "application/json"}
49
+ },
50
+ {
51
+ "url": lambda u: f"https://extractorapi.com/api/v1/extractor?apikey=demo&url={u}",
52
+ "name": "extractor_api"
53
+ },
 
 
 
 
 
 
 
54
  ]
55
 
 
 
 
 
 
 
 
56
  def extract_content(self, url: str) -> Dict[str, Any]:
57
+ """Extract content using free APIs"""
58
  start_time = time.time()
59
 
60
+ print(f"🌐 Extracting: {url}")
61
 
62
  # Ensure URL has protocol
63
  if not url.startswith(('http://', 'https://')):
64
  url = 'https://' + url
65
 
66
+ # Strategy 1: Try Jina Reader API (best for content extraction)
67
+ print(" Trying Jina Reader API...")
68
+ jina_result = self._try_jina_reader(url)
69
+ if jina_result["success"]:
70
+ jina_result["execution_time"] = round(time.time() - start_time, 2)
71
+ jina_result["method"] = "jina_reader_api"
72
+ return jina_result
73
+
74
+ # Strategy 2: Try other HTML APIs
75
+ print(" Trying other HTML APIs...")
76
+ for api in self.html_apis[1:]:
77
+ result = self._try_api(api, url)
78
+ if result["success"]:
79
+ result["execution_time"] = round(time.time() - start_time, 2)
80
+ result["method"] = api["name"]
81
+ return result
82
+
83
+ # Strategy 3: Try direct request with smart headers
84
+ print(" Trying direct request...")
85
+ direct_result = self._try_direct_request(url)
86
+ if direct_result["success"]:
87
+ direct_result["execution_time"] = round(time.time() - start_time, 2)
88
+ direct_result["method"] = "direct_with_fallback"
89
+ return direct_result
90
+
91
+ # Strategy 4: Try screenshot APIs as last resort
92
+ print(" Trying screenshot APIs...")
93
+ for api in self.screenshot_apis:
94
+ result = self._try_screenshot_api(api, url)
95
+ if result["success"]:
96
+ result["execution_time"] = round(time.time() - start_time, 2)
97
+ result["method"] = f"screenshot_{api['name']}"
98
+ return result
99
+
100
+ # All failed
101
+ return {
102
+ "success": False,
103
+ "url": url,
104
+ "error": "All free methods failed",
105
+ "execution_time": round(time.time() - start_time, 2),
106
+ "suggestions": [
107
+ "Try a different URL",
108
+ "Website may block automated access",
109
+ "Try using Jina Reader directly: https://r.jina.ai/your-url"
110
+ ]
111
+ }
112
+
113
+ def _try_jina_reader(self, url: str) -> Dict[str, Any]:
114
+ """Try Jina Reader API (free, no API key needed)"""
115
  try:
116
+ api_url = f"https://r.jina.ai/{url}"
117
+ headers = {
118
+ "User-Agent": "Mozilla/5.0",
119
+ "Accept": "application/json",
120
+ }
121
 
122
+ response = requests.get(api_url, headers=headers, timeout=30)
 
 
123
 
124
+ if response.status_code == 200:
125
+ # Jina returns clean text directly
126
+ content = response.text
127
+
128
+ # Try to parse as JSON first
129
+ try:
130
+ data = json.loads(content)
131
+ if "data" in data:
132
+ content = data["data"]["content"] if "content" in data["data"] else str(data["data"])
133
+ except:
134
+ pass # Keep as text
135
+
136
+ # Extract title if possible
137
+ title = ""
138
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE)
139
+ if title_match:
140
+ title = title_match.group(1)
141
+
142
+ # Clean content
143
+ cleaned = self._clean_content(content)
144
+
145
+ return {
146
+ "success": True,
147
+ "url": url,
148
+ "title": title[:200] if title else "Extracted via Jina Reader",
149
+ "main_content": cleaned[:30000],
150
+ "content_length": len(cleaned),
151
+ "source": "jina_reader",
152
+ "note": "Content extracted via free Jina Reader API"
153
+ }
154
 
155
+ return {"success": False, "error": f"Jina API status: {response.status_code}"}
 
156
 
157
+ except Exception as e:
158
+ return {"success": False, "error": f"Jina API error: {str(e)}"}
159
+
160
+ def _try_api(self, api: dict, url: str) -> Dict[str, Any]:
161
+ """Try other free APIs"""
162
+ try:
163
+ api_url = api["url"](url)
164
+ headers = api.get("headers", {"User-Agent": "Mozilla/5.0"})
165
 
166
+ response = requests.get(api_url, headers=headers, timeout=15)
 
 
167
 
168
+ if response.status_code == 200:
169
+ content = response.text
170
+
171
+ # Try to parse JSON
172
+ try:
173
+ data = json.loads(content)
174
+ # Extract content from common API formats
175
+ if "text" in data:
176
+ content = data["text"]
177
+ elif "content" in data:
178
+ content = data["content"]
179
+ elif "article" in data:
180
+ content = data["article"]
181
+ except:
182
+ pass
183
+
184
+ cleaned = self._clean_content(content)
185
+
186
+ return {
187
+ "success": True,
188
+ "url": url,
189
+ "main_content": cleaned[:20000],
190
+ "content_length": len(cleaned)
191
+ }
192
 
193
+ return {"success": False}
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ except:
196
+ return {"success": False}
 
 
 
 
 
197
 
198
+ def _try_direct_request(self, url: str) -> Dict[str, Any]:
199
+ """Try direct request with various strategies"""
200
+ strategies = [
201
+ self._direct_request_with_headers,
202
+ self._direct_request_as_googlebot,
203
+ self._direct_request_with_referer,
204
+ ]
 
205
 
206
+ for strategy in strategies:
207
+ try:
208
+ result = strategy(url)
209
+ if result["success"]:
210
+ return result
211
+ except:
212
+ continue
213
 
214
+ return {"success": False}
 
215
 
216
+ def _direct_request_with_headers(self, url: str) -> Dict[str, Any]:
217
+ """Direct request with browser-like headers"""
218
+ headers = {
219
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
220
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
221
+ "Accept-Language": "en-US,en;q=0.5",
222
+ "Accept-Encoding": "gzip, deflate",
223
+ "Connection": "keep-alive",
224
+ "Upgrade-Insecure-Requests": "1",
225
+ "Cache-Control": "max-age=0",
226
+ }
 
 
 
 
 
 
227
 
228
+ response = requests.get(url, headers=headers, timeout=10)
 
 
 
 
 
229
 
230
+ if response.status_code == 200:
231
+ content = self._extract_from_html(response.text)
232
+ cleaned = self._clean_content(content)
233
+
234
+ return {
235
+ "success": True,
236
+ "content": cleaned
237
+ }
238
+
239
+ return {"success": False}
240
 
241
+ def _direct_request_as_googlebot(self, url: str) -> Dict[str, Any]:
242
+ """Pretend to be Googlebot"""
243
+ headers = {
244
+ "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
245
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
246
+ }
247
 
248
+ response = requests.get(url, headers=headers, timeout=10)
 
249
 
250
+ if response.status_code == 200:
251
+ content = self._extract_from_html(response.text)
252
+ cleaned = self._clean_content(content)
 
 
253
 
254
+ return {
255
+ "success": True,
256
+ "content": cleaned
257
+ }
 
 
 
 
 
 
 
 
258
 
259
+ return {"success": False}
260
+
261
+ def _direct_request_with_referer(self, url: str) -> Dict[str, Any]:
262
+ """Request with referer"""
263
+ headers = {
264
+ "User-Agent": "Mozilla/5.0",
265
+ "Referer": "https://www.google.com/",
266
+ "Accept": "text/html",
267
+ }
268
+
269
+ response = requests.get(url, headers=headers, timeout=10)
270
+
271
+ if response.status_code == 200:
272
+ content = self._extract_from_html(response.text)
273
+ cleaned = self._clean_content(content)
 
 
 
 
 
 
274
 
275
+ return {
276
+ "success": True,
277
+ "content": cleaned
278
+ }
279
 
280
+ return {"success": False}
 
281
 
282
+ def _try_screenshot_api(self, api: dict, url: str) -> Dict[str, Any]:
283
+ """Try screenshot API"""
284
+ try:
285
+ api_url = api["url"](url)
286
+ headers = {"User-Agent": "Mozilla/5.0"}
 
 
 
287
 
288
+ response = requests.get(api_url, headers=headers, timeout=15)
 
289
 
290
+ if response.status_code == 200 and len(response.content) > 1000:
291
+ # Check if it's actually an image
292
+ try:
293
+ img = Image.open(BytesIO(response.content))
294
+ img.verify()
295
+
296
+ # Try OCR if available
297
+ try:
298
+ text = pytesseract.image_to_string(img)
299
+ cleaned = self._clean_content(text)
300
+
301
+ return {
302
+ "success": True,
303
+ "url": url,
304
+ "main_content": cleaned[:15000],
305
+ "content_length": len(cleaned),
306
+ "note": "Content extracted from screenshot via OCR"
307
+ }
308
+ except:
309
+ return {"success": False, "error": "OCR not available"}
310
+
311
+ except:
312
+ return {"success": False}
313
 
314
+ return {"success": False}
 
 
315
 
316
+ except:
317
+ return {"success": False}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ def _extract_from_html(self, html_content: str) -> str:
320
+ """Extract text from HTML"""
321
+ # Remove scripts and styles
322
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
323
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
 
324
 
325
  # Remove unwanted tags
326
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu']
327
  for tag in unwanted_tags:
328
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
329
 
330
+ # Extract text
331
  text = re.sub(r'<[^>]+>', ' ', html_content)
 
 
332
  text = html.unescape(text)
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  return text
335
 
336
+ def _clean_content(self, content: str) -> str:
337
+ """Clean content"""
338
  if not content:
339
  return ""
340
 
341
+ # Replace multiple whitespace
342
  content = re.sub(r'\s+', ' ', content)
343
 
344
+ # Remove control characters
345
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
346
 
347
+ # Remove excessive line breaks
348
+ content = re.sub(r'\n{3,}', '\n\n', content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
  return content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
# ==============================================
# INITIALIZE
# ==============================================

# Single shared scraper instance, used by both the FastAPI /extract
# endpoint and the Gradio UI callback below.
scraper = FreeScreenshotScraper()
357
 
358
  # ==============================================
359
  # FASTAPI APP
360
  # ==============================================
361
 
 
362
# FastAPI application; the Gradio UI is mounted onto it at the end of
# the file, so this app serves both the JSON API and the web interface.
fastapi_app = FastAPI(
    title="Free Content Extractor",
    description="Uses free APIs to extract content from websites",
    version="1.0"
)
367
 
 
368
  from fastapi.middleware.cors import CORSMiddleware
369
  from fastapi.responses import JSONResponse
370
 
 
379
@fastapi_app.get("/")
async def root():
    """Service metadata: name, version, available endpoints, APIs used."""
    endpoint_map = {
        "GET /": "This info",
        "GET /health": "Health check",
        "POST /extract": "Extract content (for n8n)",
    }
    api_list = [
        "Jina Reader (https://r.jina.ai/)",
        "WordPress mShots",
        "Render-Tron",
        "ExtractorAPI (demo)",
    ]
    return {
        "service": "Free Content Extractor",
        "version": "1.0",
        "description": "Uses free APIs (Jina Reader, etc.) to extract website content",
        "endpoints": endpoint_map,
        "free_apis_used": api_list,
    }
397
 
398
@fastapi_app.get("/health")
async def health():
    """Liveness probe: always reports healthy with the current timestamp."""
    now = time.time()
    return {"status": "healthy", "timestamp": now}
 
 
 
401
 
402
@fastapi_app.post("/extract")
async def api_extract(request: Request):
    """Extraction endpoint used by n8n.

    Expects a JSON object body like ``{"url": "https://..."}`` and returns
    the scraper's result dict.  Responds 400 for missing/invalid input and
    500 only for genuinely unexpected failures.
    """
    try:
        body = await request.json()

        # A non-object JSON body (list, string, number) used to raise
        # AttributeError on .get() and surface as a misleading 500.
        if not isinstance(body, dict):
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL is required"}
            )

        # str(... or "") also tolerates a null/non-string "url" value,
        # which previously crashed on .strip() and returned a 500.
        url = str(body.get("url") or "").strip()

        if not url:
            return JSONResponse(
                status_code=400,
                content={"success": False, "error": "URL is required"}
            )

        print(f"📨 Request: {url}")
        result = scraper.extract_content(url)

        return result

    except json.JSONDecodeError:
        return JSONResponse(
            status_code=400,
            content={"success": False, "error": "Invalid JSON"}
        )
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"success": False, "error": str(e)}
        )
430
 
431
  # ==============================================
 
433
  # ==============================================
434
 
435
def gradio_extract(url: str):
    """Gradio callback: extract *url* and format a Markdown summary.

    Returns a ``(markdown, result_dict)`` tuple matching the two Gradio
    outputs (Markdown panel + raw JSON panel).
    """
    if not url:
        return "❌ Please enter a URL", {}

    result = scraper.extract_content(url)

    if result["success"]:
        content = result["main_content"]
        content_length = result["content_length"]

        preview = content[:800]
        if len(content) > 800:
            preview += "..."

        # Optional fields must use .get(): the screenshot/OCR success path
        # does not set 'execution_time', and direct indexing crashed the UI
        # with KeyError.
        exec_time = result.get('execution_time', '?')

        output = f"""
## Success!

**URL:** {result['url']}
**Method:** {result.get('method', 'free_api')}
**Time:** {exec_time}s
**Content Length:** {content_length:,} characters

### Preview:
{preview}

*Using free APIs - may not work on all websites*
"""
        return output, result
    else:
        error = result.get("error", "Unknown error")
        suggestions = result.get("suggestions", [])

        suggestion_text = ""
        if suggestions:
            suggestion_text = "\n\n**Suggestions:**\n" + "\n".join([f"• {s}" for s in suggestions])

        return f"## ❌ Error\n\n{error}{suggestion_text}", result
473
 
474
# Create Gradio interface
# Two outputs: a human-readable Markdown summary and the raw result dict
# (the same payload the /extract API returns).
gradio_interface = gr.Interface(
    fn=gradio_extract,
    inputs=gr.Textbox(
        label="Website URL",
        placeholder="https://www.sinchew.com.my/",
        value="https://www.sinchew.com.my/"
    ),
    outputs=[
        gr.Markdown(label="Result"),
        gr.JSON(label="API Response")
    ],
    title="🌐 Free Content Extractor for n8n",
    description="Uses free APIs to extract content. Works with most websites.",
    examples=[
        ["https://www.sinchew.com.my/"],
        ["https://example.com"],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
        ["https://news.ycombinator.com"]
    ]
)
495
 
 
497
# MOUNT GRADIO TO FASTAPI
# ==============================================

# Serve the Gradio UI at "/" on the same FastAPI app that exposes /extract;
# `app` is the ASGI entry point that uvicorn runs.
app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
501
 
502
  # ==============================================
 
505
 
506
  if __name__ == "__main__":
507
  print("\n" + "="*60)
508
+ print("🌐 Free Content Extractor Starting")
509
+ print("="*60)
510
+ print("Using free APIs:")
511
+ print("• Jina Reader (r.jina.ai)")
512
+ print("• WordPress mShots")
513
+ print("• Render-Tron")
514
  print("="*60)
515
+ print("API Endpoint: POST /extract")
 
516
  print("="*60 + "\n")
517
 
518
  uvicorn.run(