yukee1992 committed on
Commit
40f056b
·
verified ·
1 Parent(s): f9380bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +372 -186
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # SMART CONTENT EXTRACTOR FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -8,22 +8,56 @@ import json
8
  import time
9
  import re
10
  import html
11
- from typing import Dict, Any
 
12
  from fastapi import FastAPI, Request
13
  import uvicorn
14
 
15
  # ==============================================
16
- # SMART CONTENT EXTRACTOR
17
  # ==============================================
18
 
19
- class SmartContentExtractor:
20
- """Extracts only main content, removes navigation, ads, footers, etc."""
21
 
22
  def __init__(self):
23
- self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def extract_content(self, url: str) -> Dict[str, Any]:
26
- """Extract only main content from webpage"""
27
  start_time = time.time()
28
 
29
  print(f"🌐 Extracting content from: {url}")
@@ -33,41 +67,39 @@ class SmartContentExtractor:
33
  url = 'https://' + url
34
 
35
  try:
36
- # Fetch the page
37
- headers = {
38
- 'User-Agent': self.user_agent,
39
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40
- 'Accept-Language': 'en-US,en;q=0.9',
41
- }
42
 
43
- response = requests.get(url, headers=headers, timeout=15)
 
44
  response.raise_for_status()
45
 
46
- # Get encoding
47
- if response.encoding is None:
48
- response.encoding = 'utf-8'
49
-
50
- html_content = response.text
51
 
52
- # Extract only main content
53
- main_content = self._extract_main_content(html_content)
54
 
55
- # Clean content
56
- cleaned_content = self._clean_content(main_content)
57
-
58
- # Extract title (separately)
59
- title = self._extract_title(html_content)
60
 
61
  # Extract metadata
62
- metadata = self._extract_metadata(html_content)
 
 
 
 
63
 
64
  return {
65
  "success": True,
66
  "url": url,
 
67
  "title": title,
68
- "main_content": cleaned_content[:20000], # Limit to 20k chars
69
  "content_length": len(cleaned_content),
70
- "content_preview": cleaned_content[:500] + ("..." if len(cleaned_content) > 500 else ""),
 
 
71
  "metadata": metadata,
72
  "status_code": response.status_code,
73
  "execution_time": round(time.time() - start_time, 2)
@@ -81,81 +113,75 @@ class SmartContentExtractor:
81
  "execution_time": round(time.time() - start_time, 2)
82
  }
83
 
84
- def _extract_main_content(self, html_content: str) -> str:
85
- """Extract only the main content, removing navigation, ads, footers, etc."""
86
-
87
- # Remove unwanted sections first
88
- html_content = self._remove_unwanted_sections(html_content)
89
-
90
- # Try to extract using regex patterns for main content
91
- content = self._extract_with_regex(html_content)
92
 
93
- # If we got decent content, return it
94
- if len(content.strip()) > 200:
95
- return content
 
 
 
96
 
97
- # Fallback: remove all HTML tags and get text
98
- return self._extract_all_text(html_content)
99
 
100
- def _remove_unwanted_sections(self, html_content: str) -> str:
101
- """Remove navigation, ads, footers, sidebars, etc."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- # Patterns to remove
104
- unwanted_patterns = [
105
- # Navigation
106
- r'<nav[^>]*>.*?</nav>',
107
- r'<header[^>]*>.*?</header>',
108
-
109
- # Footers
110
- r'<footer[^>]*>.*?</footer>',
111
-
112
- # Sidebars
113
- r'<aside[^>]*>.*?</aside>',
114
- r'<div[^>]*class="[^"]*sidebar[^"]*"[^>]*>.*?</div>',
115
-
116
- # Ads and banners
117
- r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
118
- r'<div[^>]*class="[^"]*banner[^"]*"[^>]*>.*?</div>',
119
- r'<ins[^>]*>.*?</ins>',
120
-
121
- # Social media/widgets
122
- r'<div[^>]*class="[^"]*social[^"]*"[^>]*>.*?</div>',
123
- r'<div[^>]*class="[^"]*widget[^"]*"[^>]*>.*?</div>',
124
-
125
- # Comments
126
- r'<div[^>]*class="[^"]*comment[^"]*"[^>]*>.*?</div>',
127
-
128
- # Related content
129
- r'<div[^>]*class="[^"]*related[^"]*"[^>]*>.*?</div>',
130
-
131
- # Scripts and styles
132
- r'<script[^>]*>.*?</script>',
133
- r'<style[^>]*>.*?</style>',
134
- r'<!--.*?-->',
135
- ]
136
-
137
- cleaned_html = html_content
138
- for pattern in unwanted_patterns:
139
- cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
140
 
141
- return cleaned_html
142
 
143
- def _extract_with_regex(self, html_content: str) -> str:
144
- """Extract content using regex patterns"""
 
 
 
145
 
 
146
  content_patterns = [
147
- # Look for article tags
 
 
 
 
148
  r'<article[^>]*>(.*?)</article>',
149
 
150
- # Look for main tags
151
  r'<main[^>]*>(.*?)</main>',
152
 
153
- # Look for divs with content classes
154
- r'<div[^>]*class="[^"]*(post-content|article-content|entry-content|story-content)[^"]*"[^>]*>(.*?)</div>',
155
- r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
156
 
157
- # Look for section with content
158
- r'<section[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</section>',
159
  ]
160
 
161
  all_content = []
@@ -182,138 +208,268 @@ class SmartContentExtractor:
182
 
183
  return combined
184
 
185
- return ""
 
186
 
187
- def _extract_all_text(self, html_content: str) -> str:
188
- """Extract all text as fallback, but clean it well"""
189
- # Remove scripts, styles, comments first
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
191
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
192
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
193
 
194
- # Remove common unwanted tags
195
- unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins']
196
  for tag in unwanted_tags:
197
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
198
 
199
- # Remove HTML tags
200
  text = re.sub(r'<[^>]+>', ' ', html_content)
201
 
202
  # Decode HTML entities
203
  text = html.unescape(text)
204
 
205
- # Remove very short lines
206
- lines = text.split('\n')
207
- filtered_lines = []
208
- for line in lines:
209
- line = line.strip()
210
- if len(line) > 30: # Only keep lines longer than 30 chars
211
- filtered_lines.append(line)
212
- elif any(word in line.lower() for word in ['home', 'about', 'contact', 'login', 'sign up', 'search']):
213
- continue
214
-
215
- return '\n\n'.join(filtered_lines)
 
 
 
216
 
217
- def _clean_content(self, content: str) -> str:
218
- """Clean and normalize the extracted content"""
219
  if not content:
220
  return ""
221
 
222
  # Replace multiple whitespace with single space
223
  content = re.sub(r'\s+', ' ', content)
224
 
225
- # Remove control characters
226
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
227
 
228
- # Remove common unwanted phrases
229
  unwanted_phrases = [
 
230
  r'sign up for our newsletter',
231
  r'subscribe to our newsletter',
232
  r'follow us on',
233
- r'like us on facebook',
234
- r'follow us on twitter',
235
  r'share this article',
236
  r'read more',
237
  r'continue reading',
238
  r'advertisement',
239
- r'sponsored content',
240
- r'related articles',
241
- r'you may also like',
242
  r'click here',
243
  r'learn more',
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  ]
245
 
246
  for phrase in unwanted_phrases:
247
  content = re.sub(phrase, '', content, flags=re.IGNORECASE)
248
 
249
- # Remove email addresses
250
  content = re.sub(r'\S+@\S+\.\S+', '', content)
251
-
252
- # Remove URLs
253
  content = re.sub(r'https?://\S+', '', content)
254
 
255
- # Remove excessive punctuation
256
- content = re.sub(r'[.!?]{3,}', '.', content)
257
-
258
- # Normalize spaces around punctuation
259
- content = re.sub(r'\s+([.,!?;:])', r'\1', content)
260
- content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- # Split into paragraphs and filter
263
- sentences = content.split('. ')
264
  clean_sentences = []
265
-
266
  for sentence in sentences:
267
  sentence = sentence.strip()
268
- if len(sentence) < 5:
269
  continue
270
 
271
- if len(sentence) > 30:
272
- clean_sentences.append(sentence)
273
-
274
- # Join back with proper spacing
275
- content = '. '.join(clean_sentences)
 
 
 
 
 
 
 
 
 
276
 
277
  return content.strip()
278
 
279
- def _extract_title(self, html_content: str) -> str:
280
- """Extract page title"""
281
- title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
282
  if title_match:
283
  title = title_match.group(1)
284
  title = re.sub(r'\s+', ' ', title).strip()
285
- return title[:200]
286
- return "No title found"
 
 
 
 
 
287
 
288
  def _extract_metadata(self, html_content: str) -> Dict[str, str]:
289
- """Extract basic metadata"""
290
  metadata = {}
291
 
292
- # Meta description
293
- desc_match = re.search(r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
294
- html_content, re.IGNORECASE)
295
- if desc_match:
296
- metadata['description'] = desc_match.group(1)[:300]
 
 
 
 
 
 
297
 
298
- # Meta keywords
299
  keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
300
  html_content, re.IGNORECASE)
301
  if keywords_match:
302
- metadata['keywords'] = keywords_match.group(1)[:300]
303
 
304
  # Author
305
  author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
306
  html_content, re.IGNORECASE)
307
  if author_match:
308
- metadata['author'] = author_match.group(1)[:200]
 
 
 
 
 
309
 
310
  return metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  # ==============================================
313
  # INITIALIZE
314
  # ==============================================
315
 
316
- extractor = SmartContentExtractor()
317
 
318
  # ==============================================
319
  # FASTAPI APP
@@ -321,9 +477,9 @@ extractor = SmartContentExtractor()
321
 
322
  # Create FastAPI app
323
  fastapi_app = FastAPI(
324
- title="Smart Content Extractor",
325
- description="Extracts only main content from webpages",
326
- version="1.0"
327
  )
328
 
329
  # Add CORS middleware
@@ -341,13 +497,17 @@ fastapi_app.add_middleware(
341
  @fastapi_app.get("/")
342
  async def root():
343
  return {
344
- "service": "Smart Content Extractor",
345
- "version": "1.0",
346
- "description": "Extracts only main content from webpages",
347
  "endpoints": {
348
- "GET /": "This info",
349
- "GET /health": "Health check",
350
- "POST /extract": "Extract main content (for n8n)"
 
 
 
 
351
  }
352
  }
353
 
@@ -360,7 +520,7 @@ async def health():
360
 
361
  @fastapi_app.post("/extract")
362
  async def api_extract(request: Request):
363
- """API endpoint for n8n - extracts only main content"""
364
  try:
365
  body = await request.json()
366
  url = body.get("url", "").strip()
@@ -368,10 +528,10 @@ async def api_extract(request: Request):
368
  if not url:
369
  return JSONResponse(
370
  status_code=400,
371
- content={"success": False, "error": "URL parameter is required"}
372
  )
373
 
374
- print(f"📨 Content extraction request: {url}")
375
  result = extractor.extract_content(url)
376
 
377
  return result
@@ -379,12 +539,12 @@ async def api_extract(request: Request):
379
  except json.JSONDecodeError:
380
  return JSONResponse(
381
  status_code=400,
382
- content={"success": False, "error": "Invalid JSON payload"}
383
  )
384
  except Exception as e:
385
  return JSONResponse(
386
  status_code=500,
387
- content={"success": False, "error": f"Internal error: {str(e)}"}
388
  )
389
 
390
  # ==============================================
@@ -394,24 +554,46 @@ async def api_extract(request: Request):
394
  def gradio_extract(url: str):
395
  """Gradio interface function"""
396
  if not url:
397
- return "❌ Please enter a URL", {}
398
 
399
  result = extractor.extract_content(url)
400
 
401
  if result["success"]:
402
  content = result["main_content"]
403
  content_length = result["content_length"]
 
404
 
405
  # Create preview
406
- preview = content[:500]
407
- if len(content) > 500:
408
- preview += "..."
409
-
410
- output = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  ## ✅ Content Extracted Successfully!
412
 
413
  **URL:** {result['url']}
414
- **Title:** {result.get('title', 'N/A')}
415
  **Time:** {result['execution_time']}s
416
  **Content Length:** {content_length:,} characters
417
 
@@ -420,26 +602,29 @@ def gradio_extract(url: str):
420
  """
421
  return output, result
422
  else:
423
- return f"## ❌ Error\n\n{result.get('error', 'Unknown error')}", result
 
424
 
425
- # Create Gradio interface (removed allow_flagging parameter)
426
  gradio_interface = gr.Interface(
427
  fn=gradio_extract,
428
  inputs=gr.Textbox(
429
- label="Website URL",
430
- placeholder="https://example.com",
431
- value="https://en.wikipedia.org/wiki/Artificial_intelligence"
432
  ),
433
  outputs=[
434
- gr.Markdown(label="Result"),
435
- gr.JSON(label="API Response")
436
  ],
437
- title="🧠 Smart Content Extractor for n8n",
438
- description="Extracts ONLY main content - removes navigation, ads, footers, sidebars, etc.",
439
  examples=[
440
- ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
 
 
441
  ["https://example.com"],
442
- ["https://news.ycombinator.com"]
443
  ]
444
  )
445
 
@@ -456,10 +641,11 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
456
 
457
  if __name__ == "__main__":
458
  print("\n" + "="*60)
459
- print("🧠 Smart Content Extractor Starting")
 
460
  print("="*60)
461
- print("API Endpoint: POST /extract")
462
- print("Web Interface: GET /")
463
  print("="*60 + "\n")
464
 
465
  uvicorn.run(
 
1
  # ==============================================
2
+ # SMART CONTENT EXTRACTOR FOR CHINESE WEBSITES
3
  # ==============================================
4
 
5
  import gradio as gr
 
8
  import time
9
  import re
10
  import html
11
+ import chardet
12
+ from typing import Dict, Any, Optional
13
  from fastapi import FastAPI, Request
14
  import uvicorn
15
 
16
  # ==============================================
17
+ # ENHANCED CONTENT EXTRACTOR FOR CHINESE
18
  # ==============================================
19
 
20
+ class ChineseContentExtractor:
21
+ """Enhanced content extractor optimized for Chinese websites"""
22
 
23
  def __init__(self):
24
+ # Chinese browser user agents
25
+ self.user_agents = [
26
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
27
+ "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
28
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/120.0",
29
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
30
+ ]
31
+
32
+ # Common Chinese website patterns
33
+ self.chinese_site_patterns = [
34
+ r'\.cn$',
35
+ r'\.com\.cn$',
36
+ r'baidu\.com',
37
+ r'qq\.com',
38
+ r'sina\.com\.cn',
39
+ r'sohu\.com',
40
+ r'163\.com',
41
+ r'jd\.com',
42
+ r'taobao\.com',
43
+ r'alibaba\.com',
44
+ r'zhihu\.com',
45
+ r'bilibili\.com',
46
+ r'weibo\.com',
47
+ r'douyin\.com',
48
+ r'douban\.com',
49
+ r'ximalaya\.com',
50
+ ]
51
+
52
def is_chinese_website(self, url: str) -> bool:
    """Return True when the URL's host matches a known Chinese-site pattern.

    The patterns in ``self.chinese_site_patterns`` are applied to the host
    name only. Matching the whole URL made end-anchored patterns such as
    ``\\.cn$`` fail for every URL that has a path, and let a domain name
    appearing in a path or query string produce a false positive.

    Args:
        url: Absolute URL, or a bare ``host/path`` without a scheme.

    Returns:
        True if any pattern matches the host (case-insensitive).
    """
    from urllib.parse import urlsplit

    if not url:
        return False

    candidate = url.strip()
    # urlsplit only recognises a network location after "//".
    if '://' not in candidate:
        candidate = '//' + candidate

    try:
        host = urlsplit(candidate).hostname or ''
    except ValueError:
        # Malformed netloc (e.g. a broken IPv6 literal): fall back to
        # matching the raw text rather than raising.
        host = url

    return any(
        re.search(pattern, host, re.IGNORECASE)
        for pattern in self.chinese_site_patterns
    )
58
 
59
  def extract_content(self, url: str) -> Dict[str, Any]:
60
+ """Extract content with Chinese website support"""
61
  start_time = time.time()
62
 
63
  print(f"🌐 Extracting content from: {url}")
 
67
  url = 'https://' + url
68
 
69
  try:
70
+ # Determine if Chinese website
71
+ is_chinese = self.is_chinese_website(url)
 
 
 
 
72
 
73
+ # Fetch the page with appropriate settings
74
+ response = self._fetch_with_encoding(url, is_chinese)
75
  response.raise_for_status()
76
 
77
+ # Get correct encoding
78
+ content, encoding = self._decode_content(response)
 
 
 
79
 
80
+ # Extract main content
81
+ main_content = self._extract_main_content(content, is_chinese)
82
 
83
+ # Clean content (preserve Chinese characters)
84
+ cleaned_content = self._clean_content(main_content, is_chinese)
 
 
 
85
 
86
  # Extract metadata
87
+ title = self._extract_title(content, encoding)
88
+ metadata = self._extract_metadata(content)
89
+
90
+ # Calculate content stats
91
+ chinese_char_count = self._count_chinese_characters(cleaned_content)
92
 
93
  return {
94
  "success": True,
95
  "url": url,
96
+ "is_chinese_website": is_chinese,
97
  "title": title,
98
+ "main_content": cleaned_content[:25000], # Increased limit for Chinese
99
  "content_length": len(cleaned_content),
100
+ "chinese_char_count": chinese_char_count,
101
+ "encoding_used": encoding,
102
+ "content_preview": cleaned_content[:800] + ("..." if len(cleaned_content) > 800 else ""),
103
  "metadata": metadata,
104
  "status_code": response.status_code,
105
  "execution_time": round(time.time() - start_time, 2)
 
113
  "execution_time": round(time.time() - start_time, 2)
114
  }
115
 
116
def _fetch_with_encoding(self, url: str, is_chinese: bool) -> requests.Response:
    """Fetch ``url`` with browser-like headers.

    Chinese sites are asked for Chinese content and for an uncached copy;
    everything else is requested in English. The previous version computed
    an ``Accept-Language`` value for Chinese sites and then immediately
    overwrote it; the header is now set once, to the value that was
    actually sent.

    Args:
        url: Absolute URL to request.
        is_chinese: Whether the target was classified as a Chinese site.

    Returns:
        The response object; the status code is not checked here.
    """
    accept_language = 'zh-CN,zh;q=0.9' if is_chinese else 'en-US,en;q=0.9'

    headers = {
        'User-Agent': self.user_agents[0],
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': accept_language,
        'Accept-Encoding': 'gzip, deflate',
    }
    if is_chinese:
        headers['Cache-Control'] = 'no-cache'

    return requests.get(url, headers=headers, timeout=15)
134
 
135
+ def _decode_content(self, response: requests.Response) -> tuple[str, str]:
136
+ """Decode content with proper encoding detection"""
137
+ # Try to detect encoding
138
+ if response.encoding:
139
+ encoding = response.encoding.lower()
140
+ else:
141
+ # Use chardet to detect encoding
142
+ detected = chardet.detect(response.content)
143
+ encoding = detected.get('encoding', 'utf-8').lower()
144
+
145
+ # Handle common Chinese encodings
146
+ if encoding in ['gb2312', 'gbk', 'gb18030']:
147
+ encoding = 'gb18030' # Most comprehensive Chinese encoding
148
+ elif encoding == 'big5':
149
+ encoding = 'big5' # Traditional Chinese
150
+ else:
151
+ encoding = 'utf-8' # Default to UTF-8
152
 
153
+ try:
154
+ content = response.content.decode(encoding, errors='replace')
155
+ except:
156
+ # Fallback to UTF-8 with error replacement
157
+ content = response.content.decode('utf-8', errors='replace')
158
+ encoding = 'utf-8'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
+ return content, encoding
161
 
162
+ def _extract_main_content(self, html_content: str, is_chinese: bool) -> str:
163
+ """Extract main content with Chinese website optimizations"""
164
+
165
+ # Remove unwanted sections
166
+ html_content = self._remove_unwanted_sections(html_content, is_chinese)
167
 
168
+ # Chinese websites often have specific content patterns
169
  content_patterns = [
170
+ # Common Chinese content containers
171
+ r'<div[^>]*class="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
172
+ r'<div[^>]*id="[^"]*(content|article|post|正文|内容)[^"]*"[^>]*>(.*?)</div>',
173
+
174
+ # Article tags
175
  r'<article[^>]*>(.*?)</article>',
176
 
177
+ # Main content area
178
  r'<main[^>]*>(.*?)</main>',
179
 
180
+ # Chinese specific patterns
181
+ r'<div[^>]*class="[^"]*(detail|content-main|article-content)[^"]*"[^>]*>(.*?)</div>',
 
182
 
183
+ # For news websites
184
+ r'<div[^>]*class="[^"]*(news-content|news-body|news-article)[^"]*"[^>]*>(.*?)</div>',
185
  ]
186
 
187
  all_content = []
 
208
 
209
  return combined
210
 
211
+ # Fallback: extract all text and clean
212
+ return self._extract_all_text(html_content, is_chinese)
213
 
214
+ def _remove_unwanted_sections(self, html_content: str, is_chinese: bool) -> str:
215
+ """Remove unwanted sections with Chinese-specific patterns"""
216
+
217
+ # Base patterns
218
+ unwanted_patterns = [
219
+ # Navigation
220
+ r'<nav[^>]*>.*?</nav>',
221
+ r'<header[^>]*>.*?</header>',
222
+
223
+ # Footers
224
+ r'<footer[^>]*>.*?</footer>',
225
+
226
+ # Sidebars
227
+ r'<aside[^>]*>.*?</aside>',
228
+
229
+ # Ads
230
+ r'<div[^>]*class="[^"]*ad[^"]*"[^>]*>.*?</div>',
231
+ r'<ins[^>]*>.*?</ins>',
232
+
233
+ # Scripts and styles
234
+ r'<script[^>]*>.*?</script>',
235
+ r'<style[^>]*>.*?</style>',
236
+ r'<!--.*?-->',
237
+ ]
238
+
239
+ # Chinese-specific unwanted patterns
240
+ if is_chinese:
241
+ chinese_patterns = [
242
+ # Chinese navigation/menus (导航, 菜单)
243
+ r'<div[^>]*class="[^"]*(导航|菜单|nav)[^"]*"[^>]*>.*?</div>',
244
+ r'<ul[^>]*class="[^"]*(导航|菜单)[^"]*"[^>]*>.*?</ul>',
245
+
246
+ # Sidebars (侧边栏)
247
+ r'<div[^>]*class="[^"]*(侧边栏|sidebar)[^"]*"[^>]*>.*?</div>',
248
+
249
+ # Comments (评论)
250
+ r'<div[^>]*class="[^"]*(评论|comment)[^"]*"[^>]*>.*?</div>',
251
+
252
+ # Related articles (相关文章)
253
+ r'<div[^>]*class="[^"]*(相关|related)[^"]*"[^>]*>.*?</div>',
254
+
255
+ # Hot posts (热门)
256
+ r'<div[^>]*class="[^"]*(热门|hot)[^"]*"[^>]*>.*?</div>',
257
+
258
+ # Recommendations (推荐)
259
+ r'<div[^>]*class="[^"]*(推荐|recommend)[^"]*"[^>]*>.*?</div>',
260
+
261
+ # Share buttons (分享)
262
+ r'<div[^>]*class="[^"]*(分享|share)[^"]*"[^>]*>.*?</div>',
263
+ ]
264
+ unwanted_patterns.extend(chinese_patterns)
265
+
266
+ cleaned_html = html_content
267
+ for pattern in unwanted_patterns:
268
+ cleaned_html = re.sub(pattern, ' ', cleaned_html, flags=re.DOTALL | re.IGNORECASE)
269
+
270
+ return cleaned_html
271
+
272
+ def _extract_all_text(self, html_content: str, is_chinese: bool) -> str:
273
+ """Extract all text with Chinese character preservation"""
274
+ # Remove scripts, styles, comments
275
  html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
276
  html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
277
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
278
 
279
+ # Remove unwanted tags
280
+ unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'ins', 'meta', 'link']
281
  for tag in unwanted_tags:
282
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
283
 
284
+ # Remove HTML tags but preserve text
285
  text = re.sub(r'<[^>]+>', ' ', html_content)
286
 
287
  # Decode HTML entities
288
  text = html.unescape(text)
289
 
290
+ # Chinese-specific cleaning
291
+ if is_chinese:
292
+ # Keep Chinese text blocks
293
+ lines = text.split('\n')
294
+ filtered_lines = []
295
+ for line in lines:
296
+ line = line.strip()
297
+ # Keep lines with significant Chinese content
298
+ chinese_chars = self._count_chinese_characters(line)
299
+ if chinese_chars > 5 or len(line) > 50:
300
+ filtered_lines.append(line)
301
+ text = '\n\n'.join(filtered_lines)
302
+
303
+ return text
304
 
305
+ def _clean_content(self, content: str, is_chinese: bool) -> str:
306
+ """Clean content while preserving Chinese characters"""
307
  if not content:
308
  return ""
309
 
310
  # Replace multiple whitespace with single space
311
  content = re.sub(r'\s+', ' ', content)
312
 
313
+ # Remove control characters but preserve Chinese/Unicode
314
  content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
315
 
316
+ # Remove unwanted phrases (both English and Chinese)
317
  unwanted_phrases = [
318
+ # English
319
  r'sign up for our newsletter',
320
  r'subscribe to our newsletter',
321
  r'follow us on',
 
 
322
  r'share this article',
323
  r'read more',
324
  r'continue reading',
325
  r'advertisement',
 
 
 
326
  r'click here',
327
  r'learn more',
328
+
329
+ # Chinese
330
+ r'订阅我们的新闻',
331
+ r'关注我们',
332
+ r'分享这篇文章',
333
+ r'阅读更多',
334
+ r'继续阅读',
335
+ r'广告',
336
+ r'点击这里',
337
+ r'了解更多',
338
+ r'相关文章',
339
+ r'热门推荐',
340
+ r'猜你喜欢',
341
  ]
342
 
343
  for phrase in unwanted_phrases:
344
  content = re.sub(phrase, '', content, flags=re.IGNORECASE)
345
 
346
+ # Remove email addresses and URLs
347
  content = re.sub(r'\S+@\S+\.\S+', '', content)
 
 
348
  content = re.sub(r'https?://\S+', '', content)
349
 
350
+ # For Chinese content, clean differently
351
+ if is_chinese:
352
+ # Remove excessive punctuation but preserve Chinese punctuation
353
+ content = re.sub(r'[。!?]{3,}', '。', content)
354
+ content = re.sub(r'[\.,!?]{3,}', '.', content)
355
+
356
+ # Normalize Chinese punctuation spacing
357
+ content = re.sub(r'\s+([。,!?;:])', r'\1', content)
358
+ content = re.sub(r'([。,!?;:])\s+', r'\1', content)
359
+ else:
360
+ # Normalize English punctuation spacing
361
+ content = re.sub(r'\s+([.,!?;:])', r'\1', content)
362
+ content = re.sub(r'([.,!?;:])\s+', r'\1 ', content)
363
+
364
+ # Split and filter paragraphs
365
+ if is_chinese:
366
+ # Split by Chinese sentence endings
367
+ sentences = re.split(r'[。!?]', content)
368
+ else:
369
+ # Split by English sentence endings
370
+ sentences = re.split(r'[.!?]', content)
371
 
 
 
372
  clean_sentences = []
 
373
  for sentence in sentences:
374
  sentence = sentence.strip()
375
+ if not sentence:
376
  continue
377
 
378
+ # Keep sentences with meaningful content
379
+ if is_chinese:
380
+ chinese_chars = self._count_chinese_characters(sentence)
381
+ if chinese_chars > 3 or len(sentence) > 20:
382
+ clean_sentences.append(sentence)
383
+ else:
384
+ if len(sentence) > 20:
385
+ clean_sentences.append(sentence)
386
+
387
+ # Join back with appropriate punctuation
388
+ if is_chinese:
389
+ content = '。'.join(clean_sentences) + ('。' if clean_sentences else '')
390
+ else:
391
+ content = '. '.join(clean_sentences) + ('.' if clean_sentences else '')
392
 
393
  return content.strip()
394
 
395
+ def _extract_title(self, html_content: str, encoding: str) -> str:
396
+ """Extract page title with encoding support"""
397
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
398
  if title_match:
399
  title = title_match.group(1)
400
  title = re.sub(r'\s+', ' ', title).strip()
401
+ # Ensure title is properly decoded
402
+ try:
403
+ title = html.unescape(title)
404
+ except:
405
+ pass
406
+ return title[:300]
407
+ return "未找到标题" if 'gb' in encoding or 'big5' in encoding else "No title found"
408
 
409
  def _extract_metadata(self, html_content: str) -> Dict[str, str]:
410
+ """Extract metadata including Chinese meta tags"""
411
  metadata = {}
412
 
413
+ # Meta description (supports both English and Chinese)
414
+ desc_patterns = [
415
+ r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
416
+ r'<meta[^>]*property=["\']og:description["\'][^>]*content=["\'](.*?)["\']',
417
+ ]
418
+
419
+ for pattern in desc_patterns:
420
+ match = re.search(pattern, html_content, re.IGNORECASE)
421
+ if match:
422
+ metadata['description'] = html.unescape(match.group(1))[:500]
423
+ break
424
 
425
+ # Keywords
426
  keywords_match = re.search(r'<meta[^>]*name=["\']keywords["\'][^>]*content=["\'](.*?)["\']',
427
  html_content, re.IGNORECASE)
428
  if keywords_match:
429
+ metadata['keywords'] = html.unescape(keywords_match.group(1))[:500]
430
 
431
  # Author
432
  author_match = re.search(r'<meta[^>]*name=["\']author["\'][^>]*content=["\'](.*?)["\']',
433
  html_content, re.IGNORECASE)
434
  if author_match:
435
+ metadata['author'] = html.unescape(author_match.group(1))[:200]
436
+
437
+ # Charset
438
+ charset_match = re.search(r'<meta[^>]*charset=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
439
+ if charset_match:
440
+ metadata['charset'] = charset_match.group(1)
441
 
442
  return metadata
443
+
444
+ def _count_chinese_characters(self, text: str) -> int:
445
+ """Count Chinese characters in text"""
446
+ # Chinese character ranges in Unicode
447
+ chinese_ranges = [
448
+ (0x4E00, 0x9FFF), # CJK Unified Ideographs
449
+ (0x3400, 0x4DBF), # CJK Unified Ideographs Extension A
450
+ (0x20000, 0x2A6DF), # CJK Unified Ideographs Extension B
451
+ (0x2A700, 0x2B73F), # CJK Unified Ideographs Extension C
452
+ (0x2B740, 0x2B81F), # CJK Unified Ideographs Extension D
453
+ (0x2B820, 0x2CEAF), # CJK Unified Ideographs Extension E
454
+ (0xF900, 0xFAFF), # CJK Compatibility Ideographs
455
+ (0x2F800, 0x2FA1F), # CJK Compatibility Ideographs Supplement
456
+ ]
457
+
458
+ count = 0
459
+ for char in text:
460
+ char_code = ord(char)
461
+ for start, end in chinese_ranges:
462
+ if start <= char_code <= end:
463
+ count += 1
464
+ break
465
+
466
+ return count
467
 
468
  # ==============================================
469
  # INITIALIZE
470
  # ==============================================
471
 
472
+ extractor = ChineseContentExtractor()
473
 
474
  # ==============================================
475
  # FASTAPI APP
 
477
 
478
  # Create FastAPI app
479
  fastapi_app = FastAPI(
480
+ title="智能内容提取器 (中文网站优化)",
481
+ description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容",
482
+ version="2.0"
483
  )
484
 
485
  # Add CORS middleware
 
497
@fastapi_app.get("/")
async def root():
    """Describe the service: name, version, available endpoints, sample URLs."""
    endpoints = {
        "GET /": "API信息",
        "GET /health": "健康检查",
        "POST /extract": "提取主要内容 (n8n专用)",
    }
    examples = {
        "中文网站": "https://zhihu.com",
        "英文网站": "https://example.com",
    }
    info = {
        "service": "智能内容提取器",
        "version": "2.0",
        "description": "专门优化中文网站的内容提取器",
    }
    info["endpoints"] = endpoints
    info["examples"] = examples
    return info
513
 
 
520
 
521
  @fastapi_app.post("/extract")
522
  async def api_extract(request: Request):
523
+ """API endpoint for n8n - optimized for Chinese websites"""
524
  try:
525
  body = await request.json()
526
  url = body.get("url", "").strip()
 
528
  if not url:
529
  return JSONResponse(
530
  status_code=400,
531
+ content={"success": False, "error": "URL参数是必需的"}
532
  )
533
 
534
+ print(f"📨 内容提取请求: {url}")
535
  result = extractor.extract_content(url)
536
 
537
  return result
 
539
  except json.JSONDecodeError:
540
  return JSONResponse(
541
  status_code=400,
542
+ content={"success": False, "error": "无效的JSON数据"}
543
  )
544
  except Exception as e:
545
  return JSONResponse(
546
  status_code=500,
547
+ content={"success": False, "error": f"内部错误: {str(e)}"}
548
  )
549
 
550
  # ==============================================
 
554
  def gradio_extract(url: str):
555
  """Gradio interface function"""
556
  if not url:
557
+ return "❌ 请输入URL", {}
558
 
559
  result = extractor.extract_content(url)
560
 
561
  if result["success"]:
562
  content = result["main_content"]
563
  content_length = result["content_length"]
564
+ is_chinese = result.get("is_chinese_website", False)
565
 
566
  # Create preview
567
+ if is_chinese:
568
+ # For Chinese, show first 600 characters
569
+ preview = content[:600]
570
+ if len(content) > 600:
571
+ preview += "..."
572
+ else:
573
+ # For English, show first 500 characters
574
+ preview = content[:500]
575
+ if len(content) > 500:
576
+ preview += "..."
577
+
578
+ if is_chinese:
579
+ output = f"""
580
+ ## ✅ 内容提取成功!
581
+
582
+ **网址:** {result['url']}
583
+ **标题:** {result.get('title', '无标题')}
584
+ **时间:** {result['execution_time']}秒
585
+ **内容长度:** {content_length:,} 字符
586
+ **中文字符数:** {result.get('chinese_char_count', 0):,}
587
+
588
+ ### 内容预览:
589
+ {preview}
590
+ """
591
+ else:
592
+ output = f"""
593
  ## ✅ Content Extracted Successfully!
594
 
595
  **URL:** {result['url']}
596
+ **Title:** {result.get('title', 'No title')}
597
  **Time:** {result['execution_time']}s
598
  **Content Length:** {content_length:,} characters
599
 
 
602
  """
603
  return output, result
604
  else:
605
+ error_msg = result.get("error", "未知错误")
606
+ return f"## ❌ 错误\n\n{error_msg}", result
607
 
608
+ # Create Gradio interface
609
  gradio_interface = gr.Interface(
610
  fn=gradio_extract,
611
  inputs=gr.Textbox(
612
+ label="网站网址 / Website URL",
613
+ placeholder="请输入网址 (如: https://zhihu.com)",
614
+ value="https://zhihu.com"
615
  ),
616
  outputs=[
617
+ gr.Markdown(label="结果 / Result"),
618
+ gr.JSON(label="API响应 / API Response")
619
  ],
620
+ title="🧠 智能内容提取器 (中文优化) / Smart Content Extractor (Chinese Optimized)",
621
+ description="专门优化中文网站的内容提取器,去除导航、广告、页脚等无关内容 / Optimized for Chinese websites, removes navigation, ads, footers, etc.",
622
  examples=[
623
+ ["https://zhihu.com"],
624
+ ["https://baidu.com"],
625
+ ["https://news.sina.com.cn"],
626
  ["https://example.com"],
627
+ ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
628
  ]
629
  )
630
 
 
641
 
642
  if __name__ == "__main__":
643
  print("\n" + "="*60)
644
+ print("🧠 智能内容提取器启动中...")
645
+ print("Smart Content Extractor Starting...")
646
  print("="*60)
647
+ print("API端点 / API Endpoint: POST /extract")
648
+ print("网页界面 / Web Interface: GET /")
649
  print("="*60 + "\n")
650
 
651
  uvicorn.run(