yukee1992 committed on
Commit
2448858
·
verified ·
1 Parent(s): d982093

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -115
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==============================================
2
- # SIMPLE FREE CONTENT EXTRACTOR FOR N8N
3
  # ==============================================
4
 
5
  import gradio as gr
@@ -13,17 +13,17 @@ from fastapi import FastAPI, Request
13
  import uvicorn
14
 
15
  # ==============================================
16
- # SIMPLE CONTENT EXTRACTOR
17
  # ==============================================
18
 
19
- class SimpleContentExtractor:
20
- """Simple extractor using Jina Reader API + fallbacks"""
21
 
22
  def __init__(self):
23
  self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
 
25
  def extract_content(self, url: str) -> Dict[str, Any]:
26
- """Extract content using free APIs"""
27
  start_time = time.time()
28
 
29
  print(f"🌐 Extracting: {url}")
@@ -32,13 +32,15 @@ class SimpleContentExtractor:
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
34
 
35
- # Try multiple strategies
36
  strategies = [
37
- self._try_jina_reader,
38
- self._try_direct_request,
39
- self._try_googlebot,
40
  ]
41
 
 
 
42
  for i, strategy in enumerate(strategies):
43
  try:
44
  print(f" Trying strategy {i+1}...")
@@ -50,37 +52,39 @@ class SimpleContentExtractor:
50
  return result
51
 
52
  except Exception as e:
 
53
  print(f" Strategy {i+1} failed: {e}")
54
- time.sleep(0.3) # Small delay
55
 
56
  # All failed
57
  return {
58
  "success": False,
59
  "url": url,
60
- "error": "Failed to extract content",
61
  "execution_time": round(time.time() - start_time, 2),
62
- "suggestion": "Try using Jina Reader directly: https://r.jina.ai/your-url"
63
  }
64
 
65
- def _try_jina_reader(self, url: str) -> Dict[str, Any]:
66
- """Try Jina Reader API (free, no API key, handles JavaScript)"""
67
  try:
68
- # Jina Reader endpoint
69
- api_url = f"https://r.jina.ai/{url}"
70
 
71
- # Try with different formats
72
- formats = [
73
- {"headers": {"Accept": "text/plain"}},
74
- {"headers": {"Accept": "application/json"}},
75
- {"url": f"https://r.jina.ai/{url}?format=json"},
76
  ]
77
 
78
- for fmt in formats:
79
  try:
80
- headers = fmt.get("headers", {"Accept": "text/plain", "User-Agent": self.user_agent})
81
- api_url_to_use = fmt.get("url", api_url)
82
-
83
- response = requests.get(api_url_to_use, headers=headers, timeout=30)
 
84
 
85
  if response.status_code == 200:
86
  content = response.text
@@ -93,8 +97,6 @@ class SimpleContentExtractor:
93
  content = data["content"]
94
  elif "data" in data:
95
  content = str(data["data"])
96
- elif "text" in data:
97
- content = data["text"]
98
  except:
99
  pass # Keep as plain text
100
 
@@ -107,44 +109,40 @@ class SimpleContentExtractor:
107
  return {
108
  "success": True,
109
  "url": url,
110
- "title": title[:300] if title else "Extracted via Jina Reader",
111
- "main_content": cleaned[:35000],
112
  "content_length": len(cleaned),
113
- "content_preview": cleaned[:1000] + ("..." if len(cleaned) > 1000 else ""),
114
  "source": "jina_reader",
115
- "note": "Content extracted via free Jina Reader API (handles JavaScript)",
116
  "status": response.status_code
117
  }
118
 
 
 
 
119
  except Exception as e:
120
- print(f" Jina format failed: {e}")
121
  continue
122
 
123
- return {"success": False, "error": f"Jina returned status: {response.status_code}"}
124
 
125
  except Exception as e:
126
- return {"success": False, "error": f"Jina API error: {str(e)}"}
127
 
128
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
129
- """Try direct HTTP request with smart headers"""
130
  headers_list = [
131
- # Normal browser
132
  {
133
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
134
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
135
  "Accept-Language": "en-US,en;q=0.9",
136
- "Accept-Encoding": "gzip, deflate",
137
- "Connection": "keep-alive",
138
  },
139
- # Mobile browser
140
  {
141
  "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
142
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
143
- "Accept-Language": "en-US,en;q=0.9",
144
  },
145
- # Simple headers
146
  {
147
- "User-Agent": "Mozilla/5.0",
148
  "Accept": "text/html",
149
  },
150
  ]
@@ -163,12 +161,12 @@ class SimpleContentExtractor:
163
  # Extract title
164
  title = self._extract_title_from_html(html_content)
165
 
166
- if len(cleaned) > 100: # If we got meaningful content
167
  return {
168
  "success": True,
169
  "url": url,
170
- "title": title[:300] if title else "Extracted via direct request",
171
- "main_content": cleaned[:30000],
172
  "content_length": len(cleaned),
173
  "source": "direct_request",
174
  "status": response.status_code
@@ -180,16 +178,15 @@ class SimpleContentExtractor:
180
 
181
  return {"success": False}
182
 
183
- def _try_googlebot(self, url: str) -> Dict[str, Any]:
184
- """Pretend to be Googlebot"""
185
  try:
186
- headers = {
187
- "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
188
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
189
- "From": "googlebot(at)googlebot.com",
190
- }
191
-
192
- response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
193
 
194
  if response.status_code == 200:
195
  html_content = response.text
@@ -197,15 +194,14 @@ class SimpleContentExtractor:
197
  cleaned = self._clean_content(text_content)
198
  title = self._extract_title_from_html(html_content)
199
 
200
- if len(cleaned) > 100:
201
  return {
202
  "success": True,
203
  "url": url,
204
- "title": title[:300] if title else "Extracted as Googlebot",
205
- "main_content": cleaned[:30000],
206
  "content_length": len(cleaned),
207
- "source": "googlebot",
208
- "status": response.status_code
209
  }
210
 
211
  return {"success": False}
@@ -216,11 +212,11 @@ class SimpleContentExtractor:
216
  def _extract_from_html(self, html_content: str) -> str:
217
  """Extract text from HTML"""
218
  # Remove scripts and styles
219
- html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
220
- html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
221
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
222
 
223
- # Remove unwanted sections
224
  unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
225
  for tag in unwanted_tags:
226
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
@@ -229,7 +225,7 @@ class SimpleContentExtractor:
229
  text = re.sub(r'<[^>]+>', ' ', html_content)
230
  text = html.unescape(text)
231
 
232
- # Remove excessive whitespace
233
  text = re.sub(r'\s+', ' ', text)
234
 
235
  return text
@@ -245,16 +241,16 @@ class SimpleContentExtractor:
245
  return ""
246
 
247
  def _extract_title_from_text(self, text: str) -> str:
248
- """Try to extract title from plain text"""
249
  # Look for title patterns
250
  patterns = [
251
  r'Title:\s*(.*?)(?:\n|$)',
252
  r'#\s+(.*?)(?:\n|$)',
253
- r'^(.*?)(?:\n|$)',
254
  ]
255
 
256
  for pattern in patterns:
257
- match = re.search(pattern, text[:500], re.IGNORECASE)
258
  if match:
259
  title = match.group(1).strip()
260
  if len(title) > 10 and len(title) < 200:
@@ -284,37 +280,30 @@ class SimpleContentExtractor:
284
  r'sign up for',
285
  r'subscribe to',
286
  r'follow us on',
287
- r'like us on facebook',
288
  r'share this article',
289
  r'read more',
290
  r'continue reading',
291
- r'click here',
292
- r'learn more',
293
  ]
294
 
295
  for phrase in unwanted:
296
  content = re.sub(phrase, '', content, flags=re.IGNORECASE)
297
 
298
- # Remove email addresses and URLs
299
- content = re.sub(r'\S+@\S+\.\S+', '', content)
300
- content = re.sub(r'https?://\S+', '', content)
301
-
302
  return content.strip()
303
 
304
  # ==============================================
305
  # INITIALIZE
306
  # ==============================================
307
 
308
- extractor = SimpleContentExtractor()
309
 
310
  # ==============================================
311
  # FASTAPI APP
312
  # ==============================================
313
 
314
  fastapi_app = FastAPI(
315
- title="Free Content Extractor",
316
- description="Extracts content using free Jina Reader API and fallbacks",
317
- version="1.0"
318
  )
319
 
320
  from fastapi.middleware.cors import CORSMiddleware
@@ -331,25 +320,25 @@ fastapi_app.add_middleware(
331
  @fastapi_app.get("/")
332
  async def root():
333
  return {
334
- "service": "Free Content Extractor",
335
- "version": "1.0",
336
- "description": "Extracts website content using free Jina Reader API (handles JavaScript)",
337
  "endpoints": {
338
  "GET /": "This info",
339
- "GET /health": "Health check",
340
  "POST /extract": "Extract content"
341
  },
342
- "usage_n8n": {
343
- "method": "POST",
344
- "url": "https://your-space.hf.space/extract",
345
- "body": {"url": "https://example.com"}
346
- },
347
- "alternative": "Use Jina Reader directly: GET https://r.jina.ai/your-url"
348
  }
349
 
350
  @fastapi_app.get("/health")
351
  async def health():
352
- return {"status": "healthy", "timestamp": time.time()}
 
 
 
 
 
353
 
354
  @fastapi_app.post("/extract")
355
  async def api_extract(request: Request):
@@ -385,7 +374,7 @@ async def api_extract(request: Request):
385
  # ==============================================
386
 
387
  def gradio_extract(url: str):
388
- """Gradio interface function"""
389
  if not url:
390
  return "❌ Please enter a URL", {}
391
 
@@ -395,9 +384,8 @@ def gradio_extract(url: str):
395
  content = result["main_content"]
396
  content_length = result["content_length"]
397
 
398
- # Create preview
399
- preview = content[:800]
400
- if len(content) > 800:
401
  preview += "..."
402
 
403
  output = f"""
@@ -405,44 +393,36 @@ def gradio_extract(url: str):
405
 
406
  **URL:** {result['url']}
407
  **Title:** {result.get('title', 'N/A')}
408
- **Method:** {result.get('method', 'jina_reader')}
409
  **Time:** {result['execution_time']}s
410
- **Content Length:** {content_length:,} characters
411
 
412
  ### Preview:
413
  {preview}
414
-
415
- *Powered by free Jina Reader API*
416
  """
417
  return output, result
418
  else:
419
  error = result.get("error", "Unknown error")
420
- suggestion = result.get("suggestion", "")
421
-
422
- suggestion_text = f"\n\n{suggestion}" if suggestion else ""
423
-
424
- return f"## ❌ Error\n\n{error}{suggestion_text}", result
425
 
426
  # Create Gradio interface
427
  gradio_interface = gr.Interface(
428
  fn=gradio_extract,
429
  inputs=gr.Textbox(
430
  label="Website URL",
431
- placeholder="https://www.sinchew.com.my/",
432
- value="https://www.sinchew.com.my/"
433
  ),
434
  outputs=[
435
  gr.Markdown(label="Result"),
436
  gr.JSON(label="API Response")
437
  ],
438
- title="🌐 Free Content Extractor for n8n",
439
- description="Uses free Jina Reader API to extract content (handles JavaScript websites)",
440
  examples=[
441
- ["https://www.sinchew.com.my/"],
442
  ["https://example.com"],
443
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
444
- ["https://news.ycombinator.com"],
445
- ["https://zhihu.com"]
446
  ]
447
  )
448
 
@@ -458,13 +438,16 @@ app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
458
 
459
  if __name__ == "__main__":
460
  print("\n" + "="*60)
461
- print("🌐 Free Content Extractor Starting")
462
  print("="*60)
463
- print("Primary method: Jina Reader API")
464
- print("Secondary: Direct requests + Googlebot")
 
 
465
  print("="*60)
466
- print("API Endpoint: POST /extract")
467
- print("Direct Jina: GET https://r.jina.ai/your-url")
 
468
  print("="*60 + "\n")
469
 
470
  uvicorn.run(
 
1
  # ==============================================
2
+ # ROBUST CONTENT EXTRACTOR WITH BETTER ERROR HANDLING
3
  # ==============================================
4
 
5
  import gradio as gr
 
13
  import uvicorn
14
 
15
  # ==============================================
16
+ # IMPROVED CONTENT EXTRACTOR
17
  # ==============================================
18
 
19
+ class RobustContentExtractor:
20
+ """Content extractor with better timeout handling"""
21
 
22
  def __init__(self):
23
  self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
 
25
  def extract_content(self, url: str) -> Dict[str, Any]:
26
+ """Extract content with better error handling"""
27
  start_time = time.time()
28
 
29
  print(f"🌐 Extracting: {url}")
 
32
  if not url.startswith(('http://', 'https://')):
33
  url = 'https://' + url
34
 
35
+ # Try multiple strategies with shorter timeouts
36
  strategies = [
37
+ self._try_jina_reader_fast, # Faster timeout
38
+ self._try_direct_request, # Direct attempt
39
+ self._try_simple_request, # Simple headers
40
  ]
41
 
42
+ last_error = None
43
+
44
  for i, strategy in enumerate(strategies):
45
  try:
46
  print(f" Trying strategy {i+1}...")
 
52
  return result
53
 
54
  except Exception as e:
55
+ last_error = str(e)
56
  print(f" Strategy {i+1} failed: {e}")
57
+ time.sleep(0.5)
58
 
59
  # All failed
60
  return {
61
  "success": False,
62
  "url": url,
63
+ "error": f"All extraction methods failed. Last error: {last_error}",
64
  "execution_time": round(time.time() - start_time, 2),
65
+ "suggestion": "Website may block automated access. Try a different URL."
66
  }
67
 
68
+ def _try_jina_reader_fast(self, url: str) -> Dict[str, Any]:
69
+ """Try Jina Reader with shorter timeout"""
70
  try:
71
+ # Try with shorter timeout first
72
+ jina_url = f"https://r.jina.ai/{url}"
73
 
74
+ # Try multiple approaches
75
+ attempts = [
76
+ {"timeout": 15, "headers": {"Accept": "text/plain"}},
77
+ {"timeout": 20, "headers": {"Accept": "application/json"}},
78
+ {"timeout": 25, "headers": {"User-Agent": self.user_agent}},
79
  ]
80
 
81
+ for attempt in attempts:
82
  try:
83
+ response = requests.get(
84
+ jina_url,
85
+ headers=attempt["headers"],
86
+ timeout=attempt["timeout"]
87
+ )
88
 
89
  if response.status_code == 200:
90
  content = response.text
 
97
  content = data["content"]
98
  elif "data" in data:
99
  content = str(data["data"])
 
 
100
  except:
101
  pass # Keep as plain text
102
 
 
109
  return {
110
  "success": True,
111
  "url": url,
112
+ "title": title[:300] if title else "Extracted via Jina",
113
+ "main_content": cleaned[:25000],
114
  "content_length": len(cleaned),
115
+ "content_preview": cleaned[:800] + ("..." if len(cleaned) > 800 else ""),
116
  "source": "jina_reader",
 
117
  "status": response.status_code
118
  }
119
 
120
+ except requests.exceptions.Timeout:
121
+ print(f" Jina timeout after {attempt['timeout']}s, trying next...")
122
+ continue
123
  except Exception as e:
124
+ print(f" Jina attempt failed: {e}")
125
  continue
126
 
127
+ return {"success": False, "error": "Jina Reader timed out"}
128
 
129
  except Exception as e:
130
+ return {"success": False, "error": f"Jina error: {str(e)}"}
131
 
132
  def _try_direct_request(self, url: str) -> Dict[str, Any]:
133
+ """Try direct request with various headers"""
134
  headers_list = [
 
135
  {
136
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
137
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
138
  "Accept-Language": "en-US,en;q=0.9",
 
 
139
  },
 
140
  {
141
  "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
142
+ "Accept": "text/html",
 
143
  },
 
144
  {
145
+ "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
146
  "Accept": "text/html",
147
  },
148
  ]
 
161
  # Extract title
162
  title = self._extract_title_from_html(html_content)
163
 
164
+ if len(cleaned) > 100:
165
  return {
166
  "success": True,
167
  "url": url,
168
+ "title": title[:300] if title else "Direct extraction",
169
+ "main_content": cleaned[:20000],
170
  "content_length": len(cleaned),
171
  "source": "direct_request",
172
  "status": response.status_code
 
178
 
179
  return {"success": False}
180
 
181
+ def _try_simple_request(self, url: str) -> Dict[str, Any]:
182
+ """Simple request with minimal headers"""
183
  try:
184
+ response = requests.get(
185
+ url,
186
+ headers={"User-Agent": "Mozilla/5.0"},
187
+ timeout=8,
188
+ allow_redirects=True
189
+ )
 
190
 
191
  if response.status_code == 200:
192
  html_content = response.text
 
194
  cleaned = self._clean_content(text_content)
195
  title = self._extract_title_from_html(html_content)
196
 
197
+ if len(cleaned) > 50:
198
  return {
199
  "success": True,
200
  "url": url,
201
+ "title": title[:200] if title else "Simple extraction",
202
+ "main_content": cleaned[:15000],
203
  "content_length": len(cleaned),
204
+ "source": "simple_request"
 
205
  }
206
 
207
  return {"success": False}
 
212
  def _extract_from_html(self, html_content: str) -> str:
213
  """Extract text from HTML"""
214
  # Remove scripts and styles
215
+ html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
216
+ html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
217
  html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
218
 
219
+ # Remove unwanted tags
220
  unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
221
  for tag in unwanted_tags:
222
  html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
 
225
  text = re.sub(r'<[^>]+>', ' ', html_content)
226
  text = html.unescape(text)
227
 
228
+ # Clean up
229
  text = re.sub(r'\s+', ' ', text)
230
 
231
  return text
 
241
  return ""
242
 
243
  def _extract_title_from_text(self, text: str) -> str:
244
+ """Try to extract title from text"""
245
  # Look for title patterns
246
  patterns = [
247
  r'Title:\s*(.*?)(?:\n|$)',
248
  r'#\s+(.*?)(?:\n|$)',
249
+ r'<title[^>]*>(.*?)</title>',
250
  ]
251
 
252
  for pattern in patterns:
253
+ match = re.search(pattern, text[:1000], re.IGNORECASE)
254
  if match:
255
  title = match.group(1).strip()
256
  if len(title) > 10 and len(title) < 200:
 
280
  r'sign up for',
281
  r'subscribe to',
282
  r'follow us on',
 
283
  r'share this article',
284
  r'read more',
285
  r'continue reading',
 
 
286
  ]
287
 
288
  for phrase in unwanted:
289
  content = re.sub(phrase, '', content, flags=re.IGNORECASE)
290
 
 
 
 
 
291
  return content.strip()
292
 
293
  # ==============================================
294
  # INITIALIZE
295
  # ==============================================
296
 
297
+ extractor = RobustContentExtractor()
298
 
299
  # ==============================================
300
  # FASTAPI APP
301
  # ==============================================
302
 
303
  fastapi_app = FastAPI(
304
+ title="Robust Content Extractor",
305
+ description="Extracts content with better timeout handling",
306
+ version="2.0"
307
  )
308
 
309
  from fastapi.middleware.cors import CORSMiddleware
 
320
  @fastapi_app.get("/")
321
  async def root():
322
  return {
323
+ "service": "Robust Content Extractor",
324
+ "version": "2.0",
325
+ "description": "Extracts website content with better error handling",
326
  "endpoints": {
327
  "GET /": "This info",
328
+ "GET /health": "Health check (fast)",
329
  "POST /extract": "Extract content"
330
  },
331
+ "timeout_notes": "Jina Reader timeout reduced to 15-25 seconds for faster response"
 
 
 
 
 
332
  }
333
 
334
  @fastapi_app.get("/health")
335
  async def health():
336
+ """Fast health check endpoint for wake-up calls"""
337
+ return {
338
+ "status": "healthy",
339
+ "timestamp": time.time(),
340
+ "service": "content_extractor"
341
+ }
342
 
343
  @fastapi_app.post("/extract")
344
  async def api_extract(request: Request):
 
374
  # ==============================================
375
 
376
  def gradio_extract(url: str):
377
+ """Gradio interface"""
378
  if not url:
379
  return "❌ Please enter a URL", {}
380
 
 
384
  content = result["main_content"]
385
  content_length = result["content_length"]
386
 
387
+ preview = content[:500]
388
+ if len(content) > 500:
 
389
  preview += "..."
390
 
391
  output = f"""
 
393
 
394
  **URL:** {result['url']}
395
  **Title:** {result.get('title', 'N/A')}
396
+ **Method:** {result.get('method', 'extracted')}
397
  **Time:** {result['execution_time']}s
398
+ **Characters:** {content_length:,}
399
 
400
  ### Preview:
401
  {preview}
 
 
402
  """
403
  return output, result
404
  else:
405
  error = result.get("error", "Unknown error")
406
+ return f"## ❌ Error\n\n{error}", result
 
 
 
 
407
 
408
  # Create Gradio interface
409
  gradio_interface = gr.Interface(
410
  fn=gradio_extract,
411
  inputs=gr.Textbox(
412
  label="Website URL",
413
+ placeholder="https://example.com",
414
+ value="https://example.com"
415
  ),
416
  outputs=[
417
  gr.Markdown(label="Result"),
418
  gr.JSON(label="API Response")
419
  ],
420
+ title="🌐 Robust Content Extractor",
421
+ description="Extracts content with better error handling and timeouts",
422
  examples=[
 
423
  ["https://example.com"],
424
  ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
425
+ ["https://news.ycombinator.com"]
 
426
  ]
427
  )
428
 
 
438
 
439
  if __name__ == "__main__":
440
  print("\n" + "="*60)
441
+ print("🌐 Robust Content Extractor Starting")
442
  print("="*60)
443
+ print("Features:")
444
+ print("β€’ Faster timeouts (15-25s for Jina)")
445
+ print("β€’ Multiple fallback strategies")
446
+ print("β€’ Fast health endpoint for wake-up")
447
  print("="*60)
448
+ print("API Endpoints:")
449
+ print("β€’ GET /health - Fast health check (for wake-up)")
450
+ print("β€’ POST /extract - Extract content")
451
  print("="*60 + "\n")
452
 
453
  uvicorn.run(