Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ==============================================
|
| 2 |
-
# FREE
|
| 3 |
# ==============================================
|
| 4 |
|
| 5 |
import gradio as gr
|
|
@@ -8,50 +8,19 @@ import json
|
|
| 8 |
import time
|
| 9 |
import re
|
| 10 |
import html
|
| 11 |
-
import base64
|
| 12 |
-
from io import BytesIO
|
| 13 |
-
from PIL import Image
|
| 14 |
-
import pytesseract # Free OCR
|
| 15 |
from typing import Dict, Any
|
| 16 |
from fastapi import FastAPI, Request
|
| 17 |
import uvicorn
|
| 18 |
|
| 19 |
# ==============================================
|
| 20 |
-
#
|
| 21 |
# ==============================================
|
| 22 |
|
| 23 |
-
class
|
| 24 |
-
"""
|
| 25 |
|
| 26 |
def __init__(self):
|
| 27 |
-
|
| 28 |
-
self.screenshot_apis = [
|
| 29 |
-
{
|
| 30 |
-
"url": lambda u: f"https://s0.wp.com/mshots/v1/{u}?w=1024",
|
| 31 |
-
"name": "wordpress_mshots"
|
| 32 |
-
},
|
| 33 |
-
{
|
| 34 |
-
"url": lambda u: f"https://render-tron.appspot.com/screenshot/{u}?width=1024&height=768",
|
| 35 |
-
"name": "render_tron"
|
| 36 |
-
},
|
| 37 |
-
{
|
| 38 |
-
"url": lambda u: f"https://image.thum.io/get/width/1024/crop/768/noanimate/{u}",
|
| 39 |
-
"name": "thumio"
|
| 40 |
-
},
|
| 41 |
-
]
|
| 42 |
-
|
| 43 |
-
# Free HTML content APIs
|
| 44 |
-
self.html_apis = [
|
| 45 |
-
{
|
| 46 |
-
"url": lambda u: f"https://r.jina.ai/{u}",
|
| 47 |
-
"name": "jina_reader",
|
| 48 |
-
"headers": {"Accept": "application/json"}
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"url": lambda u: f"https://extractorapi.com/api/v1/extractor?apikey=demo&url={u}",
|
| 52 |
-
"name": "extractor_api"
|
| 53 |
-
},
|
| 54 |
-
]
|
| 55 |
|
| 56 |
def extract_content(self, url: str) -> Dict[str, Any]:
|
| 57 |
"""Extract content using free APIs"""
|
|
@@ -63,267 +32,196 @@ class FreeScreenshotScraper:
|
|
| 63 |
if not url.startswith(('http://', 'https://')):
|
| 64 |
url = 'https://' + url
|
| 65 |
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
return jina_result
|
| 73 |
-
|
| 74 |
-
# Strategy 2: Try other HTML APIs
|
| 75 |
-
print(" Trying other HTML APIs...")
|
| 76 |
-
for api in self.html_apis[1:]:
|
| 77 |
-
result = self._try_api(api, url)
|
| 78 |
-
if result["success"]:
|
| 79 |
-
result["execution_time"] = round(time.time() - start_time, 2)
|
| 80 |
-
result["method"] = api["name"]
|
| 81 |
-
return result
|
| 82 |
-
|
| 83 |
-
# Strategy 3: Try direct request with smart headers
|
| 84 |
-
print(" Trying direct request...")
|
| 85 |
-
direct_result = self._try_direct_request(url)
|
| 86 |
-
if direct_result["success"]:
|
| 87 |
-
direct_result["execution_time"] = round(time.time() - start_time, 2)
|
| 88 |
-
direct_result["method"] = "direct_with_fallback"
|
| 89 |
-
return direct_result
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
result
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# All failed
|
| 101 |
return {
|
| 102 |
"success": False,
|
| 103 |
"url": url,
|
| 104 |
-
"error": "
|
| 105 |
"execution_time": round(time.time() - start_time, 2),
|
| 106 |
-
"
|
| 107 |
-
"Try a different URL",
|
| 108 |
-
"Website may block automated access",
|
| 109 |
-
"Try using Jina Reader directly: https://r.jina.ai/your-url"
|
| 110 |
-
]
|
| 111 |
}
|
| 112 |
|
| 113 |
def _try_jina_reader(self, url: str) -> Dict[str, Any]:
|
| 114 |
-
"""Try Jina Reader API (free, no API key
|
| 115 |
try:
|
|
|
|
| 116 |
api_url = f"https://r.jina.ai/{url}"
|
| 117 |
-
headers = {
|
| 118 |
-
"User-Agent": "Mozilla/5.0",
|
| 119 |
-
"Accept": "application/json",
|
| 120 |
-
}
|
| 121 |
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
|
| 125 |
-
# Jina returns clean text directly
|
| 126 |
-
content = response.text
|
| 127 |
-
|
| 128 |
-
# Try to parse as JSON first
|
| 129 |
try:
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
return {"success": False, "error": f"Jina
|
| 156 |
|
| 157 |
except Exception as e:
|
| 158 |
return {"success": False, "error": f"Jina API error: {str(e)}"}
|
| 159 |
|
| 160 |
-
def _try_api(self, api: dict, url: str) -> Dict[str, Any]:
|
| 161 |
-
"""Try other free APIs"""
|
| 162 |
-
try:
|
| 163 |
-
api_url = api["url"](url)
|
| 164 |
-
headers = api.get("headers", {"User-Agent": "Mozilla/5.0"})
|
| 165 |
-
|
| 166 |
-
response = requests.get(api_url, headers=headers, timeout=15)
|
| 167 |
-
|
| 168 |
-
if response.status_code == 200:
|
| 169 |
-
content = response.text
|
| 170 |
-
|
| 171 |
-
# Try to parse JSON
|
| 172 |
-
try:
|
| 173 |
-
data = json.loads(content)
|
| 174 |
-
# Extract content from common API formats
|
| 175 |
-
if "text" in data:
|
| 176 |
-
content = data["text"]
|
| 177 |
-
elif "content" in data:
|
| 178 |
-
content = data["content"]
|
| 179 |
-
elif "article" in data:
|
| 180 |
-
content = data["article"]
|
| 181 |
-
except:
|
| 182 |
-
pass
|
| 183 |
-
|
| 184 |
-
cleaned = self._clean_content(content)
|
| 185 |
-
|
| 186 |
-
return {
|
| 187 |
-
"success": True,
|
| 188 |
-
"url": url,
|
| 189 |
-
"main_content": cleaned[:20000],
|
| 190 |
-
"content_length": len(cleaned)
|
| 191 |
-
}
|
| 192 |
-
|
| 193 |
-
return {"success": False}
|
| 194 |
-
|
| 195 |
-
except:
|
| 196 |
-
return {"success": False}
|
| 197 |
-
|
| 198 |
def _try_direct_request(self, url: str) -> Dict[str, Any]:
|
| 199 |
-
"""Try direct request with
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
]
|
| 205 |
|
| 206 |
-
for
|
| 207 |
try:
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
continue
|
| 213 |
|
| 214 |
return {"success": False}
|
| 215 |
|
| 216 |
-
def
|
| 217 |
-
"""Direct request with browser-like headers"""
|
| 218 |
-
headers = {
|
| 219 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
| 220 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
| 221 |
-
"Accept-Language": "en-US,en;q=0.5",
|
| 222 |
-
"Accept-Encoding": "gzip, deflate",
|
| 223 |
-
"Connection": "keep-alive",
|
| 224 |
-
"Upgrade-Insecure-Requests": "1",
|
| 225 |
-
"Cache-Control": "max-age=0",
|
| 226 |
-
}
|
| 227 |
-
|
| 228 |
-
response = requests.get(url, headers=headers, timeout=10)
|
| 229 |
-
|
| 230 |
-
if response.status_code == 200:
|
| 231 |
-
content = self._extract_from_html(response.text)
|
| 232 |
-
cleaned = self._clean_content(content)
|
| 233 |
-
|
| 234 |
-
return {
|
| 235 |
-
"success": True,
|
| 236 |
-
"content": cleaned
|
| 237 |
-
}
|
| 238 |
-
|
| 239 |
-
return {"success": False}
|
| 240 |
-
|
| 241 |
-
def _direct_request_as_googlebot(self, url: str) -> Dict[str, Any]:
|
| 242 |
"""Pretend to be Googlebot"""
|
| 243 |
-
headers = {
|
| 244 |
-
"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
|
| 245 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
| 246 |
-
}
|
| 247 |
-
|
| 248 |
-
response = requests.get(url, headers=headers, timeout=10)
|
| 249 |
-
|
| 250 |
-
if response.status_code == 200:
|
| 251 |
-
content = self._extract_from_html(response.text)
|
| 252 |
-
cleaned = self._clean_content(content)
|
| 253 |
-
|
| 254 |
-
return {
|
| 255 |
-
"success": True,
|
| 256 |
-
"content": cleaned
|
| 257 |
-
}
|
| 258 |
-
|
| 259 |
-
return {"success": False}
|
| 260 |
-
|
| 261 |
-
def _direct_request_with_referer(self, url: str) -> Dict[str, Any]:
|
| 262 |
-
"""Request with referer"""
|
| 263 |
-
headers = {
|
| 264 |
-
"User-Agent": "Mozilla/5.0",
|
| 265 |
-
"Referer": "https://www.google.com/",
|
| 266 |
-
"Accept": "text/html",
|
| 267 |
-
}
|
| 268 |
-
|
| 269 |
-
response = requests.get(url, headers=headers, timeout=10)
|
| 270 |
-
|
| 271 |
-
if response.status_code == 200:
|
| 272 |
-
content = self._extract_from_html(response.text)
|
| 273 |
-
cleaned = self._clean_content(content)
|
| 274 |
-
|
| 275 |
-
return {
|
| 276 |
-
"success": True,
|
| 277 |
-
"content": cleaned
|
| 278 |
-
}
|
| 279 |
-
|
| 280 |
-
return {"success": False}
|
| 281 |
-
|
| 282 |
-
def _try_screenshot_api(self, api: dict, url: str) -> Dict[str, Any]:
|
| 283 |
-
"""Try screenshot API"""
|
| 284 |
try:
|
| 285 |
-
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
-
response = requests.get(
|
| 289 |
|
| 290 |
-
if response.status_code == 200
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
"note": "Content extracted from screenshot via OCR"
|
| 307 |
-
}
|
| 308 |
-
except:
|
| 309 |
-
return {"success": False, "error": "OCR not available"}
|
| 310 |
-
|
| 311 |
-
except:
|
| 312 |
-
return {"success": False}
|
| 313 |
|
| 314 |
return {"success": False}
|
| 315 |
|
| 316 |
-
except:
|
| 317 |
-
return {"success": False}
|
| 318 |
|
| 319 |
def _extract_from_html(self, html_content: str) -> str:
|
| 320 |
"""Extract text from HTML"""
|
| 321 |
# Remove scripts and styles
|
| 322 |
-
html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL)
|
| 323 |
-
html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL)
|
|
|
|
| 324 |
|
| 325 |
-
# Remove unwanted
|
| 326 |
-
unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu']
|
| 327 |
for tag in unwanted_tags:
|
| 328 |
html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 329 |
|
|
@@ -331,10 +229,41 @@ class FreeScreenshotScraper:
|
|
| 331 |
text = re.sub(r'<[^>]+>', ' ', html_content)
|
| 332 |
text = html.unescape(text)
|
| 333 |
|
|
|
|
|
|
|
|
|
|
| 334 |
return text
|
| 335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
def _clean_content(self, content: str) -> str:
|
| 337 |
-
"""Clean content"""
|
| 338 |
if not content:
|
| 339 |
return ""
|
| 340 |
|
|
@@ -347,13 +276,36 @@ class FreeScreenshotScraper:
|
|
| 347 |
# Remove excessive line breaks
|
| 348 |
content = re.sub(r'\n{3,}', '\n\n', content)
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
return content.strip()
|
| 351 |
|
| 352 |
# ==============================================
|
| 353 |
# INITIALIZE
|
| 354 |
# ==============================================
|
| 355 |
|
| 356 |
-
|
| 357 |
|
| 358 |
# ==============================================
|
| 359 |
# FASTAPI APP
|
|
@@ -361,7 +313,7 @@ scraper = FreeScreenshotScraper()
|
|
| 361 |
|
| 362 |
fastapi_app = FastAPI(
|
| 363 |
title="Free Content Extractor",
|
| 364 |
-
description="
|
| 365 |
version="1.0"
|
| 366 |
)
|
| 367 |
|
|
@@ -381,18 +333,18 @@ async def root():
|
|
| 381 |
return {
|
| 382 |
"service": "Free Content Extractor",
|
| 383 |
"version": "1.0",
|
| 384 |
-
"description": "
|
| 385 |
"endpoints": {
|
| 386 |
"GET /": "This info",
|
| 387 |
"GET /health": "Health check",
|
| 388 |
-
"POST /extract": "Extract content
|
| 389 |
},
|
| 390 |
-
"
|
| 391 |
-
"
|
| 392 |
-
"
|
| 393 |
-
"
|
| 394 |
-
|
| 395 |
-
|
| 396 |
}
|
| 397 |
|
| 398 |
@fastapi_app.get("/health")
|
|
@@ -412,8 +364,8 @@ async def api_extract(request: Request):
|
|
| 412 |
content={"success": False, "error": "URL is required"}
|
| 413 |
)
|
| 414 |
|
| 415 |
-
print(f"π¨ Request: {url}")
|
| 416 |
-
result =
|
| 417 |
|
| 418 |
return result
|
| 419 |
|
|
@@ -433,16 +385,17 @@ async def api_extract(request: Request):
|
|
| 433 |
# ==============================================
|
| 434 |
|
| 435 |
def gradio_extract(url: str):
|
| 436 |
-
"""Gradio interface"""
|
| 437 |
if not url:
|
| 438 |
return "β Please enter a URL", {}
|
| 439 |
|
| 440 |
-
result =
|
| 441 |
|
| 442 |
if result["success"]:
|
| 443 |
content = result["main_content"]
|
| 444 |
content_length = result["content_length"]
|
| 445 |
|
|
|
|
| 446 |
preview = content[:800]
|
| 447 |
if len(content) > 800:
|
| 448 |
preview += "..."
|
|
@@ -451,23 +404,22 @@ def gradio_extract(url: str):
|
|
| 451 |
## β
Success!
|
| 452 |
|
| 453 |
**URL:** {result['url']}
|
| 454 |
-
**
|
|
|
|
| 455 |
**Time:** {result['execution_time']}s
|
| 456 |
**Content Length:** {content_length:,} characters
|
| 457 |
|
| 458 |
### Preview:
|
| 459 |
{preview}
|
| 460 |
|
| 461 |
-
*
|
| 462 |
"""
|
| 463 |
return output, result
|
| 464 |
else:
|
| 465 |
error = result.get("error", "Unknown error")
|
| 466 |
-
|
| 467 |
|
| 468 |
-
suggestion_text = ""
|
| 469 |
-
if suggestions:
|
| 470 |
-
suggestion_text = "\n\n**Suggestions:**\n" + "\n".join([f"β’ {s}" for s in suggestions])
|
| 471 |
|
| 472 |
return f"## β Error\n\n{error}{suggestion_text}", result
|
| 473 |
|
|
@@ -484,12 +436,13 @@ gradio_interface = gr.Interface(
|
|
| 484 |
gr.JSON(label="API Response")
|
| 485 |
],
|
| 486 |
title="π Free Content Extractor for n8n",
|
| 487 |
-
description="Uses free
|
| 488 |
examples=[
|
| 489 |
["https://www.sinchew.com.my/"],
|
| 490 |
["https://example.com"],
|
| 491 |
["https://en.wikipedia.org/wiki/Artificial_intelligence"],
|
| 492 |
-
["https://news.ycombinator.com"]
|
|
|
|
| 493 |
]
|
| 494 |
)
|
| 495 |
|
|
@@ -507,12 +460,11 @@ if __name__ == "__main__":
|
|
| 507 |
print("\n" + "="*60)
|
| 508 |
print("π Free Content Extractor Starting")
|
| 509 |
print("="*60)
|
| 510 |
-
print("
|
| 511 |
-
print("
|
| 512 |
-
print("β’ WordPress mShots")
|
| 513 |
-
print("β’ Render-Tron")
|
| 514 |
print("="*60)
|
| 515 |
print("API Endpoint: POST /extract")
|
|
|
|
| 516 |
print("="*60 + "\n")
|
| 517 |
|
| 518 |
uvicorn.run(
|
|
|
|
| 1 |
# ==============================================
|
| 2 |
+
# SIMPLE FREE CONTENT EXTRACTOR FOR N8N
|
| 3 |
# ==============================================
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 8 |
import time
|
| 9 |
import re
|
| 10 |
import html
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from typing import Dict, Any
|
| 12 |
from fastapi import FastAPI, Request
|
| 13 |
import uvicorn
|
| 14 |
|
| 15 |
# ==============================================
|
| 16 |
+
# SIMPLE CONTENT EXTRACTOR
|
| 17 |
# ==============================================
|
| 18 |
|
| 19 |
+
class SimpleContentExtractor:
|
| 20 |
+
"""Simple extractor using Jina Reader API + fallbacks"""
|
| 21 |
|
| 22 |
def __init__(self):
|
| 23 |
+
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def extract_content(self, url: str) -> Dict[str, Any]:
|
| 26 |
"""Extract content using free APIs"""
|
|
|
|
| 32 |
if not url.startswith(('http://', 'https://')):
|
| 33 |
url = 'https://' + url
|
| 34 |
|
| 35 |
+
# Try multiple strategies
|
| 36 |
+
strategies = [
|
| 37 |
+
self._try_jina_reader,
|
| 38 |
+
self._try_direct_request,
|
| 39 |
+
self._try_googlebot,
|
| 40 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
for i, strategy in enumerate(strategies):
|
| 43 |
+
try:
|
| 44 |
+
print(f" Trying strategy {i+1}...")
|
| 45 |
+
result = strategy(url)
|
| 46 |
+
|
| 47 |
+
if result.get("success"):
|
| 48 |
+
result["execution_time"] = round(time.time() - start_time, 2)
|
| 49 |
+
result["method"] = f"strategy_{i+1}"
|
| 50 |
+
return result
|
| 51 |
+
|
| 52 |
+
except Exception as e:
|
| 53 |
+
print(f" Strategy {i+1} failed: {e}")
|
| 54 |
+
time.sleep(0.3) # Small delay
|
| 55 |
|
| 56 |
# All failed
|
| 57 |
return {
|
| 58 |
"success": False,
|
| 59 |
"url": url,
|
| 60 |
+
"error": "Failed to extract content",
|
| 61 |
"execution_time": round(time.time() - start_time, 2),
|
| 62 |
+
"suggestion": "Try using Jina Reader directly: https://r.jina.ai/your-url"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
def _try_jina_reader(self, url: str) -> Dict[str, Any]:
    """Fetch *url* through the Jina Reader proxy (``https://r.jina.ai/``).

    Three request variants are attempted in order: plain text, JSON via the
    ``Accept`` header, and JSON via the ``?format=json`` query string. The
    first HTTP 200 response is parsed, cleaned and returned.

    Args:
        url: Absolute URL of the page to extract.

    Returns:
        On success, a dict with ``success=True`` plus ``url``, ``title``,
        ``main_content`` (capped at 35000 characters), ``content_length``,
        ``content_preview``, ``source``, ``note`` and ``status``.
        On failure, ``{"success": False, "error": <reason>}`` where the
        reason describes the most recent failed attempt. This method does
        not raise: every exception is converted into a failure dict.
    """
    api_url = f"https://r.jina.ai/{url}"

    # (request URL, Accept header) for each variant, in priority order.
    attempts = [
        (api_url, "text/plain"),
        (api_url, "application/json"),
        (f"https://r.jina.ai/{url}?format=json", "text/plain"),
    ]

    # Tracked explicitly so the final error never depends on a ``response``
    # variable that may not exist when every request raised.
    last_error = "Jina API error: no request was attempted"

    for target, accept in attempts:
        try:
            # Always send the User-Agent; previously the variants that set
            # their own Accept header dropped it.
            response = requests.get(
                target,
                headers={"Accept": accept, "User-Agent": self.user_agent},
                timeout=30,
            )

            if response.status_code != 200:
                last_error = f"Jina returned status: {response.status_code}"
                continue

            content = response.text
            title = ""

            # The body may be JSON or plain text/markdown; only a JSON
            # decoding failure means "keep it as plain text".
            try:
                data = json.loads(content)
            except ValueError:
                data = None

            if isinstance(data, dict):
                payload = data.get("data")
                if "content" in data:
                    content = data["content"]
                elif isinstance(payload, dict):
                    # NOTE(review): assumes the JSON envelope nests the page
                    # under "data" with "content"/"title" keys - confirm
                    # against the Jina Reader API documentation.
                    content = (
                        payload.get("content")
                        or payload.get("text")
                        or str(payload)
                    )
                    title = payload.get("title") or ""
                elif "data" in data:
                    content = str(payload)
                elif "text" in data:
                    content = data["text"]

            # JSON values are not guaranteed to be strings; the helpers
            # below run regexes and slicing on them.
            if not isinstance(content, str):
                content = str(content)
            if not isinstance(title, str):
                title = str(title)

            title = title.strip() or self._extract_title_from_text(content)
            cleaned = self._clean_content(content)

            return {
                "success": True,
                "url": url,
                "title": title[:300] if title else "Extracted via Jina Reader",
                "main_content": cleaned[:35000],
                "content_length": len(cleaned),
                "content_preview": cleaned[:1000] + ("..." if len(cleaned) > 1000 else ""),
                "source": "jina_reader",
                "note": "Content extracted via free Jina Reader API (handles JavaScript)",
                "status": response.status_code,
            }

        except Exception as e:
            # Best-effort: report the failure and move on to the next variant.
            print(f" Jina format failed: {e}")
            last_error = f"Jina API error: {str(e)}"
            continue

    return {"success": False, "error": last_error}
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
def _try_direct_request(self, url: str) -> Dict[str, Any]:
    """Fetch *url* directly, cycling through several header profiles.

    Three header sets are tried in order: desktop browser, mobile browser,
    and a minimal set. The first HTTP 200 response whose cleaned text is
    longer than 100 characters is returned as a success dict. A request
    that raises is printed and the next profile is tried.

    Returns:
        A success dict with ``url``, ``title``, ``main_content`` (capped at
        30000 characters), ``content_length``, ``source`` and ``status``;
        or ``{"success": False}`` when no profile yields usable content.
    """
    html_accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

    desktop_profile = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": html_accept,
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    mobile_profile = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15",
        "Accept": html_accept,
        "Accept-Language": "en-US,en;q=0.9",
    }
    minimal_profile = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html",
    }

    for profile in (desktop_profile, mobile_profile, minimal_profile):
        try:
            reply = requests.get(url, headers=profile, timeout=10, allow_redirects=True)
            if reply.status_code != 200:
                continue

            page = reply.text
            body_text = self._clean_content(self._extract_from_html(page))
            page_title = self._extract_title_from_html(page)

            # Too little text means the page gave us nothing meaningful.
            if len(body_text) <= 100:
                continue

            return {
                "success": True,
                "url": url,
                "title": page_title[:300] if page_title else "Extracted via direct request",
                "main_content": body_text[:30000],
                "content_length": len(body_text),
                "source": "direct_request",
                "status": reply.status_code,
            }
        except Exception as e:
            print(f" Direct request failed: {e}")
            continue

    return {"success": False}
|
| 182 |
|
| 183 |
+
def _try_googlebot(self, url: str) -> Dict[str, Any]:
    """Fetch *url* once while sending Googlebot-style request headers.

    Returns:
        A success dict with ``url``, ``title``, ``main_content`` (capped at
        30000 characters), ``content_length``, ``source`` and ``status``
        when the response is HTTP 200 and the cleaned text is longer than
        100 characters; ``{"success": False}`` otherwise. Any exception is
        reported as ``{"success": False, "error": <message>}``.
    """
    crawler_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "From": "googlebot(at)googlebot.com",
    }

    try:
        reply = requests.get(url, headers=crawler_headers, timeout=10, allow_redirects=True)
        if reply.status_code != 200:
            return {"success": False}

        page = reply.text
        body_text = self._clean_content(self._extract_from_html(page))
        page_title = self._extract_title_from_html(page)

        # Same usefulness threshold as the direct-request strategy.
        if len(body_text) <= 100:
            return {"success": False}

        return {
            "success": True,
            "url": url,
            "title": page_title[:300] if page_title else "Extracted as Googlebot",
            "main_content": body_text[:30000],
            "content_length": len(body_text),
            "source": "googlebot",
            "status": reply.status_code,
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
|
| 215 |
|
| 216 |
def _extract_from_html(self, html_content: str) -> str:
|
| 217 |
"""Extract text from HTML"""
|
| 218 |
# Remove scripts and styles
|
| 219 |
+
html_content = re.sub(r'<script[^>]*>.*?</script>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 220 |
+
html_content = re.sub(r'<style[^>]*>.*?</style>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 221 |
+
html_content = re.sub(r'<!--.*?-->', ' ', html_content, flags=re.DOTALL)
|
| 222 |
|
| 223 |
+
# Remove unwanted sections
|
| 224 |
+
unwanted_tags = ['nav', 'header', 'footer', 'aside', 'menu', 'form', 'iframe']
|
| 225 |
for tag in unwanted_tags:
|
| 226 |
html_content = re.sub(f'<{tag}[^>]*>.*?</{tag}>', ' ', html_content, flags=re.DOTALL | re.IGNORECASE)
|
| 227 |
|
|
|
|
| 229 |
text = re.sub(r'<[^>]+>', ' ', html_content)
|
| 230 |
text = html.unescape(text)
|
| 231 |
|
| 232 |
+
# Remove excessive whitespace
|
| 233 |
+
text = re.sub(r'\s+', ' ', text)
|
| 234 |
+
|
| 235 |
return text
|
| 236 |
|
| 237 |
+
def _extract_title_from_html(self, html_content: str) -> str:
|
| 238 |
+
"""Extract title from HTML"""
|
| 239 |
+
title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE)
|
| 240 |
+
if title_match:
|
| 241 |
+
title = title_match.group(1)
|
| 242 |
+
title = re.sub(r'\s+', ' ', title).strip()
|
| 243 |
+
title = html.unescape(title)
|
| 244 |
+
return title
|
| 245 |
+
return ""
|
| 246 |
+
|
| 247 |
+
def _extract_title_from_text(self, text: str) -> str:
|
| 248 |
+
"""Try to extract title from plain text"""
|
| 249 |
+
# Look for title patterns
|
| 250 |
+
patterns = [
|
| 251 |
+
r'Title:\s*(.*?)(?:\n|$)',
|
| 252 |
+
r'#\s+(.*?)(?:\n|$)',
|
| 253 |
+
r'^(.*?)(?:\n|$)',
|
| 254 |
+
]
|
| 255 |
+
|
| 256 |
+
for pattern in patterns:
|
| 257 |
+
match = re.search(pattern, text[:500], re.IGNORECASE)
|
| 258 |
+
if match:
|
| 259 |
+
title = match.group(1).strip()
|
| 260 |
+
if len(title) > 10 and len(title) < 200:
|
| 261 |
+
return title
|
| 262 |
+
|
| 263 |
+
return ""
|
| 264 |
+
|
| 265 |
def _clean_content(self, content: str) -> str:
|
| 266 |
+
"""Clean and normalize content"""
|
| 267 |
if not content:
|
| 268 |
return ""
|
| 269 |
|
|
|
|
| 276 |
# Remove excessive line breaks
|
| 277 |
content = re.sub(r'\n{3,}', '\n\n', content)
|
| 278 |
|
| 279 |
+
# Remove common unwanted phrases
|
| 280 |
+
unwanted = [
|
| 281 |
+
r'adsbygoogle',
|
| 282 |
+
r'advertisement',
|
| 283 |
+
r'sponsored content',
|
| 284 |
+
r'sign up for',
|
| 285 |
+
r'subscribe to',
|
| 286 |
+
r'follow us on',
|
| 287 |
+
r'like us on facebook',
|
| 288 |
+
r'share this article',
|
| 289 |
+
r'read more',
|
| 290 |
+
r'continue reading',
|
| 291 |
+
r'click here',
|
| 292 |
+
r'learn more',
|
| 293 |
+
]
|
| 294 |
+
|
| 295 |
+
for phrase in unwanted:
|
| 296 |
+
content = re.sub(phrase, '', content, flags=re.IGNORECASE)
|
| 297 |
+
|
| 298 |
+
# Remove email addresses and URLs
|
| 299 |
+
content = re.sub(r'\S+@\S+\.\S+', '', content)
|
| 300 |
+
content = re.sub(r'https?://\S+', '', content)
|
| 301 |
+
|
| 302 |
return content.strip()
|
| 303 |
|
| 304 |
# ==============================================
|
| 305 |
# INITIALIZE
|
| 306 |
# ==============================================
|
| 307 |
|
| 308 |
+
extractor = SimpleContentExtractor()
|
| 309 |
|
| 310 |
# ==============================================
|
| 311 |
# FASTAPI APP
|
|
|
|
| 313 |
|
| 314 |
fastapi_app = FastAPI(
|
| 315 |
title="Free Content Extractor",
|
| 316 |
+
description="Extracts content using free Jina Reader API and fallbacks",
|
| 317 |
version="1.0"
|
| 318 |
)
|
| 319 |
|
|
|
|
| 333 |
return {
|
| 334 |
"service": "Free Content Extractor",
|
| 335 |
"version": "1.0",
|
| 336 |
+
"description": "Extracts website content using free Jina Reader API (handles JavaScript)",
|
| 337 |
"endpoints": {
|
| 338 |
"GET /": "This info",
|
| 339 |
"GET /health": "Health check",
|
| 340 |
+
"POST /extract": "Extract content"
|
| 341 |
},
|
| 342 |
+
"usage_n8n": {
|
| 343 |
+
"method": "POST",
|
| 344 |
+
"url": "https://your-space.hf.space/extract",
|
| 345 |
+
"body": {"url": "https://example.com"}
|
| 346 |
+
},
|
| 347 |
+
"alternative": "Use Jina Reader directly: GET https://r.jina.ai/your-url"
|
| 348 |
}
|
| 349 |
|
| 350 |
@fastapi_app.get("/health")
|
|
|
|
| 364 |
content={"success": False, "error": "URL is required"}
|
| 365 |
)
|
| 366 |
|
| 367 |
+
print(f"π¨ API Request: {url}")
|
| 368 |
+
result = extractor.extract_content(url)
|
| 369 |
|
| 370 |
return result
|
| 371 |
|
|
|
|
| 385 |
# ==============================================
|
| 386 |
|
| 387 |
def gradio_extract(url: str):
|
| 388 |
+
"""Gradio interface function"""
|
| 389 |
if not url:
|
| 390 |
return "β Please enter a URL", {}
|
| 391 |
|
| 392 |
+
result = extractor.extract_content(url)
|
| 393 |
|
| 394 |
if result["success"]:
|
| 395 |
content = result["main_content"]
|
| 396 |
content_length = result["content_length"]
|
| 397 |
|
| 398 |
+
# Create preview
|
| 399 |
preview = content[:800]
|
| 400 |
if len(content) > 800:
|
| 401 |
preview += "..."
|
|
|
|
| 404 |
## β
Success!
|
| 405 |
|
| 406 |
**URL:** {result['url']}
|
| 407 |
+
**Title:** {result.get('title', 'N/A')}
|
| 408 |
+
**Method:** {result.get('method', 'jina_reader')}
|
| 409 |
**Time:** {result['execution_time']}s
|
| 410 |
**Content Length:** {content_length:,} characters
|
| 411 |
|
| 412 |
### Preview:
|
| 413 |
{preview}
|
| 414 |
|
| 415 |
+
*Powered by free Jina Reader API*
|
| 416 |
"""
|
| 417 |
return output, result
|
| 418 |
else:
|
| 419 |
error = result.get("error", "Unknown error")
|
| 420 |
+
suggestion = result.get("suggestion", "")
|
| 421 |
|
| 422 |
+
suggestion_text = f"\n\n{suggestion}" if suggestion else ""
|
|
|
|
|
|
|
| 423 |
|
| 424 |
return f"## β Error\n\n{error}{suggestion_text}", result
|
| 425 |
|
|
|
|
| 436 |
gr.JSON(label="API Response")
|
| 437 |
],
|
| 438 |
title="π Free Content Extractor for n8n",
|
| 439 |
+
description="Uses free Jina Reader API to extract content (handles JavaScript websites)",
|
| 440 |
examples=[
|
| 441 |
["https://www.sinchew.com.my/"],
|
| 442 |
["https://example.com"],
|
| 443 |
["https://en.wikipedia.org/wiki/Artificial_intelligence"],
|
| 444 |
+
["https://news.ycombinator.com"],
|
| 445 |
+
["https://zhihu.com"]
|
| 446 |
]
|
| 447 |
)
|
| 448 |
|
|
|
|
| 460 |
print("\n" + "="*60)
|
| 461 |
print("π Free Content Extractor Starting")
|
| 462 |
print("="*60)
|
| 463 |
+
print("Primary method: Jina Reader API")
|
| 464 |
+
print("Secondary: Direct requests + Googlebot")
|
|
|
|
|
|
|
| 465 |
print("="*60)
|
| 466 |
print("API Endpoint: POST /extract")
|
| 467 |
+
print("Direct Jina: GET https://r.jina.ai/your-url")
|
| 468 |
print("="*60 + "\n")
|
| 469 |
|
| 470 |
uvicorn.run(
|