"""
Jina AI Reader Adapter
Extracts clean, full article content from URLs using Jina AI Reader API.
Removes ads, navigation, boilerplate, and returns markdown-formatted text.
Features:
- Async execution with timeout
- Parallel extraction for multiple URLs
- Graceful fallback to snippets on failure
- No API key required (free tier)
- 71x more content than snippets (14,000 vs 200 chars)
Integration:
- Enhances DuckDuckGo live search results
- Replaces 200-char snippets with full articles
- Improves LLM context quality dramatically
"""
import asyncio
import logging
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import httpx
logger = logging.getLogger(__name__)
class JinaReaderAdapter:
    """
    Adapter for the Jina AI Reader API.

    Extracts full, markdown-formatted article content from URLs to enhance
    RAG context quality. All public methods report failures via their return
    values (``success``/``error`` keys) rather than raising, so callers can
    always fall back to snippets.
    """

    def __init__(
        self,
        timeout: float = 10.0,
        max_concurrent: int = 5,
        base_url: str = "https://r.jina.ai"
    ):
        """
        Initialize Jina Reader adapter.

        Args:
            timeout: Maximum time to wait per article (seconds)
            max_concurrent: Maximum parallel extractions
            base_url: Jina Reader API base URL
        """
        self.base_url = base_url
        self.timeout = timeout
        self.max_concurrent = max_concurrent
        # Created lazily in _ensure_client() so the adapter can be
        # constructed outside a running event loop.
        self.client: Optional[httpx.AsyncClient] = None
        logger.info(
            f"Jina Reader initialized: timeout={timeout}s, "
            f"max_concurrent={max_concurrent}"
        )

    async def _ensure_client(self):
        """Lazy initialization of HTTP client with optional API key auth."""
        if self.client is not None:
            return
        headers = {
            "User-Agent": "ARKI-AI-RAG/2.4 (Ethiopia News Assistant)",
            "Accept": "text/plain, text/markdown",
        }
        # Add Jina API key if available (required for most sites).
        try:
            from src.core.config import settings
            jina_key = getattr(settings, "JINA_API_KEY", "")
            if jina_key and jina_key not in ("", "your-jina-api-key-here"):
                headers["Authorization"] = f"Bearer {jina_key}"
                logger.info("Jina Reader: using API key authentication")
            else:
                logger.warning(
                    "Jina Reader: no API key set - most sites will return 401. "
                    "Get free key at https://jina.ai"
                )
        except Exception:
            # Config module unavailable (e.g. standalone use); proceed
            # unauthenticated rather than failing construction.
            pass
        self.client = httpx.AsyncClient(
            timeout=self.timeout,
            follow_redirects=True,
            headers=headers
        )

    async def extract_article(self, url: str) -> Dict[str, Any]:
        """
        Extract clean article content from a single URL.

        Args:
            url: Article URL to extract

        Returns:
            Dict with:
                - success: bool
                - url: str
                - title: str (if success)
                - content: str (if success)
                - length: int (if success)
                - extracted_at: str, ISO-8601 UTC (if success)
                - error: str (if failure)
        """
        await self._ensure_client()
        logger.debug(f"Extracting article: {url[:80]}")
        try:
            # Jina Reader API: https://r.jina.ai/{url}
            jina_url = f"{self.base_url}/{url}"
            response = await self.client.get(jina_url)
            if response.status_code == 200:
                lines = response.text.split('\n')
                # Title is the first line, usually "# ..." or "Title: ...".
                title = ""
                if lines:
                    title = (
                        lines[0]
                        .replace('# ', '')
                        .replace('Title: ', '')
                        .strip()
                    )
                # Body starts at the first non-empty line after the title.
                body_lines = []
                for i, line in enumerate(lines):
                    if i == 0:  # Skip title line
                        continue
                    if line.strip():  # Skip empty lines at start
                        body_lines = lines[i:]
                        break
                body = '\n'.join(body_lines).strip()
                # Jina extracts the full page markdown including nav/footer;
                # cut at the first sign of boilerplate and cap the length.
                body = self._strip_boilerplate(body)
                # Reject near-empty extractions (e.g. paywall stubs).
                if not body or len(body) < 100:
                    logger.warning(
                        f"Jina returned insufficient content for {url[:50]} "
                        f"({len(body)} chars)"
                    )
                    return {
                        "success": False,
                        "url": url,
                        "error": "Insufficient content extracted"
                    }
                logger.info(
                    f"Jina extracted {len(body):,} chars from {url[:50]}"
                )
                return {
                    "success": True,
                    "url": url,
                    "title": title or "Untitled",
                    "content": body,
                    "length": len(body),
                    # Timezone-aware UTC; datetime.utcnow() is deprecated.
                    "extracted_at": datetime.now(timezone.utc).isoformat()
                }
            elif response.status_code == 451:
                # 451 Unavailable For Legal Reasons (geo-blocking)
                logger.debug(f"Jina: 451 geo-blocked for {url[:50]}")
                return {
                    "success": False,
                    "url": url,
                    "error": "Content geo-blocked"
                }
            elif response.status_code == 404:
                logger.debug(f"Jina: 404 not found for {url[:50]}")
                return {
                    "success": False,
                    "url": url,
                    "error": "Article not found"
                }
            else:
                logger.debug(
                    f"Jina returned status {response.status_code} for {url[:50]}"
                )
                return {
                    "success": False,
                    "url": url,
                    "error": f"HTTP {response.status_code}"
                }
        except (httpx.TimeoutException, asyncio.TimeoutError):
            # httpx raises its own TimeoutException on client timeouts;
            # asyncio.TimeoutError is kept for callers using wait_for().
            logger.debug(f"Jina timeout ({self.timeout}s) for {url[:50]}")
            return {
                "success": False,
                "url": url,
                "error": "Extraction timeout"
            }
        except Exception as e:
            logger.debug(f"Jina extraction error for {url[:50]}: {e}")
            return {
                "success": False,
                "url": url,
                "error": str(e)
            }

    async def extract_multiple(
        self,
        urls: List[str],
        max_articles: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract content from multiple URLs in parallel.

        Args:
            urls: List of article URLs
            max_articles: Maximum articles to extract (default: max_concurrent)

        Returns:
            List of extraction results (same order as input URLs)
        """
        if not urls:
            return []
        # Limit number of articles.
        max_articles = max_articles or self.max_concurrent
        urls_to_extract = urls[:max_articles]
        logger.info(
            f"Extracting {len(urls_to_extract)} articles in parallel "
            f"(max_concurrent={self.max_concurrent})"
        )
        # Semaphore caps the number of in-flight extractions.
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def bounded_extract(url: str) -> Dict[str, Any]:
            async with semaphore:
                return await self.extract_article(url)

        results = await asyncio.gather(
            *(bounded_extract(url) for url in urls_to_extract),
            return_exceptions=True
        )
        # Convert any raised exceptions into failure dicts so the result
        # list is uniform and positionally aligned with the input URLs.
        processed_results: List[Dict[str, Any]] = []
        for url, result in zip(urls_to_extract, results):
            if isinstance(result, Exception):
                logger.error(f"Extraction failed for {url[:50]}: {result}")
                processed_results.append({
                    "success": False,
                    "url": url,
                    "error": str(result)
                })
            else:
                processed_results.append(result)
        # Log summary.
        successful = sum(1 for r in processed_results if r.get("success"))
        total_chars = sum(
            r.get("length", 0) for r in processed_results if r.get("success")
        )
        logger.info(
            f"Jina extraction complete: {successful}/{len(processed_results)} successful, "
            f"{total_chars:,} total chars"
        )
        return processed_results

    async def enhance_search_results(
        self,
        search_results: List[Dict[str, Any]],
        fallback_to_snippet: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Enhance search results by replacing snippets with full articles.

        Args:
            search_results: List of search results with URLs and snippets
            fallback_to_snippet: Keep original snippet if extraction fails

        Returns:
            Enhanced search results with full article content
        """
        if not search_results:
            return []
        # Track WHICH results have URLs so extractions map back to the
        # correct result even when some results lack a "url" field
        # (a plain positional zip would misalign them).
        url_indices = [i for i, r in enumerate(search_results) if r.get("url")]
        if not url_indices:
            logger.warning("No URLs found in search results")
            return search_results
        urls = [search_results[i]["url"] for i in url_indices]
        # Extract full articles (may be capped at max_concurrent).
        extractions = await self.extract_multiple(urls)
        extraction_by_index = dict(zip(url_indices, extractions))
        # Merge extractions back into search results.
        enhanced_results = []
        for i, result in enumerate(search_results):
            enhanced = dict(result)  # Copy; never mutate the caller's dicts
            extraction = extraction_by_index.get(i)
            if extraction is not None:
                if extraction.get("success"):
                    # Replace snippet with full article.
                    enhanced["content"] = extraction["content"]
                    enhanced["full_article"] = True
                    enhanced["content_length"] = extraction["length"]
                    enhanced["jina_title"] = extraction.get("title")
                    enhanced["extracted_at"] = extraction.get("extracted_at")
                    logger.debug(
                        f"Enhanced result {i+1}: {extraction['length']:,} chars "
                        f"(was {len(result.get('content', ''))} chars)"
                    )
                else:
                    # Extraction failed.
                    enhanced["full_article"] = False
                    enhanced["jina_error"] = extraction.get("error")
                    if not fallback_to_snippet:
                        # Drop the result entirely if fallback is disabled.
                        logger.debug(
                            f"Skipping result {i+1}: Jina failed and fallback disabled"
                        )
                        continue
                    logger.debug(
                        f"Keeping snippet for result {i+1}: {extraction.get('error')}"
                    )
            enhanced_results.append(enhanced)
        # Log enhancement summary.
        full_articles = sum(1 for r in enhanced_results if r.get("full_article"))
        snippets = len(enhanced_results) - full_articles
        logger.info(
            f"Enhanced {len(enhanced_results)} results: "
            f"{full_articles} full articles, {snippets} snippets"
        )
        return enhanced_results

    async def close(self):
        """Close the HTTP client (safe to call repeatedly)."""
        if self.client:
            await self.client.aclose()
            self.client = None
            logger.debug("Jina Reader client closed")

    def _strip_boilerplate(self, content: str, max_chars: int = 8000) -> str:
        """
        Strip navigation, footer, archives and other boilerplate from
        Jina-extracted markdown. Keeps only the article body.

        Strategy:
            1. Cut at the first common boilerplate section marker
            2. Hard cap at max_chars to avoid sending huge pages to the LLM

        Args:
            content: Markdown text as returned by Jina Reader
            max_chars: Maximum length of the returned body

        Returns:
            Trimmed article body.
        """
        # Markers that indicate end of article content.
        # Everything after the first match is navigation/footer/boilerplate.
        cutoff_patterns = [
            r'\n## (Post navigation|Archives|Categories|Recent Posts|Search|Newsletter|Socials|Tags|Related)',
            r'\n### (Post navigation|Archives|Categories|Recent Posts|Related)',
            r'\n\* \[Home\]\(',  # Navigation list starting with Home
            r'\n\* \[Facebook\]\(',  # Social links
            r'\nCopyright ©',
            r'\n---\n.*\n---',  # Horizontal rules often mark footer
            r'\nShare on (Facebook|Twitter|X|LinkedIn)',
            r'\n## Search\n',
            r'\n## Newsletter\n',
            r'\n## Socials\n',
        ]
        for pattern in cutoff_patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                content = content[:match.start()].strip()
                break
        # Hard cap - LLM context window protection.
        if len(content) > max_chars:
            # Prefer cutting at a paragraph boundary near the cap.
            cutoff = content[:max_chars].rfind('\n\n')
            if cutoff > max_chars * 0.7:
                content = content[:cutoff].strip()
            else:
                content = content[:max_chars].strip()
        return content

    def is_available(self) -> bool:
        """Always True: the endpoint needs no local setup, though many
        target sites require an API key (see _ensure_client)."""
        return True
# Module-level singleton for easy import
_default_adapter: Optional["JinaReaderAdapter"] = None


def get_jina_reader_adapter(
    timeout: float = 10.0,
    max_concurrent: int = 5
) -> JinaReaderAdapter:
    """
    Return the shared Jina Reader adapter, creating it on first call.

    Note that timeout/max_concurrent only take effect on the first call;
    subsequent calls return the already-built singleton unchanged.

    Args:
        timeout: Extraction timeout in seconds
        max_concurrent: Maximum parallel extractions

    Returns:
        JinaReaderAdapter instance
    """
    global _default_adapter
    if _default_adapter is None:
        _default_adapter = JinaReaderAdapter(
            timeout=timeout, max_concurrent=max_concurrent
        )
    return _default_adapter