github-actions[bot] commited on
Commit
4c52012
·
1 Parent(s): 5186e8e

🚀 Auto-deploy backend from GitHub (8c85dcd)

Browse files
Files changed (1) hide show
  1. main.py +14 -359
main.py CHANGED
@@ -1366,10 +1366,10 @@ async def call_hf_chat_async(
1366
  effective_task = (task_type or "default").strip().lower()
1367
  request_tag = f"{effective_task}-async-{int(time.time() * 1000)}"
1368
 
1369
- selection_req = InferenceRequest(
1370
  messages=messages,
1371
  model=model,
1372
- task_type=task_type,
1373
  request_tag=request_tag,
1374
  max_new_tokens=max_tokens,
1375
  temperature=temperature,
@@ -1377,168 +1377,12 @@ async def call_hf_chat_async(
1377
  repetition_penalty=repetition_penalty,
1378
  timeout_sec=timeout,
1379
  )
1380
- selected_model, _ = client._resolve_primary_model(selection_req)
1381
- model_chain = client._model_chain_for_task(effective_task, selected_model)
1382
- provider_chain = client._provider_chain_for_task(effective_task)
1383
- last_error: Optional[Exception] = None
1384
- retryable_status = {408, 429, 500, 502, 503, 504}
1385
-
1386
- for fallback_depth, model_name in enumerate(model_chain):
1387
- request_for_model = InferenceRequest(
1388
- messages=messages,
1389
- model=model_name,
1390
- task_type=task_type,
1391
- request_tag=request_tag,
1392
- max_new_tokens=max_tokens,
1393
- temperature=temperature,
1394
- top_p=top_p,
1395
- repetition_penalty=repetition_penalty,
1396
- timeout_sec=timeout,
1397
- )
1398
- for provider in provider_chain:
1399
- route = client._resolve_route_label(provider, effective_task)
1400
- if provider == "local_space":
1401
- try:
1402
- text = await _run_hf_blocking(
1403
- client._generate_with_provider,
1404
- request_for_model,
1405
- provider,
1406
- fallback_depth,
1407
- )
1408
- return _strip_repetition(text)
1409
- except Exception as exc:
1410
- last_error = exc
1411
- logger.warning(
1412
- "⚠️ Async local fallback failed: task=%s model=%s depth=%s error=%s",
1413
- effective_task,
1414
- model_name,
1415
- fallback_depth,
1416
- str(exc)[:180],
1417
- )
1418
- continue
1419
-
1420
- stream_model = model_name if ":" in model_name else f"{model_name}:fastest"
1421
- timeout_sec = client._timeout_for(request_for_model, provider)
1422
- max_retries, backoff_sec = client._retry_profile(effective_task)
1423
- headers = {
1424
- "Authorization": f"Bearer {client.hf_token}",
1425
- "Content-Type": "application/json",
1426
- "X-MathPulse-Task": effective_task,
1427
- }
1428
- if route == "pro-priority" and client.pro_route_header_name.strip():
1429
- headers[client.pro_route_header_name.strip()] = client.pro_route_header_value
1430
-
1431
- payload: Dict[str, object] = {
1432
- "model": stream_model,
1433
- "messages": messages,
1434
- "stream": False,
1435
- "max_tokens": max_tokens,
1436
- "temperature": temperature,
1437
- "top_p": top_p,
1438
- }
1439
-
1440
- async_client = await _get_hf_async_http_client()
1441
- for attempt in range(1, max_retries + 1):
1442
- start = time.perf_counter()
1443
- client._record_attempt(
1444
- task_type=effective_task,
1445
- provider=provider,
1446
- route=route,
1447
- fallback_depth=fallback_depth,
1448
- )
1449
- try:
1450
- response = await async_client.post(
1451
- client.hf_chat_url,
1452
- headers=headers,
1453
- json=payload,
1454
- timeout=_resolve_async_hf_timeout(timeout_sec),
1455
- )
1456
- latency_ms = (time.perf_counter() - start) * 1000
1457
- client._bump_bucket("status_code_counts", str(response.status_code), 1)
1458
-
1459
- if response.status_code in retryable_status and attempt < max_retries:
1460
- log_model_call(
1461
- logger,
1462
- provider=provider,
1463
- model=model_name,
1464
- endpoint=client.hf_chat_url,
1465
- latency_ms=latency_ms,
1466
- input_tokens=None,
1467
- output_tokens=None,
1468
- status="error",
1469
- error_class="HTTPRetry",
1470
- error_message=f"status={response.status_code}",
1471
- task_type=effective_task,
1472
- request_tag=request_tag,
1473
- retry_attempt=attempt,
1474
- fallback_depth=fallback_depth,
1475
- route=route,
1476
- )
1477
- client._bump_metric("retries_total", 1)
1478
- await asyncio.sleep(_hf_retry_sleep_seconds(backoff_sec, attempt))
1479
- continue
1480
-
1481
- if response.status_code != 200:
1482
- client._bump_metric("requests_error", 1)
1483
- raise RuntimeError(
1484
- f"HF inference error {response.status_code}: {response.text[:280]}"
1485
- )
1486
-
1487
- data = response.json()
1488
- text = client._extract_text(data)
1489
- log_model_call(
1490
- logger,
1491
- provider=provider,
1492
- model=model_name,
1493
- endpoint=client.hf_chat_url,
1494
- latency_ms=latency_ms,
1495
- input_tokens=None,
1496
- output_tokens=None,
1497
- status="ok",
1498
- task_type=effective_task,
1499
- request_tag=request_tag,
1500
- retry_attempt=attempt,
1501
- fallback_depth=fallback_depth,
1502
- route=route,
1503
- )
1504
- client._bump_metric("requests_ok", 1)
1505
- return _strip_repetition(text)
1506
- except Exception as exc:
1507
- latency_ms = (time.perf_counter() - start) * 1000
1508
- last_error = exc
1509
- if attempt < max_retries:
1510
- log_model_call(
1511
- logger,
1512
- provider=provider,
1513
- model=model_name,
1514
- endpoint=client.hf_chat_url,
1515
- latency_ms=latency_ms,
1516
- input_tokens=None,
1517
- output_tokens=None,
1518
- status="error",
1519
- error_class=exc.__class__.__name__,
1520
- error_message=str(exc),
1521
- task_type=effective_task,
1522
- request_tag=request_tag,
1523
- retry_attempt=attempt,
1524
- fallback_depth=fallback_depth,
1525
- route=route,
1526
- )
1527
- client._bump_metric("retries_total", 1)
1528
- await asyncio.sleep(_hf_retry_sleep_seconds(backoff_sec, attempt))
1529
- continue
1530
-
1531
- client._bump_metric("requests_error", 1)
1532
- logger.warning(
1533
- "⚠️ Async HF attempt failed: task=%s provider=%s model=%s depth=%s error=%s",
1534
- effective_task,
1535
- provider,
1536
- model_name,
1537
- fallback_depth,
1538
- str(exc)[:180],
1539
- )
1540
 
1541
- raise last_error or RuntimeError("Inference failed with empty model/provider chain")
 
 
 
 
1542
 
1543
 
1544
  async def call_hf_chat_stream_async(
@@ -1574,204 +1418,15 @@ async def call_hf_chat_stream_async(
1574
  yield str(chunk)
1575
 
1576
  client = get_inference_client()
1577
- effective_task = (task_type or "chat").strip().lower()
1578
- request_tag = f"{effective_task}-stream-async-{int(time.time() * 1000)}"
1579
-
1580
- selection_req = InferenceRequest(
1581
- messages=messages,
1582
- model=model,
1583
- task_type=task_type,
1584
- request_tag=request_tag,
1585
- max_new_tokens=max_tokens,
1586
  temperature=temperature,
1587
  top_p=top_p,
1588
- timeout_sec=timeout,
1589
- )
1590
- selected_model, _ = client._resolve_primary_model(selection_req)
1591
- model_chain = client._model_chain_for_task(effective_task, selected_model)
1592
- provider_chain = client._provider_chain_for_task(effective_task)
1593
- last_error: Optional[Exception] = None
1594
- retryable_status = {408, 429, 500, 502, 503, 504}
1595
-
1596
- for fallback_depth, model_name in enumerate(model_chain):
1597
- request_for_model = InferenceRequest(
1598
- messages=messages,
1599
- model=model_name,
1600
- task_type=task_type,
1601
- request_tag=request_tag,
1602
- max_new_tokens=max_tokens,
1603
- temperature=temperature,
1604
- top_p=top_p,
1605
- timeout_sec=timeout,
1606
- )
1607
- for provider in provider_chain:
1608
- if provider == "local_space":
1609
- last_error = RuntimeError("Streaming is not supported for local_space provider")
1610
- continue
1611
-
1612
- route = client._resolve_route_label(provider, effective_task)
1613
- stream_model = model_name if ":" in model_name else f"{model_name}:fastest"
1614
- timeout_sec = client._timeout_for(request_for_model, provider)
1615
- max_retries, backoff_sec = client._retry_profile(effective_task)
1616
-
1617
- headers = {
1618
- "Authorization": f"Bearer {client.hf_token}",
1619
- "Content-Type": "application/json",
1620
- "X-MathPulse-Task": effective_task,
1621
- }
1622
- if route == "pro-priority" and client.pro_route_header_name.strip():
1623
- headers[client.pro_route_header_name.strip()] = client.pro_route_header_value
1624
-
1625
- payload: Dict[str, object] = {
1626
- "model": stream_model,
1627
- "messages": messages,
1628
- "stream": True,
1629
- "max_tokens": max_tokens,
1630
- "temperature": temperature,
1631
- "top_p": top_p,
1632
- }
1633
-
1634
- async_client = await _get_hf_async_http_client()
1635
- for attempt in range(1, max_retries + 1):
1636
- start = time.perf_counter()
1637
- client._record_attempt(
1638
- task_type=effective_task,
1639
- provider=provider,
1640
- route=route,
1641
- fallback_depth=fallback_depth,
1642
- )
1643
- try:
1644
- async with async_client.stream(
1645
- "POST",
1646
- client.hf_chat_url,
1647
- headers=headers,
1648
- json=payload,
1649
- timeout=_resolve_async_hf_timeout(timeout_sec),
1650
- ) as response:
1651
- client._bump_bucket("status_code_counts", str(response.status_code), 1)
1652
-
1653
- if response.status_code in retryable_status and attempt < max_retries:
1654
- body = await response.aread()
1655
- body_preview = body[:220].decode("utf-8", errors="replace")
1656
- latency_ms = (time.perf_counter() - start) * 1000
1657
- log_model_call(
1658
- logger,
1659
- provider=provider,
1660
- model=model_name,
1661
- endpoint=client.hf_chat_url,
1662
- latency_ms=latency_ms,
1663
- input_tokens=None,
1664
- output_tokens=None,
1665
- status="error",
1666
- error_class="HTTPRetry",
1667
- error_message=f"status={response.status_code} body={body_preview}",
1668
- task_type=effective_task,
1669
- request_tag=request_tag,
1670
- retry_attempt=attempt,
1671
- fallback_depth=fallback_depth,
1672
- route=route,
1673
- )
1674
- client._bump_metric("retries_total", 1)
1675
- await asyncio.sleep(_hf_retry_sleep_seconds(backoff_sec, attempt))
1676
- continue
1677
-
1678
- if response.status_code != 200:
1679
- body = await response.aread()
1680
- body_preview = body[:280].decode("utf-8", errors="replace")
1681
- client._bump_metric("requests_error", 1)
1682
- raise RuntimeError(
1683
- f"HF stream error {response.status_code}: {body_preview}"
1684
- )
1685
-
1686
- emitted_any = False
1687
- async for raw_line in response.aiter_lines():
1688
- if not raw_line:
1689
- continue
1690
- line = raw_line.strip()
1691
- if not line.startswith("data:"):
1692
- continue
1693
-
1694
- data_raw = line.split("data:", 1)[1].strip()
1695
- if data_raw == "[DONE]":
1696
- continue
1697
-
1698
- try:
1699
- payload_obj = json.loads(data_raw)
1700
- except json.JSONDecodeError:
1701
- continue
1702
-
1703
- choices = payload_obj.get("choices") or []
1704
- if not choices:
1705
- continue
1706
- first = choices[0] if isinstance(choices[0], dict) else {}
1707
- delta = first.get("delta") or {}
1708
- chunk = delta.get("content")
1709
- if not chunk:
1710
- msg = first.get("message") or {}
1711
- chunk = msg.get("content")
1712
- if not chunk:
1713
- continue
1714
-
1715
- emitted_any = True
1716
- yield str(chunk)
1717
-
1718
- if emitted_any:
1719
- latency_ms = (time.perf_counter() - start) * 1000
1720
- log_model_call(
1721
- logger,
1722
- provider=provider,
1723
- model=model_name,
1724
- endpoint=client.hf_chat_url,
1725
- latency_ms=latency_ms,
1726
- input_tokens=None,
1727
- output_tokens=None,
1728
- status="ok",
1729
- task_type=effective_task,
1730
- request_tag=request_tag,
1731
- retry_attempt=attempt,
1732
- fallback_depth=fallback_depth,
1733
- route=route,
1734
- )
1735
- client._bump_metric("requests_ok", 1)
1736
- return
1737
-
1738
- raise RuntimeError("HF stream ended without content")
1739
- except Exception as exc:
1740
- latency_ms = (time.perf_counter() - start) * 1000
1741
- last_error = exc
1742
- if attempt < max_retries:
1743
- log_model_call(
1744
- logger,
1745
- provider=provider,
1746
- model=model_name,
1747
- endpoint=client.hf_chat_url,
1748
- latency_ms=latency_ms,
1749
- input_tokens=None,
1750
- output_tokens=None,
1751
- status="error",
1752
- error_class=exc.__class__.__name__,
1753
- error_message=str(exc),
1754
- task_type=effective_task,
1755
- request_tag=request_tag,
1756
- retry_attempt=attempt,
1757
- fallback_depth=fallback_depth,
1758
- route=route,
1759
- )
1760
- client._bump_metric("retries_total", 1)
1761
- await asyncio.sleep(_hf_retry_sleep_seconds(backoff_sec, attempt))
1762
- continue
1763
-
1764
- client._bump_metric("requests_error", 1)
1765
- logger.warning(
1766
- "⚠️ Async stream attempt failed: task=%s provider=%s model=%s depth=%s error=%s",
1767
- effective_task,
1768
- provider,
1769
- model_name,
1770
- fallback_depth,
1771
- str(exc)[:180],
1772
- )
1773
-
1774
- raise last_error or RuntimeError("Streaming failed with empty model/provider chain")
1775
 
1776
 
1777
  def load_local_math_model(model_name: str = "Qwen/Qwen2.5-Math-7B-Instruct"):
 
1366
  effective_task = (task_type or "default").strip().lower()
1367
  request_tag = f"{effective_task}-async-{int(time.time() * 1000)}"
1368
 
1369
+ req = InferenceRequest(
1370
  messages=messages,
1371
  model=model,
1372
+ task_type=effective_task,
1373
  request_tag=request_tag,
1374
  max_new_tokens=max_tokens,
1375
  temperature=temperature,
 
1377
  repetition_penalty=repetition_penalty,
1378
  timeout_sec=timeout,
1379
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1380
 
1381
+ try:
1382
+ return await _run_hf_blocking(client._call_deepseek, req, 0)
1383
+ except Exception as exc:
1384
+ logger.error(f"DeepSeek chat failed: {exc}")
1385
+ raise RuntimeError("AI model service is temporarily unavailable. Please try again.")
1386
 
1387
 
1388
  async def call_hf_chat_stream_async(
 
1418
  yield str(chunk)
1419
 
1420
  client = get_inference_client()
1421
+ async for chunk in client._call_deepseek_stream(
1422
+ messages,
1423
+ max_tokens=max_tokens,
 
 
 
 
 
 
1424
  temperature=temperature,
1425
  top_p=top_p,
1426
+ model=model,
1427
+ task_type=task_type,
1428
+ ):
1429
+ yield str(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1430
 
1431
 
1432
  def load_local_math_model(model_name: str = "Qwen/Qwen2.5-Math-7B-Instruct"):