from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional, Any
import os
import uuid
import time
import re
import httpx
# ============== Pydantic Models ==============
class Validation(BaseModel):
    """Request body carrying a single free-text prompt (used by /llm_on_cpu and /eth_to_units)."""
    prompt: str  # user-supplied text; forwarded to the LLM or scanned for a numeric value
class EthConversionRequest(BaseModel):
    """Request body for /convert_eth_units."""
    value: float  # amount expressed in `from_unit`
    from_unit: str = "eth"  # "eth", "gwei", or "wei" (matched case-insensitively downstream)
class Message(BaseModel):
    """A single chat turn in OpenAI chat format."""
    role: str  # e.g. "system" / "user" / "assistant" — no validation applied here
    content: str
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible /v1/chat/completions request.

    Fields marked "accept but ignore" exist only so strict clients (OpenCode)
    can send them without a 422; they are never forwarded to the HF Router.
    """
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 8192
    stream: Optional[bool] = False  # streaming is not implemented; always treated as False
    tools: Optional[List[Any]] = None  # accept but ignore — prevents OpenCode hang
    tool_choice: Optional[Any] = None  # accept but ignore
    stop: Optional[Any] = None  # accept but ignore
    frequency_penalty: Optional[float] = None  # accept but ignore
    presence_penalty: Optional[float] = None  # accept but ignore
    top_p: Optional[float] = None  # accept but ignore
class Choice(BaseModel):
    """One completion choice in the OpenAI response shape (this API always emits exactly one)."""
    index: int
    message: Message
    finish_reason: str  # always "stop" in this implementation
class Usage(BaseModel):
    """Token accounting; values here are character-count estimates (len // 4), not real token counts."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
class ChatCompletionResponse(BaseModel):
    """Top-level OpenAI-compatible chat completion response envelope."""
    id: str  # "chatcmpl-<8 hex chars>"
    object: str = "chat.completion"
    created: int  # unix timestamp
    model: str  # echoed back from the request
    choices: List[Choice]
    usage: Usage
# ============== FastAPI App ==============
# Application instance; the metadata below feeds the auto-generated /docs page.
app = FastAPI(
    title="Luminous API",
    description="""
## Luminous Coding Assistant API
OpenAI-compatible API powered by Qwen via HuggingFace Router Inference API.
Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
""",
    version="1.2.0",
    contact={"name": "Jeeltcraft", "url": "https://huggingface.co/jeeltcraft"},
    license_info={"name": "MIT"},
)
# ============== LLM Core ==============
# Model id requested from the HF Router.
HF_MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"
# Primary hf-inference provider endpoint; FALLBACK_URL is the generic router
# path retried when the primary answers 404/410 (see call_llm).
PRIMARY_URL = "https://router.huggingface.co/hf-inference/v1/chat/completions"
FALLBACK_URL = "https://router.huggingface.co/v1/chat/completions"
async def call_llm(messages: list, max_tokens: int = 8192, temperature: float = 0.7) -> str:
    """Send one chat-completion request to the HF Router and return the reply text.

    All failures are reported as human-readable strings rather than raised,
    so callers can always treat the return value as displayable content.

    Args:
        messages: OpenAI-style list of {"role": ..., "content": ...} dicts.
        max_tokens: Completion length cap forwarded to the router.
        temperature: Sampling temperature forwarded to the router.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        return "Error: HF_TOKEN not configured in Space secrets."
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    body = {
        "model": HF_MODEL_ID,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.95,
        "stream": False,  # always False — streaming not implemented
    }
    # NOTE: tools/tool_choice intentionally NOT forwarded to HF Router
    # Sending tools causes infinite spin loop in OpenCode with custom providers
    try:
        async with httpx.AsyncClient(timeout=180.0) as client:
            response = await client.post(PRIMARY_URL, headers=request_headers, json=body)
            # Provider-specific route gone? Retry once against the generic router path.
            if response.status_code in (404, 410):
                response = await client.post(FALLBACK_URL, headers=request_headers, json=body)
            if response.status_code == 403:
                return f"Auth Error (403): Enable 'Inference Providers' on your HF token. Detail: {response.text}"
            response.raise_for_status()
            raw = response.text
            if not raw or not raw.strip():
                return "Error: HF Router returned empty response (model may be loading, retry in 10s)"
            try:
                result = response.json()
            except Exception:
                return f"Error: Non-JSON response from HF Router: {raw[:500]}"
            # Normal chat-completion shape.
            if "choices" in result and result["choices"]:
                return result["choices"][0]["message"]["content"]
            # Legacy text-generation shape: a list of {"generated_text": ...}.
            if isinstance(result, list) and result:
                return result[0].get("generated_text", "No response generated")
            return f"Unexpected response format: {str(result)[:300]}"
    except httpx.TimeoutException:
        return "Error: Request timed out after 180s. Try a shorter prompt or reduce max_tokens."
    except httpx.HTTPStatusError as e:
        return f"Error: HTTP {e.response.status_code} — {e.response.text}"
    except Exception as e:
        return f"Error during inference: {str(e)}"
# ============== OpenAI-Compatible Endpoints ==============
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse, tags=["OpenAI Compatible"])
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible endpoint. Passes full conversation history, strips tool calls."""
    try:
        # Pass ALL messages directly — full multi-turn history preserved for OpenCode context.
        # tools/tool_choice fields are accepted by the model but NOT forwarded to HF Router.
        history = [{"role": msg.role, "content": msg.content} for msg in request.messages]
        completion = await call_llm(
            messages=history,
            max_tokens=8192 if request.max_tokens is None else request.max_tokens,
            temperature=0.7 if request.temperature is None else request.temperature,
        )
        # Rough usage accounting: ~4 characters per token heuristic.
        est_prompt = len(" ".join(m["content"] for m in history)) // 4
        est_completion = len(completion) // 4
        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=completion),
                    finish_reason="stop",
                )
            ],
            usage=Usage(
                prompt_tokens=est_prompt,
                completion_tokens=est_completion,
                total_tokens=est_prompt + est_completion,
            ),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@app.get("/v1/models", tags=["OpenAI Compatible"])
async def list_models():
    """Minimal OpenAI-style model listing so clients can discover the served model id."""
    model_entry = {
        "id": "qwen",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "jeeltcraft",
        "context_length": 32768,
    }
    return {"object": "list", "data": [model_entry]}
# ============== Utilities ==============
def convert_eth_units(value: float, from_unit: str = "eth") -> dict:
    """Convert a value between Ethereum denominations (wei / gwei / eth).

    Args:
        value: Numeric amount expressed in `from_unit`.
        from_unit: One of "eth", "gwei", or "wei" (case-insensitive).

    Returns:
        dict with the original input, exact conversions, and display strings.
        Wei is returned as a string because it can exceed the JS safe-integer range.

    Raises:
        ValueError: If `from_unit` is not a recognized denomination.
    """
    from decimal import Decimal

    # Power of 10 that converts one unit of `from_unit` into wei.
    wei_exponent = {"eth": 18, "gwei": 9, "wei": 0}
    unit = from_unit.lower()
    if unit not in wei_exponent:
        raise ValueError("Invalid unit. Use 'eth', 'gwei', or 'wei'")
    # Route through Decimal(str(value)) instead of float multiplication:
    # int(1.1 * 10**18) yields 1100000000000000128 due to binary float
    # rounding, while Decimal gives the exact 1100000000000000000 wei.
    wei_value = int(Decimal(str(value)) * 10 ** wei_exponent[unit])
    return {
        "input": {"value": value, "unit": from_unit},
        "conversions": {"wei": str(wei_value), "gwei": wei_value / 10**9, "eth": wei_value / 10**18},
        "formatted": {
            "wei": f"{wei_value:,} wei",
            "gwei": f"{wei_value / 10**9:,.2f} gwei",
            "eth": f"{wei_value / 10**18:.18f} ETH"
        }
    }
@app.post("/llm_on_cpu", tags=["LLM"])
async def llm_direct(item: Validation):
    """One-shot prompt endpoint: wraps the prompt in a generic system message and calls the LLM."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": item.prompt},
    ]
    reply = await call_llm(conversation)
    return {"response": reply}
@app.post("/convert_eth_units", tags=["Utilities"])
async def convert_units(request: EthConversionRequest):
    """JSON-body wrapper around convert_eth_units; maps bad units to HTTP 400."""
    try:
        result = convert_eth_units(request.value, request.from_unit)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    return result
@app.post("/eth_to_units", tags=["Utilities"])
async def eth_to_units(item: Validation):
    """Pull the first number out of the prompt text and convert it as an ETH amount."""
    found = re.search(r'\d+\.?\d*', item.prompt)
    if not found:
        raise HTTPException(status_code=400, detail="No numeric value found")
    return convert_eth_units(float(found.group()), "eth")
@app.get("/quick_convert/{value}/{unit}", tags=["Utilities"])
async def quick_convert(value: float, unit: str = "eth"):
    """Path-parameter variant of the unit converter; bad units become HTTP 400."""
    try:
        converted = convert_eth_units(value, unit)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    return converted
@app.get("/", tags=["Utilities"])
async def root():
    """Landing endpoint with a short service summary and a pointer to /docs."""
    info = {
        "message": "Luminous API — OpenAI Compatible Coding Assistant",
        "model": HF_MODEL_ID,
        "status": "active",
        "docs": "/docs",
    }
    return info
@app.get("/health", tags=["Utilities"])
async def health_check():
    """Liveness probe; also reports whether the HF token secret is present."""
    # Note: an HF_TOKEN set to the empty string deliberately reads as "not configured".
    token_present = bool(os.environ.get("HF_TOKEN"))
    return {
        "status": "healthy",
        "model": HF_MODEL_ID,
        "hf_token_configured": token_present,
        "api_version": "1.2.0",
    }
|