Luminous / main.py
jeeltcraft's picture
Update main.py
5e7aadc verified
raw
history blame
12.7 kB
import os
import re
import time
import uuid
from decimal import Decimal
from typing import List, Optional

from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from pydantic import BaseModel
# ============== Pydantic Models ==============
class Validation(BaseModel):
    """Request body for the direct `/llm_on_cpu` inference endpoint."""
    prompt: str  # raw text prompt forwarded to the model
class EthConversionRequest(BaseModel):
    """Request body for the `/convert_eth_units` endpoint."""
    value: float  # numeric amount to convert
    from_unit: str = "eth" # eth, gwei, or wei
# OpenAI-compatible models
class Message(BaseModel):
    """A single chat message in OpenAI format."""
    role: str  # "system", "user", or "assistant"
    content: str  # message text
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible request body for `/v1/chat/completions`."""
    model: str  # model identifier (e.g. "qwen2.5-coder-7b" or the "gpt-4" alias)
    messages: List[Message]  # conversation messages in order
    temperature: Optional[float] = 0.7  # sampling temperature
    max_tokens: Optional[int] = 1024  # cap on generated tokens
    stream: Optional[bool] = False  # accepted for compatibility; streaming not yet implemented
class Choice(BaseModel):
    """One generated completion inside a ChatCompletionResponse."""
    index: int  # position of this choice in the response list
    message: Message  # the assistant's generated message
    finish_reason: str  # why generation stopped (this API always reports "stop")
class Usage(BaseModel):
    """Token accounting for a completion (word-count estimates in this API)."""
    prompt_tokens: int  # estimated tokens in the prompt
    completion_tokens: int  # estimated tokens in the generated text
    total_tokens: int  # prompt_tokens + completion_tokens
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible response envelope for `/v1/chat/completions`."""
    id: str  # e.g. "chatcmpl-<8 hex chars>"
    object: str = "chat.completion"  # fixed OpenAI object type
    created: int  # Unix timestamp of creation
    model: str  # echoes the requested model id
    choices: List[Choice]  # generated completions (this API returns exactly one)
    usage: Usage  # token usage estimates
# ============== FastAPI App ==============
# FastAPI application object. The markdown in `description` is rendered
# verbatim in the Swagger UI, so its content is runtime behavior.
app = FastAPI(
    title="Luminous API",
    description="""
## Luminous Coding Assistant API
OpenAI-compatible API powered by Qwen2.5-Coder-7B for code generation and assistance.
### Features
* πŸ€– AI-powered code generation with Qwen2.5-Coder (GGUF quantized)
* πŸ”Œ OpenAI-compatible endpoints for Cursor IDE integration
* πŸ’° ETH unit conversion utilities (Wei ↔ Gwei ↔ ETH)
* πŸ’» Optimized for coding tasks and assistance
* ⚑ Fast inference with llama.cpp
### Integration with Cursor IDE
1. Go to Cursor Settings β†’ Models β†’ Override OpenAI Base URL
2. Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
3. Model name: `qwen2.5-coder-7b` or `gpt-4`
4. Add any dummy API key
""",
    version="1.0.0",
    contact={
        "name": "Jeeltcraft",
        "url": "https://huggingface.co/jeeltcraft",
    },
    license_info={
        "name": "MIT",
    },
    # Tag metadata groups endpoints in the generated OpenAPI docs.
    openapi_tags=[
        {
            "name": "OpenAI Compatible",
            "description": "Endpoints compatible with OpenAI API for Cursor integration",
        },
        {
            "name": "LLM",
            "description": "Direct language model inference endpoints",
        },
        {
            "name": "Utilities",
            "description": "ETH conversion and helper functions",
        },
    ],
    # Cosmetic settings for the Swagger UI renderer.
    swagger_ui_parameters={
        "deepLinking": True,
        "displayRequestDuration": True,
        "docExpansion": "none",
        "syntaxHighlight.theme": "monokai",
        "defaultModelsExpandDepth": 2,
    }
)
# Global variable to hold the model.
# Lazily initialized Llama instance; populated on first call to get_llm().
_llm_model = None
# ============== LLM Functions ==============
def get_llm():
    """
    Return the process-wide GGUF model, loading it on first use.

    Model files are preloaded by Hugging Face Spaces during build time;
    subsequent calls return the cached Llama instance.
    """
    global _llm_model
    # Fast path: model already loaded.
    if _llm_model is not None:
        return _llm_model
    try:
        from huggingface_hub import hf_hub_download

        # Resolve the on-disk path of the preloaded GGUF weights.
        weights_path = hf_hub_download(
            repo_id="CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf",
            cache_dir="/root/.cache/huggingface/hub",
        )
        print(f"πŸ“¦ Loading model from: {weights_path}")
        _llm_model = Llama(
            model_path=weights_path,
            n_ctx=2048,       # context window
            n_threads=4,      # CPU threads to use
            n_gpu_layers=0,   # 0 for CPU only
            verbose=False,    # reduce logging
            seed=42,          # for reproducibility
        )
        print("βœ“ Model loaded successfully with llama.cpp")
    except Exception as e:
        print(f"βœ— Error loading model: {e}")
        raise
    return _llm_model
def call_llm(prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
    """
    Run a single completion through the shared GGUF model via llama.cpp.

    Returns the generated text, or a best-effort error string (never raises).
    """
    try:
        model = get_llm()
        result = model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            repeat_penalty=1.1,
            # Cut generation at end-of-turn markers from either template style.
            stop=["</s>", "<|user|>", "<|im_end|>", "<|im_start|>"],
            echo=False,  # exclude the prompt from the output
        )
        # llama.cpp returns an OpenAI-style dict; pull out the generated text.
        return result['choices'][0]['text'].strip()
    except Exception as e:
        # Deliberate best-effort contract: report failure as text, don't raise.
        return f"Error during inference: {str(e)}"
# ============== Helper Functions ==============
def convert_eth_units(value: float, from_unit: str = "eth") -> dict:
    """
    Convert an amount between Ethereum units (wei, gwei, ETH).

    Args:
        value: The numeric value to convert.
        from_unit: The source unit ('eth', 'gwei', or 'wei'), case-insensitive.

    Returns:
        Dictionary with the original input, conversions to all units, and
        human-readable formatted strings. Wei is returned as a string to
        avoid JavaScript number overflow in JSON clients.

    Raises:
        ValueError: If from_unit is not 'eth', 'gwei', or 'wei'.
    """
    # Wei per one unit of each supported denomination.
    wei_per_unit = {"eth": 10**18, "gwei": 10**9, "wei": 1}
    unit = from_unit.lower()
    if unit not in wei_per_unit:
        raise ValueError("Invalid unit. Use 'eth', 'gwei', or 'wei'")
    # Decimal(str(value)) avoids binary-float rounding: int(0.1 * 10**18)
    # yields 100000000000000008, while Decimal("0.1") * 10**18 is exact.
    # int() truncates toward zero, matching the previous behavior for
    # sub-wei fractions.
    wei_value = int(Decimal(str(value)) * wei_per_unit[unit])
    # Derive the other denominations from the canonical wei amount.
    eth_value = wei_value / 10**18
    gwei_value = wei_value / 10**9
    return {
        "input": {
            "value": value,
            "unit": from_unit
        },
        "conversions": {
            "wei": str(wei_value),  # string to avoid JavaScript number overflow
            "gwei": gwei_value,
            "eth": eth_value
        },
        "formatted": {
            "wei": f"{wei_value:,} wei",
            "gwei": f"{gwei_value:,.2f} gwei",
            "eth": f"{eth_value:.18f} ETH"
        }
    }
# ============== Startup Event ==============
@app.on_event("startup")
async def startup_event():
"""
Pre-load the model during startup to avoid timeout on first request.
"""
print("πŸš€ Starting up Luminous API...")
try:
get_llm() # This will load the model
print("βœ… Model loaded and ready!")
except Exception as e:
print(f"⚠️ Warning: Could not pre-load model: {e}")
print("Model will be loaded on first request.")
# ============== OpenAI-Compatible Endpoints ==============
@app.post(
    "/v1/chat/completions",
    response_model=ChatCompletionResponse,
    tags=["OpenAI Compatible"],
    summary="Create chat completion",
    response_description="Returns the model's response to the conversation"
)
async def chat_completions(request: ChatCompletionRequest):
    """
    Create a chat completion using OpenAI-compatible format.

    This endpoint is designed for integration with Cursor IDE and other
    OpenAI-compatible clients. It accepts a conversation history and returns
    the model's response.

    ## Parameters
    - **model**: Model identifier (use `qwen2.5-coder-7b` or `gpt-4` for this API)
    - **messages**: Array of conversation messages with role and content
        - Role can be: `system`, `user`, or `assistant`
    - **temperature**: Controls randomness (0.0 = deterministic, 2.0 = very random)
    - **max_tokens**: Maximum number of tokens to generate in the response
    - **stream**: Whether to stream the response (not yet implemented)

    ## Example Request
    ```json
    {
        "model": "qwen2.5-coder-7b",
        "messages": [
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": "Write a Python function to reverse a string"}
        ],
        "temperature": 0.7,
        "max_tokens": 512
    }
    ```

    ## Returns
    A chat completion response with the model's generated text, token usage,
    and other metadata.
    """
    try:
        # Build a ChatML prompt from the FULL conversation so the model sees
        # prior turns. (The previous implementation dropped everything except
        # the last user message and the first system message, losing context
        # in multi-turn conversations.)
        parts = []
        if not any(msg.role == "system" for msg in request.messages):
            # Supply a default system turn when the client omits one.
            parts.append(
                "<|im_start|>system\nYou are a helpful coding assistant.<|im_end|>\n"
            )
        for msg in request.messages:
            parts.append(f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n")
        parts.append("<|im_start|>assistant\n")
        formatted_prompt = "".join(parts)

        # NOTE(review): the model is loaded with n_ctx=2048; very long
        # histories may overflow the context window — consider truncating
        # old turns if that shows up in practice.
        response_text = call_llm(
            formatted_prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature
        )

        # Rough word-based token estimates (no tokenizer round-trip).
        prompt_tokens = sum(len(msg.content.split()) for msg in request.messages)
        completion_tokens = len(response_text.split())

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@app.get(
    "/v1/models",
    tags=["OpenAI Compatible"],
    summary="List available models",
    response_description="Returns a list of available models"
)
async def list_models():
    """
    List all available models in OpenAI-compatible format.

    Use one of the returned model IDs in the `model` field of
    `/v1/chat/completions` requests.
    """
    catalog = []
    # "gpt-4" is an alias for better Cursor compatibility.
    for model_id in ("qwen2.5-coder-7b", "gpt-4"):
        catalog.append({
            "id": model_id,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "jeeltcraft"
        })
    return {"object": "list", "data": catalog}
# ============== Direct LLM Endpoints ==============
@app.post(
    "/llm_on_cpu",
    tags=["LLM"],
    summary="Direct LLM inference",
    response_description="Returns the model's raw response"
)
async def stream(item: Validation):
    """
    Direct inference endpoint for simple prompts.

    This endpoint provides direct access to the LLM without the OpenAI wrapper.
    Useful for custom prompt formatting.

    - **prompt**: Your input text prompt
    """
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
    # Use the ChatML template (<|im_start|>/<|im_end|>), consistent with
    # /v1/chat/completions: Qwen2.5-Coder-Instruct is trained on ChatML,
    # whereas the previous zephyr-style <|user|>/</s> tags do not match its
    # chat template and can degrade output quality.
    prompt = (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{item.prompt.strip()}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    return {"response": call_llm(prompt)}
# ============== Utility Endpoints ==============
@app.post(
"/convert_eth_units",
tags=["Utilities"],
summary="Convert ETH units (ETH ↔ Gwei ↔ Wei)",
response_description="Returns conversions to all ETH units"
)
async def convert_units(request: EthConversionRequest):
"""
Convert between Ethereum units: ETH, Gwei, and Wei.
## Ethereum Units Explained
- **ETH**: The base unit (1 ETH = 1,000,000,000,000,000,000 wei)
- **Gwei**: Gigawei, commonly used for gas prices (1 Gwei = 1,000,000,000 wei)
- **Wei**: The smallest unit of Ether (1 wei = 0.000000000000000001 ETH)
## Parameters
- **value**: The numeric value to convert
- **from_unit**: Source unit - `eth`, `gwei`, or `wei` (default: `eth`)
## Example Requests
Convert 1 ETH to all units:
```json
{
"value": 1,
"from_unit": "eth"
}
```
Convert 50 Gwei to all units:
```json
{
"value": 50,
"from_unit": "gwei"
}
```
## Returns
Conversions to Wei, Gwei, and ETH with both numeric and formatted values.
"""
try:
result = convert_eth_units(req