Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| from llama_cpp import Llama | |
| import os | |
| import uuid | |
| import time | |
| import re | |
| # ============== Pydantic Models ============== | |
class Validation(BaseModel):
    """Request body for the direct inference endpoint (see `stream`)."""
    # Raw text prompt forwarded to the model after template wrapping
    prompt: str
class EthConversionRequest(BaseModel):
    """Request body for ETH unit conversion (see `convert_eth_units`)."""
    # Numeric amount to convert
    value: float
    # Source unit: accepted values are "eth", "gwei", or "wei" (case-insensitive)
    from_unit: str = "eth"  # eth, gwei, or wei
| # OpenAI-compatible models | |
class Message(BaseModel):
    """One chat turn in OpenAI format."""
    # "system", "user", or "assistant"
    role: str
    # The turn's text content
    content: str
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible chat completion request body."""
    # Model identifier; echoed back in the response
    model: str
    # Full conversation history, oldest first
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024
    # Accepted for compatibility; streaming is not implemented by this API
    stream: Optional[bool] = False
class Choice(BaseModel):
    """A single completion choice in an OpenAI-style response."""
    index: int
    # The assistant's generated message
    message: Message
    # e.g. "stop"
    finish_reason: str
class Usage(BaseModel):
    """Token accounting for a completion (word-count estimates here, not
    true tokenizer counts — see `chat_completions`)."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible chat completion response envelope."""
    # "chatcmpl-" + short uuid, assigned per response
    id: str
    object: str = "chat.completion"
    # Unix timestamp (seconds)
    created: int
    # Echo of the requested model name
    model: str
    choices: List[Choice]
    usage: Usage
# ============== FastAPI App ==============
# Application instance with OpenAPI metadata; the description is rendered in
# the Swagger UI. (NOTE(review): the emoji-like characters below appear
# mojibake-encoded in this copy of the file — confirm against the deployed
# source before changing them.)
app = FastAPI(
    title="Luminous API",
    description="""
## Luminous Coding Assistant API
OpenAI-compatible API powered by Qwen2.5-Coder-7B for code generation and assistance.
### Features
* π€ AI-powered code generation with Qwen2.5-Coder (GGUF quantized)
* π OpenAI-compatible endpoints for Cursor IDE integration
* π° ETH unit conversion utilities (Wei β Gwei β ETH)
* π» Optimized for coding tasks and assistance
* β‘ Fast inference with llama.cpp
### Integration with Cursor IDE
1. Go to Cursor Settings β Models β Override OpenAI Base URL
2. Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
3. Model name: `qwen2.5-coder-7b` or `gpt-4`
4. Add any dummy API key
""",
    version="1.0.0",
    contact={
        "name": "Jeeltcraft",
        "url": "https://huggingface.co/jeeltcraft",
    },
    license_info={
        "name": "MIT",
    },
    # Tag groups shown in the generated docs UI
    openapi_tags=[
        {
            "name": "OpenAI Compatible",
            "description": "Endpoints compatible with OpenAI API for Cursor integration",
        },
        {
            "name": "LLM",
            "description": "Direct language model inference endpoints",
        },
        {
            "name": "Utilities",
            "description": "ETH conversion and helper functions",
        },
    ],
    # Swagger UI tweaks (collapsed docs, request timing, monokai highlighting)
    swagger_ui_parameters={
        "deepLinking": True,
        "displayRequestDuration": True,
        "docExpansion": "none",
        "syntaxHighlight.theme": "monokai",
        "defaultModelsExpandDepth": 2,
    }
)

# Global variable to hold the model
# Lazily initialized by get_llm(); holds the llama.cpp `Llama` instance once
# loaded, so the multi-GB model is only read from disk once per process.
_llm_model = None
| # ============== LLM Functions ============== | |
def get_llm():
    """
    Return the process-wide llama.cpp model, loading it on first use.

    The GGUF weights are resolved through ``hf_hub_download`` (which reuses
    the Hugging Face cache preloaded at Space build time), then wrapped in a
    ``Llama`` instance. Subsequent calls return the cached object.

    Raises:
        Exception: re-raised from download or model construction failures.
    """
    global _llm_model
    # Fast path: model already resident in this process.
    if _llm_model is not None:
        return _llm_model

    try:
        from huggingface_hub import hf_hub_download

        # Locate (or fetch) the quantized model file in the HF cache.
        gguf_path = hf_hub_download(
            repo_id="CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf",
            cache_dir="/root/.cache/huggingface/hub",
        )
        print(f"π¦ Loading model from: {gguf_path}")
        _llm_model = Llama(
            model_path=gguf_path,
            n_ctx=2048,       # context window
            n_threads=4,      # CPU threads to use
            n_gpu_layers=0,   # CPU-only inference
            verbose=False,    # reduce llama.cpp logging
            seed=42,          # reproducible sampling
        )
        print("β Model loaded successfully with llama.cpp")
    except Exception as exc:
        print(f"β Error loading model: {exc}")
        raise
    return _llm_model
def call_llm(prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
    """
    Run one completion through the shared llama.cpp model.

    Args:
        prompt: Fully formatted prompt text (templating is the caller's job).
        max_tokens: Generation cap.
        temperature: Sampling temperature.

    Returns:
        The stripped completion text, or an ``"Error during inference: ..."``
        string if loading or generation fails (callers rely on this
        best-effort contract — no exception escapes).
    """
    try:
        model = get_llm()
        result = model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            repeat_penalty=1.1,
            # Cut generation at end-of-sequence / ChatML boundary markers.
            stop=["</s>", "<|user|>", "<|im_end|>", "<|im_start|>"],
            echo=False,  # return only the completion, not the prompt
        )
        text = result["choices"][0]["text"]
        return text.strip()
    except Exception as exc:
        return f"Error during inference: {str(exc)}"
| # ============== Helper Functions ============== | |
| def convert_eth_units(value: float, from_unit: str = "eth") -> dict: | |
| """ | |
| Convert ETH value to wei and gwei. | |
| Args: | |
| value: The numeric value to convert | |
| from_unit: The source unit ('eth', 'gwei', or 'wei') | |
| Returns: | |
| Dictionary with conversions to all units | |
| """ | |
| # Convert input to wei first | |
| if from_unit.lower() == "eth": | |
| wei_value = int(value * 10**18) | |
| elif from_unit.lower() == "gwei": | |
| wei_value = int(value * 10**9) | |
| elif from_unit.lower() == "wei": | |
| wei_value = int(value) | |
| else: | |
| raise ValueError("Invalid unit. Use 'eth', 'gwei', or 'wei'") | |
| # Convert wei to all units | |
| eth_value = wei_value / 10**18 | |
| gwei_value = wei_value / 10**9 | |
| return { | |
| "input": { | |
| "value": value, | |
| "unit": from_unit | |
| }, | |
| "conversions": { | |
| "wei": str(wei_value), # String to avoid JavaScript number overflow | |
| "gwei": gwei_value, | |
| "eth": eth_value | |
| }, | |
| "formatted": { | |
| "wei": f"{wei_value:,} wei", | |
| "gwei": f"{gwei_value:,.2f} gwei", | |
| "eth": f"{eth_value:.18f} ETH" | |
| } | |
| } | |
| # ============== Startup Event ============== | |
async def startup_event():
    """
    Warm the model cache at startup so the first request does not time out.

    Best-effort: a load failure is logged and deferred to the first request
    instead of crashing the app.

    NOTE(review): no @app.on_event("startup") registration is visible in this
    copy of the file — confirm this handler is actually wired to the app.
    """
    print("π Starting up Luminous API...")
    try:
        get_llm()  # triggers the lazy model load
    except Exception as exc:
        print(f"β οΈ Warning: Could not pre-load model: {exc}")
        print("Model will be loaded on first request.")
    else:
        print("β Model loaded and ready!")
| # ============== OpenAI-Compatible Endpoints ============== | |
async def chat_completions(request: ChatCompletionRequest):
    """
    Create a chat completion using OpenAI-compatible format.

    Designed for integration with Cursor IDE and other OpenAI-compatible
    clients: accepts a conversation history and returns the model's reply.

    ## Parameters
    - **model**: Model identifier (use `qwen2.5-coder-7b` or `gpt-4` for this API)
    - **messages**: Array of conversation messages with role and content
      - Role can be: `system`, `user`, or `assistant`
    - **temperature**: Controls randomness (0.0 = deterministic, 2.0 = very random)
    - **max_tokens**: Maximum number of tokens to generate in the response
    - **stream**: Whether to stream the response (not yet implemented)

    ## Returns
    A chat completion response with the model's generated text, token usage
    (word-count estimates, not true tokenizer counts), and metadata.

    ## Raises
    HTTPException(500) on any inference or formatting error.
    """
    try:
        # First system message wins; default to a generic coding persona.
        system_message = next(
            (msg.content for msg in request.messages if msg.role == "system"),
            "You are a helpful coding assistant."
        )
        # Fix: build the ChatML prompt from the ENTIRE conversation history.
        # The previous implementation sent only the last user message, silently
        # discarding the multi-turn context this endpoint's contract promises.
        parts = [f"<|im_start|>system\n{system_message}<|im_end|>\n"]
        for msg in request.messages:
            if msg.role in ("user", "assistant"):
                parts.append(f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n")
        parts.append("<|im_start|>assistant\n")
        formatted_prompt = "".join(parts)

        # Call LLM (best-effort helper; returns an error string on failure)
        response_text = call_llm(
            formatted_prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature
        )

        # Simple token counting (word-based estimation over all messages)
        prompt_tokens = sum(len(msg.content.split()) for msg in request.messages)
        completion_tokens = len(response_text.split())

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
async def list_models():
    """
    List all available models in OpenAI-compatible format.

    Returns the model IDs accepted by `/v1/chat/completions`; `gpt-4` is an
    alias kept for clients (such as Cursor) that hardcode OpenAI model names.
    """
    created_at = int(time.time())
    model_ids = ("qwen2.5-coder-7b", "gpt-4")
    return {
        "object": "list",
        "data": [
            {
                "id": model_id,
                "object": "model",
                "created": created_at,
                "owned_by": "jeeltcraft",
            }
            for model_id in model_ids
        ],
    }
| # ============== Direct LLM Endpoints ============== | |
async def stream(item: Validation):
    """
    Direct inference endpoint for simple prompts.

    Bypasses the OpenAI wrapper: the prompt is wrapped in a fixed
    instruction template with `</s>` separators and `<|user|>` /
    `<|assistant|>` markers, then sent straight to the model.

    - **prompt**: Your input text prompt
    """
    instruction = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
    sep = "</s>"
    # Assemble: instruction, user turn, then an open assistant turn.
    full_prompt = (
        f"{instruction}{sep}\n"
        f"<|user|>\n{item.prompt.strip()}{sep}\n"
        f"<|assistant|>\n"
    )
    return {"response": call_llm(full_prompt)}
| # ============== Utility Endpoints ============== | |
| async def convert_units(request: EthConversionRequest): | |
| """ | |
| Convert between Ethereum units: ETH, Gwei, and Wei. | |
| ## Ethereum Units Explained | |
| - **ETH**: The base unit (1 ETH = 1,000,000,000,000,000,000 wei) | |
| - **Gwei**: Gigawei, commonly used for gas prices (1 Gwei = 1,000,000,000 wei) | |
| - **Wei**: The smallest unit of Ether (1 wei = 0.000000000000000001 ETH) | |
| ## Parameters | |
| - **value**: The numeric value to convert | |
| - **from_unit**: Source unit - `eth`, `gwei`, or `wei` (default: `eth`) | |
| ## Example Requests | |
| Convert 1 ETH to all units: | |
| ```json | |
| { | |
| "value": 1, | |
| "from_unit": "eth" | |
| } | |
| ``` | |
| Convert 50 Gwei to all units: | |
| ```json | |
| { | |
| "value": 50, | |
| "from_unit": "gwei" | |
| } | |
| ``` | |
| ## Returns | |
| Conversions to Wei, Gwei, and ETH with both numeric and formatted values. | |
| """ | |
| try: | |
| result = convert_eth_units(req | |