Luminous / main.py
jeeltcraft's picture
Update main.py
5e7aadc verified
raw
history blame
12.7 kB
import os
import re
import time
import uuid
from decimal import Decimal
from typing import List, Optional

from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from pydantic import BaseModel
# ============== Pydantic Models ==============
class Validation(BaseModel):
    """Request body for the direct `/llm_on_cpu` inference endpoint."""
    prompt: str  # raw text prompt forwarded to the model
class EthConversionRequest(BaseModel):
    """Request body for the `/convert_eth_units` endpoint."""
    value: float  # numeric amount to convert
    from_unit: str = "eth" # eth, gwei, or wei
# OpenAI-compatible models
class Message(BaseModel):
    """A single chat message in OpenAI format."""
    role: str  # "system", "user", or "assistant"
    content: str  # message text
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible request body for `/v1/chat/completions`."""
    model: str  # model identifier (e.g. "qwen2.5-coder-7b" or the "gpt-4" alias)
    messages: List[Message]  # conversation messages in order
    temperature: Optional[float] = 0.7  # sampling temperature
    max_tokens: Optional[int] = 1024  # cap on generated tokens
    stream: Optional[bool] = False  # accepted for compatibility; streaming not yet implemented
class Choice(BaseModel):
    """One generated completion inside a ChatCompletionResponse."""
    index: int  # position of this choice in the response list
    message: Message  # the assistant's generated message
    finish_reason: str  # why generation stopped (this API always reports "stop")
class Usage(BaseModel):
    """Token accounting for a completion (word-count estimates in this API)."""
    prompt_tokens: int  # estimated tokens in the prompt
    completion_tokens: int  # estimated tokens in the generated text
    total_tokens: int  # prompt_tokens + completion_tokens
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible response envelope for `/v1/chat/completions`."""
    id: str  # e.g. "chatcmpl-<8 hex chars>"
    object: str = "chat.completion"  # fixed OpenAI object type
    created: int  # Unix timestamp of creation
    model: str  # echoes the requested model id
    choices: List[Choice]  # generated completions (this API returns exactly one)
    usage: Usage  # token usage estimates
# ============== FastAPI App ==============
# FastAPI application object. The markdown in `description` is rendered
# verbatim in the Swagger UI, so its content is runtime behavior.
app = FastAPI(
    title="Luminous API",
    description="""
## Luminous Coding Assistant API
OpenAI-compatible API powered by Qwen2.5-Coder-7B for code generation and assistance.
### Features
* πŸ€– AI-powered code generation with Qwen2.5-Coder (GGUF quantized)
* πŸ”Œ OpenAI-compatible endpoints for Cursor IDE integration
* πŸ’° ETH unit conversion utilities (Wei ↔ Gwei ↔ ETH)
* πŸ’» Optimized for coding tasks and assistance
* ⚑ Fast inference with llama.cpp
### Integration with Cursor IDE
1. Go to Cursor Settings β†’ Models β†’ Override OpenAI Base URL
2. Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
3. Model name: `qwen2.5-coder-7b` or `gpt-4`
4. Add any dummy API key
""",
    version="1.0.0",
    contact={
        "name": "Jeeltcraft",
        "url": "https://huggingface.co/jeeltcraft",
    },
    license_info={
        "name": "MIT",
    },
    # Tag metadata groups endpoints in the generated OpenAPI docs.
    openapi_tags=[
        {
            "name": "OpenAI Compatible",
            "description": "Endpoints compatible with OpenAI API for Cursor integration",
        },
        {
            "name": "LLM",
            "description": "Direct language model inference endpoints",
        },
        {
            "name": "Utilities",
            "description": "ETH conversion and helper functions",
        },
    ],
    # Cosmetic settings for the Swagger UI renderer.
    swagger_ui_parameters={
        "deepLinking": True,
        "displayRequestDuration": True,
        "docExpansion": "none",
        "syntaxHighlight.theme": "monokai",
        "defaultModelsExpandDepth": 2,
    }
)
# Global variable to hold the model.
# Lazily initialized Llama instance; populated on first call to get_llm().
_llm_model = None
# ============== LLM Functions ==============
def get_llm():
    """
    Return the process-wide GGUF model, loading it on first use.

    Model files are preloaded by Hugging Face Spaces during build time;
    subsequent calls return the cached Llama instance.
    """
    global _llm_model
    # Fast path: model already loaded.
    if _llm_model is not None:
        return _llm_model
    try:
        from huggingface_hub import hf_hub_download

        # Resolve the on-disk path of the preloaded GGUF weights.
        weights_path = hf_hub_download(
            repo_id="CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf",
            cache_dir="/root/.cache/huggingface/hub",
        )
        print(f"πŸ“¦ Loading model from: {weights_path}")
        _llm_model = Llama(
            model_path=weights_path,
            n_ctx=2048,       # context window
            n_threads=4,      # CPU threads to use
            n_gpu_layers=0,   # 0 for CPU only
            verbose=False,    # reduce logging
            seed=42,          # for reproducibility
        )
        print("βœ“ Model loaded successfully with llama.cpp")
    except Exception as e:
        print(f"βœ— Error loading model: {e}")
        raise
    return _llm_model
def call_llm(prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
    """
    Run a single completion through the shared GGUF model via llama.cpp.

    Returns the generated text, or a best-effort error string (never raises).
    """
    try:
        model = get_llm()
        result = model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            repeat_penalty=1.1,
            # Cut generation at end-of-turn markers from either template style.
            stop=["</s>", "<|user|>", "<|im_end|>", "<|im_start|>"],
            echo=False,  # exclude the prompt from the output
        )
        # llama.cpp returns an OpenAI-style dict; pull out the generated text.
        return result['choices'][0]['text'].strip()
    except Exception as e:
        # Deliberate best-effort contract: report failure as text, don't raise.
        return f"Error during inference: {str(e)}"
# ============== Helper Functions ==============
def convert_eth_units(value: float, from_unit: str = "eth") -> dict:
    """
    Convert an amount between Ethereum units (wei, gwei, ETH).

    Args:
        value: The numeric value to convert.
        from_unit: The source unit ('eth', 'gwei', or 'wei'), case-insensitive.

    Returns:
        Dictionary with the original input, conversions to all units, and
        human-readable formatted strings. Wei is returned as a string to
        avoid JavaScript number overflow in JSON clients.

    Raises:
        ValueError: If from_unit is not 'eth', 'gwei', or 'wei'.
    """
    # Wei per one unit of each supported denomination.
    wei_per_unit = {"eth": 10**18, "gwei": 10**9, "wei": 1}
    unit = from_unit.lower()
    if unit not in wei_per_unit:
        raise ValueError("Invalid unit. Use 'eth', 'gwei', or 'wei'")
    # Decimal(str(value)) avoids binary-float rounding: int(0.1 * 10**18)
    # yields 100000000000000008, while Decimal("0.1") * 10**18 is exact.
    # int() truncates toward zero, matching the previous behavior for
    # sub-wei fractions.
    wei_value = int(Decimal(str(value)) * wei_per_unit[unit])
    # Derive the other denominations from the canonical wei amount.
    eth_value = wei_value / 10**18
    gwei_value = wei_value / 10**9
    return {
        "input": {
            "value": value,
            "unit": from_unit
        },
        "conversions": {
            "wei": str(wei_value),  # string to avoid JavaScript number overflow
            "gwei": gwei_value,
            "eth": eth_value
        },
        "formatted": {
            "wei": f"{wei_value:,} wei",
            "gwei": f"{gwei_value:,.2f} gwei",
            "eth": f"{eth_value:.18f} ETH"
        }
    }
# ============== Startup Event ==============
@app.on_event("startup")
async def startup_event():
"""
Pre-load the model during startup to avoid timeout on first request.
"""
print("πŸš€ Starting up Luminous API...")
try:
get_llm() # This will load the model
print("βœ… Model loaded and ready!")
except Exception as e:
print(f"⚠️ Warning: Could not pre-load model: {e}")
print("Model will be loaded on first request.")
# ============== OpenAI-Compatible Endpoints ==============
@app.post(
    "/v1/chat/completions",
    response_model=ChatCompletionResponse,
    tags=["OpenAI Compatible"],
    summary="Create chat completion",
    response_description="Returns the model's response to the conversation"
)
async def chat_completions(request: ChatCompletionRequest):
    """
    Create a chat completion using OpenAI-compatible format.

    This endpoint is designed for integration with Cursor IDE and other
    OpenAI-compatible clients. It accepts a conversation history and returns
    the model's response.

    ## Parameters
    - **model**: Model identifier (use `qwen2.5-coder-7b` or `gpt-4` for this API)
    - **messages**: Array of conversation messages with role and content
        - Role can be: `system`, `user`, or `assistant`
    - **temperature**: Controls randomness (0.0 = deterministic, 2.0 = very random)
    - **max_tokens**: Maximum number of tokens to generate in the response
    - **stream**: Whether to stream the response (not yet implemented)

    ## Example Request
    ```json
    {
        "model": "qwen2.5-coder-7b",
        "messages": [
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": "Write a Python function to reverse a string"}
        ],
        "temperature": 0.7,
        "max_tokens": 512
    }
    ```

    ## Returns
    A chat completion response with the model's generated text, token usage,
    and other metadata.
    """
    try:
        # Build a ChatML prompt from the FULL conversation so the model sees
        # prior turns. (The previous implementation dropped everything except
        # the last user message and the first system message, losing context
        # in multi-turn conversations.)
        parts = []
        if not any(msg.role == "system" for msg in request.messages):
            # Supply a default system turn when the client omits one.
            parts.append(
                "<|im_start|>system\nYou are a helpful coding assistant.<|im_end|>\n"
            )
        for msg in request.messages:
            parts.append(f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n")
        parts.append("<|im_start|>assistant\n")
        formatted_prompt = "".join(parts)

        # NOTE(review): the model is loaded with n_ctx=2048; very long
        # histories may overflow the context window — consider truncating
        # old turns if that shows up in practice.
        response_text = call_llm(
            formatted_prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature
        )

        # Rough word-based token estimates (no tokenizer round-trip).
        prompt_tokens = sum(len(msg.content.split()) for msg in request.messages)
        completion_tokens = len(response_text.split())

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                Choice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@app.get(
    "/v1/models",
    tags=["OpenAI Compatible"],
    summary="List available models",
    response_description="Returns a list of available models"
)
async def list_models():
    """
    List all available models in OpenAI-compatible format.

    Use one of the returned model IDs in the `model` field of
    `/v1/chat/completions` requests.
    """
    catalog = []
    # "gpt-4" is an alias for better Cursor compatibility.
    for model_id in ("qwen2.5-coder-7b", "gpt-4"):
        catalog.append({
            "id": model_id,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "jeeltcraft"
        })
    return {"object": "list", "data": catalog}
# ============== Direct LLM Endpoints ==============
@app.post(
    "/llm_on_cpu",
    tags=["LLM"],
    summary="Direct LLM inference",
    response_description="Returns the model's raw response"
)
async def stream(item: Validation):
    """
    Direct inference endpoint for simple prompts.

    This endpoint provides direct access to the LLM without the OpenAI wrapper.
    Useful for custom prompt formatting.

    - **prompt**: Your input text prompt
    """
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
    # Use the ChatML template (<|im_start|>/<|im_end|>), consistent with
    # /v1/chat/completions: Qwen2.5-Coder-Instruct is trained on ChatML,
    # whereas the previous zephyr-style <|user|>/</s> tags do not match its
    # chat template and can degrade output quality.
    prompt = (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{item.prompt.strip()}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    return {"response": call_llm(prompt)}
# ============== Utility Endpoints ==============
@app.post(
"/convert_eth_units",
tags=["Utilities"],
summary="Convert ETH units (ETH ↔ Gwei ↔ Wei)",
response_description="Returns conversions to all ETH units"
)
async def convert_units(request: EthConversionRequest):
"""
Convert between Ethereum units: ETH, Gwei, and Wei.
## Ethereum Units Explained
- **ETH**: The base unit (1 ETH = 1,000,000,000,000,000,000 wei)
- **Gwei**: Gigawei, commonly used for gas prices (1 Gwei = 1,000,000,000 wei)
- **Wei**: The smallest unit of Ether (1 wei = 0.000000000000000001 ETH)
## Parameters
- **value**: The numeric value to convert
- **from_unit**: Source unit - `eth`, `gwei`, or `wei` (default: `eth`)
## Example Requests
Convert 1 ETH to all units:
```json
{
"value": 1,
"from_unit": "eth"
}
```
Convert 50 Gwei to all units:
```json
{
"value": 50,
"from_unit": "gwei"
}
```
## Returns
Conversions to Wei, Gwei, and ETH with both numeric and formatted values.
"""
try:
result = convert_eth_units(req