Spaces: Sleeping
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the GGUF-quantized Qwen3 0.6B model from the Hugging Face Hub.
# NOTE(review): from_pretrained runs at import time, so application startup
# blocks until the model file is downloaded/loaded — confirm this is
# acceptable for the deployment target.
qwen3_gguf_llm = Llama.from_pretrained(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-UD-Q8_K_XL.gguf"
)
class PromptRequest(BaseModel):
    """Request body for the generation endpoint: a single user prompt string."""
    prompt: str
class GenerateResponse(BaseModel):
    """Response body for the generation endpoint: the model's reply text."""
    generated_text: str
# Simple in-memory conversation memory: a list of chat messages in the
# {"role": ..., "content": ...} shape that create_chat_completion consumes.
# NOTE(review): module-level mutable state — it is shared by ALL clients and
# grows without bound across requests; confirm single-user usage is intended.
conversation_history = []
# FIX(review): the original function was never registered with the app — no
# @app.post decorator appears anywhere in the file, so the endpoint was
# unreachable dead code. Route path "/generate" is inferred; confirm the
# intended path with the API consumers.
@app.post("/generate", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    """Run one chat turn against the Qwen3 GGUF model.

    Appends the user's prompt to the shared conversation history, asks the
    model for a completion over the full history, records the assistant's
    reply in the history, and returns it.

    Args:
        request: the parsed JSON body containing the user's prompt.

    Returns:
        GenerateResponse with the assistant's reply text.
    """
    conversation_history.append({"role": "user", "content": request.prompt})
    try:
        # NOTE(review): create_chat_completion is a synchronous, blocking call;
        # inside an async endpoint it stalls the event loop for the entire
        # generation. Consider run_in_executor / a sync (def) endpoint.
        response = qwen3_gguf_llm.create_chat_completion(messages=conversation_history)
    except Exception:
        # FIX: roll back the user turn on failure so the shared history is not
        # left with an unanswered message that corrupts every later request.
        conversation_history.pop()
        raise
    # llama-cpp-python returns an OpenAI-style dict; take the first choice.
    assistant_message = response['choices'][0]['message']['content']
    conversation_history.append({"role": "assistant", "content": assistant_message})
    return GenerateResponse(generated_text=assistant_message)