Spaces:

hugh007
/

openwolf-text

Sleeping

App Files Files Community

openwolf-text / app.py

hugh007

fix: upgrade to Q6_K (600MB) better quality

82ad52b verified 21 days ago

raw

history blame contribute delete

3.67 kB

	"""
	OpenWolf 文本 Space — llama-cpp-python（GGUF / MiniCPM-V-4.6-Thinking）
	"""
	import os, time, threading, uuid
	from fastapi import FastAPI, Request
	from fastapi.responses import JSONResponse

	# FastAPI application object; the route decorators below register on it.
	app = FastAPI(title="OpenWolf Text")

	# Flipped to True by _load_model() once the Llama instance is constructed.
	_ready = False
	# The loaded model instance, or None until _load_model() succeeds.
	_llm = None
	# Held around every create_chat_completion call so only one generation
	# runs against _llm at a time.
	_llm_lock = threading.Lock()
	# Background task registry: task_id -> {"status": ..., "result": ...}.
	# Entries are never removed by the code in this file.
	_tasks = {}
	# Filesystem locations handed to the Llama constructor in _load_model().
	MODEL_PATH = "/app/models/MiniCPM-V-4_6-Thinking-Q6_K.gguf"
	MMPROJ_PATH = "/app/models/mmproj-model-f16.gguf"


	@app.on_event("startup")
	async def startup():
	    """Start loading the model in a background daemon thread.

	    The load runs off the startup path so the server can begin answering
	    requests (e.g. ``/health``) while the model is still being read.
	    """
	    loader = threading.Thread(target=_load_model, daemon=True)
	    loader.start()


	def _load_model():
	    """Construct the Llama instance and publish it through module globals.

	    On success ``_llm`` holds the model and ``_ready`` becomes True. If the
	    model file is missing, or construction raises, a message is printed and
	    both globals keep their previous values.
	    """
	    global _llm, _ready

	    model_missing = not os.path.exists(MODEL_PATH)
	    if model_missing:
	        print(f"[models] 模型文件不存在: {MODEL_PATH}")
	        return

	    print("[models] 加载模型...")
	    started = time.time()
	    # Imported here rather than at module top, so the import only happens
	    # once the model file is known to exist.
	    from llama_cpp import Llama

	    # NOTE(review): confirm the installed llama-cpp-python actually consumes
	    # the `mmproj` keyword; MMPROJ_PATH is not checked for existence here.
	    llama_kwargs = dict(
	        model_path=MODEL_PATH,
	        mmproj=MMPROJ_PATH,
	        n_ctx=2048,
	        n_threads=2,
	        n_gpu_layers=0,
	        verbose=False,
	    )
	    try:
	        _llm = Llama(**llama_kwargs)
	        _ready = True
	        elapsed = time.time() - started
	        print(f"[models] 加载完成 ({elapsed:.1f}s)")
	    except Exception as err:
	        print(f"[models] 加载失败: {err}")


	@app.get("/health")
	async def health():
	    """Report liveness plus whether the model has finished loading."""
	    payload = {"status": "ok"}
	    payload["ready"] = _ready
	    return payload


	@app.post("/v1/chat/completions")
	async def chat_completions(request: Request):
	    """Run one chat completion and return it in a minimal OpenAI-like shape.

	    Request JSON (object):
	        messages:    list passed straight to ``create_chat_completion``
	                     (default ``[]``).
	        max_tokens:  integer-convertible value (default 512).
	        temperature: float-convertible value (default 0.3).

	    Returns:
	        ``{"choices": [{"message": {"content": <stripped text>}}]}`` on
	        success; a 503 JSON error while the model is not loaded; a 400 JSON
	        error when the body is not a JSON object or the numeric fields
	        cannot be converted.
	    """
	    # Local import keeps this handler self-contained.
	    import asyncio

	    if not _ready or _llm is None:
	        return JSONResponse({"error": "模型加载中"}, status_code=503)

	    try:
	        body = await request.json()
	    except ValueError:
	        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
	        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
	    if not isinstance(body, dict):
	        return JSONResponse(
	            {"error": "request body must be a JSON object"}, status_code=400
	        )

	    messages = body.get("messages", [])
	    try:
	        max_tokens = int(body.get("max_tokens", 512))
	        temp = float(body.get("temperature", 0.3))
	    except (TypeError, ValueError):
	        return JSONResponse(
	            {"error": "max_tokens and temperature must be numeric"},
	            status_code=400,
	        )

	    def _generate():
	        # Both the lock acquisition and the generation block, so they must
	        # run in a worker thread rather than on the event loop.
	        with _llm_lock:
	            return _llm.create_chat_completion(
	                messages=messages, max_tokens=max_tokens, temperature=temp
	            )

	    loop = asyncio.get_running_loop()
	    out = await loop.run_in_executor(None, _generate)

	    # Guard against a missing/None content field before stripping.
	    content = out["choices"][0]["message"]["content"] or ""
	    return {"choices": [{"message": {"content": content.strip()}}]}


	@app.post("/task/start")
	async def task_start(request: Request):
	    """Register a new background task and start its worker thread.

	    The parsed JSON body is handed unchanged to ``_do_task``. The response
	    carries the generated 12-character hex task id for later polling.
	    """
	    payload = await request.json()
	    new_id = uuid.uuid4().hex[:12]
	    _tasks[new_id] = dict(status="running", result=None)
	    worker = threading.Thread(
	        target=_do_task, args=(new_id, payload), daemon=True
	    )
	    worker.start()
	    return {"ok": True, "task_id": new_id}


	@app.get("/task/check/{task_id}")
	async def task_check(task_id: str):
	    """Return the current state of a background task.

	    Unknown ids yield ``{"status": "error", "result": "not found"}``; a
	    running task yields only its status; a finished task yields its status
	    together with the stored result.
	    """
	    entry = _tasks.get(task_id)
	    if not entry:
	        return {"status": "error", "result": "not found"}

	    state = entry["status"]
	    if state == "running":
	        return {"status": "running"}
	    return {"status": state, "result": entry["result"]}


	@app.post("/analyze-doc/start")
	@app.post("/analyze-text/start")
	async def analyze_start(request: Request):
	    """Start a background task; alias routes for ``/task/start``.

	    Delegates to ``task_start``, which parses the JSON body, registers the
	    task as running, starts the worker thread and returns
	    ``{"ok": True, "task_id": ...}``.
	    """
	    return await task_start(request)


	@app.get("/analyze-doc/check/{task_id}")
	@app.get("/analyze-text/check/{task_id}")
	async def analyze_check(task_id: str):
	    """Poll a background task; alias routes for ``/task/check/{task_id}``.

	    Delegates to ``task_check``, so unknown, running and finished tasks are
	    reported in exactly the same shape.
	    """
	    return await task_check(task_id)


	def _do_task(task_id, body):
	    """Run one background generation and store the outcome in ``_tasks``.

	    Args:
	        task_id: Key under which the outcome is written to ``_tasks``.
	        body: Parsed JSON request body. The prompt is read from the first
	            of the keys ``"task"``, ``"text"``, ``"question"`` that is
	            present (empty string when none are).

	    The function never raises: every outcome, including malformed input,
	    is written as ``{"status": "done" | "error", "result": ...}`` so a
	    polling client always sees the task leave the ``"running"`` state.
	    """
	    if _llm is None:
	        _tasks[task_id] = {"status": "error", "result": "模型未就绪"}
	        return

	    # A JSON body may legally be a list, string or number; without this
	    # check the .get() below would raise outside any handler and the task
	    # would stay "running" forever.
	    if not isinstance(body, dict):
	        _tasks[task_id] = {
	            "status": "error",
	            "result": "request body must be a JSON object",
	        }
	        return

	    try:
	        text = body.get("task", body.get("text", body.get("question", "")))
	        with _llm_lock:
	            out = _llm.create_chat_completion(
	                messages=[{"role": "user", "content": text}],
	                max_tokens=2048,
	                temperature=0.3,
	            )
	        # Normalise a missing/None content field to an empty string.
	        content = out["choices"][0]["message"]["content"] or ""
	        _tasks[task_id] = {"status": "done", "result": content}
	    except Exception as e:
	        # Top-level boundary of the worker thread: record the failure so
	        # the client can observe it instead of polling indefinitely.
	        _tasks[task_id] = {"status": "error", "result": str(e)}