import gradio as gr import subprocess import time import requests from openai import OpenAI from huggingface_hub import login, snapshot_download import os import stat import tarfile import io TITLE = "Zero-shot Anime Knowledge Optimizer" DESCRIPTION = """ """ hf_token = os.getenv("HF_TOKEN") if hf_token: login(token=hf_token) else: raise ValueError("environment variable HF_TOKEN not found.") repo_id = "Johnny-Z/ZAKO-0.6B" repo_dir = snapshot_download(repo_id, repo_type='dataset') tar_path = os.path.join(repo_dir, "llama-b7972-bin-ubuntu-x64.tar.gz") current_dir = os.path.dirname(os.path.abspath(__file__)) with tarfile.open(tar_path, mode="r:gz") as tar: try: tar.extractall(path=current_dir, filter="data") except TypeError: tar.extractall(path=current_dir) def _find_llama_server(base_dir: str) -> str: for root, _, files in os.walk(base_dir): if "llama-server" in files: return os.path.join(root, "llama-server") raise FileNotFoundError(f"未找到 llama-server,可执行文件不在 {base_dir} 及其子目录中") def get_predicted_tokens_seconds() -> str: try: resp = requests.get("http://localhost:8188/metrics", timeout=2) resp.raise_for_status() for line in resp.text.splitlines(): if line.startswith("llamacpp:predicted_tokens_seconds"): parts = line.split() if len(parts) >= 2: return parts[-1] return "N/A" except requests.RequestException: return "N/A" PATH_TO_SERVER_BINARY = _find_llama_server(current_dir) PATH_TO_MODEL = os.path.join(repo_dir, "ZAKO-0.6B-Q4KM.gguf") st = os.stat(PATH_TO_SERVER_BINARY) os.chmod(PATH_TO_SERVER_BINARY, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def wait_for_server(url: str, timeout_s: int = 180, interval_s: float = 0.5, process: subprocess.Popen | None = None) -> None: start = time.time() while time.time() - start < timeout_s: if process and process.poll() is not None: stderr = process.stderr.read().decode("utf-8", errors="ignore") if process.stderr else "" raise RuntimeError(f"本地推理引擎启动失败,退出码={process.returncode}\n{stderr}") try: resp = requests.get(url, timeout=2) if resp.status_code == 200: return except requests.RequestException: pass time.sleep(interval_s) raise TimeoutError("本地推理引擎启动超时") server_process = subprocess.Popen( [ PATH_TO_SERVER_BINARY, "-m", PATH_TO_MODEL, "--ctx-size", "1280", "--port", "8188", "--metrics" ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE ) print("正在启动本地推理引擎...") wait_for_server("http://localhost:8188/health", process=server_process) client = OpenAI( base_url="http://localhost:8188/v1", api_key="sk-no-key-required" ) def chat(question, tags, preference_level): prompt = f""" # Role Act as an image prompt writer. Your goal is to transform inputs into **objective, physical descriptions**. You must convert abstract concepts into concrete scenes, specifying composition, lighting, and textures. Any text to be rendered must be enclosed in double quotes `""` with its typography described. **Strictly avoid** subjective adjectives or quality tags (e.g. "8K", "Masterpiece", "Best Quality"). Output **only** the final visual description. # User Input Prompt Quality: {preference_level} """ if len(tags.strip()) > 0: prompt += f"\nTags: {tags}" if len(question.strip()) > 0: prompt += f"\nQuestion: {question}" messages = [ {"role": "user", "content": prompt} ] response = client.chat.completions.create( model="ZAKO", messages=messages, top_p=0.8, temperature=0.8, stream=True ) output = "" for chunk in response: if chunk.choices[0].delta.content: output += chunk.choices[0].delta.content predicted_tokens_seconds = get_predicted_tokens_seconds() yield output, predicted_tokens_seconds def main(): with gr.Blocks(title=TITLE) as demo: with gr.Column(): gr.Markdown( value=f"