// app/src-tauri/src/proxy/zai_vision_tools.rs
// Repository listing metadata: "AZILS's picture", "Upload 323 files", a21c316 (verified).
use base64::Engine;
use serde_json::{json, Value};
use tokio::time::Duration;
use crate::proxy::config::UpstreamProxyConfig;
use crate::proxy::ZaiConfig;
/// Endpoint that receives the chat-completion POST issued by `vision_chat_completion`.
// NOTE(review): the identifier spells "PAAZ" while the URL path is `paas`; this looks like a
// typo in the name only. Renaming requires updating the use site as well.
const ZAI_PAAZ_CHAT_COMPLETIONS_URL: &str = "https://api.z.ai/api/paas/v4/chat/completions";
/// Creates the HTTP client used for upstream calls.
///
/// The request timeout is `timeout_secs`, raised to a minimum of 5 seconds. When the upstream
/// proxy is enabled and has a non-empty URL, all traffic is routed through it.
///
/// # Errors
///
/// Returns a message when the proxy URL is rejected by `reqwest` or the client cannot be built.
fn build_client(upstream_proxy: UpstreamProxyConfig, timeout_secs: u64) -> Result<reqwest::Client, String> {
    let timeout = Duration::from_secs(timeout_secs.max(5));
    let base = reqwest::Client::builder().timeout(timeout);

    let wants_proxy = upstream_proxy.enabled && !upstream_proxy.url.is_empty();
    let configured = if wants_proxy {
        let proxy_url = crate::proxy::config::normalize_proxy_url(&upstream_proxy.url);
        match reqwest::Proxy::all(&proxy_url) {
            Ok(proxy) => base.proxy(proxy),
            Err(e) => return Err(format!("Invalid upstream proxy url: {}", e)),
        }
    } else {
        base
    };

    match configured.build() {
        Ok(client) => Ok(client),
        Err(e) => Err(format!("Failed to build HTTP client: {}", e)),
    }
}
/// Returns `true` when `value`, ignoring surrounding whitespace, starts with an
/// `http://` or `https://` scheme.
///
/// The scheme is compared case-insensitively because URI schemes are case-insensitive
/// (RFC 3986 §3.1); `HTTPS://host/a.png` is a URL, not a local path.
fn is_http_url(value: &str) -> bool {
    // `str::get` yields `None` both when the input is shorter than the prefix and when the
    // cut would land inside a multi-byte character, so this can never panic.
    fn has_prefix_ignore_case(text: &str, prefix: &str) -> bool {
        text.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    }

    let v = value.trim();
    has_prefix_ignore_case(v, "http://") || has_prefix_ignore_case(v, "https://")
}
/// Looks up the MIME type for an image file extension (given without the dot).
///
/// Matching ignores ASCII case. Only `png`, `jpg` and `jpeg` are recognised; anything else
/// yields `None`.
fn mime_for_image_extension(ext: &str) -> Option<&'static str> {
    const IMAGE_MIME_TYPES: [(&str, &str); 3] = [
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
    ];

    IMAGE_MIME_TYPES
        .iter()
        .find(|entry| ext.eq_ignore_ascii_case(entry.0))
        .map(|entry| entry.1)
}
/// Looks up the MIME type for a video file extension (given without the dot).
///
/// Matching ignores ASCII case. Only `mp4`, `mov` and `m4v` are recognised; anything else
/// yields `None`.
fn mime_for_video_extension(ext: &str) -> Option<&'static str> {
    const VIDEO_MIME_TYPES: [(&str, &str); 3] = [
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("m4v", "video/x-m4v"),
    ];

    VIDEO_MIME_TYPES
        .iter()
        .find(|entry| ext.eq_ignore_ascii_case(entry.0))
        .map(|entry| entry.1)
}
/// Returns the extension of `path` as an owned string, with its original casing.
///
/// Yields `None` when the path has no extension or the extension is not valid UTF-8.
fn file_ext(path: &std::path::Path) -> Option<String> {
    let raw = path.extension()?;
    let text = raw.to_str()?;
    Some(text.to_owned())
}
/// Reads the whole file at `path` and returns it as a `data:<mime>;base64,<payload>` URL,
/// using the standard base64 alphabet.
///
/// # Errors
///
/// Returns a message when the file cannot be read.
fn encode_file_as_data_url(path: &std::path::Path, mime: &str) -> Result<String, String> {
    let raw = match std::fs::read(path) {
        Ok(raw) => raw,
        Err(e) => return Err(format!("Failed to read file: {}", e)),
    };
    let payload = base64::engine::general_purpose::STANDARD.encode(raw);
    Ok(format!("data:{};base64,{}", mime, payload))
}
/// Turns an image reference into an `image_url` content part for the chat request.
///
/// * An `http://` / `https://` source is forwarded as a URL (surrounding whitespace removed).
/// * Anything else is treated as a local file path: the file must exist, be no larger than
///   `max_size_mb` megabytes, and carry an extension known to `mime_for_image_extension`.
///   It is then inlined as a base64 data URL.
///
/// # Errors
///
/// Returns a message when the file's metadata cannot be read, the file exceeds the size
/// limit, its extension is missing or unsupported, or reading it fails.
fn image_source_to_content(image_source: &str, max_size_mb: u64) -> Result<Value, String> {
    if is_http_url(image_source) {
        // `is_http_url` ignores surrounding whitespace, so send the trimmed form; otherwise a
        // padded URL would be accepted here and then forwarded with the padding intact.
        return Ok(json!({
            "type": "image_url",
            "image_url": { "url": image_source.trim() }
        }));
    }

    let path = std::path::Path::new(image_source);
    let meta = std::fs::metadata(path).map_err(|_| "Image file not found".to_string())?;

    // Saturate instead of overflowing (panic in debug, wrap-around in release) for huge limits.
    let max_size = max_size_mb.saturating_mul(1024 * 1024);
    if meta.len() > max_size {
        return Err(format!(
            "Image file too large ({} bytes), max {} MB",
            meta.len(),
            max_size_mb
        ));
    }

    let ext = file_ext(path).ok_or_else(|| "Unsupported image format".to_string())?;
    let mime = mime_for_image_extension(&ext)
        .ok_or_else(|| "Unsupported image format".to_string())?;
    let data_url = encode_file_as_data_url(path, mime)?;
    Ok(json!({
        "type": "image_url",
        "image_url": { "url": data_url }
    }))
}
/// Turns a video reference into a `video_url` content part for the chat request.
///
/// * An `http://` / `https://` source is forwarded as a URL (surrounding whitespace removed).
/// * Anything else is treated as a local file path: the file must exist, be no larger than
///   `max_size_mb` megabytes, and carry an extension known to `mime_for_video_extension`.
///   It is then inlined as a base64 data URL.
///
/// # Errors
///
/// Returns a message when the file's metadata cannot be read, the file exceeds the size
/// limit, its extension is missing or unsupported, or reading it fails.
fn video_source_to_content(video_source: &str, max_size_mb: u64) -> Result<Value, String> {
    if is_http_url(video_source) {
        // `is_http_url` ignores surrounding whitespace, so send the trimmed form; otherwise a
        // padded URL would be accepted here and then forwarded with the padding intact.
        return Ok(json!({
            "type": "video_url",
            "video_url": { "url": video_source.trim() }
        }));
    }

    let path = std::path::Path::new(video_source);
    let meta = std::fs::metadata(path).map_err(|_| "Video file not found".to_string())?;

    // Saturate instead of overflowing (panic in debug, wrap-around in release) for huge limits.
    let max_size = max_size_mb.saturating_mul(1024 * 1024);
    if meta.len() > max_size {
        return Err(format!(
            "Video file too large ({} bytes), max {} MB",
            meta.len(),
            max_size_mb
        ));
    }

    let ext = file_ext(path).ok_or_else(|| "Unsupported video format".to_string())?;
    let mime = mime_for_video_extension(&ext)
        .ok_or_else(|| "Unsupported video format".to_string())?;
    let data_url = encode_file_as_data_url(path, mime)?;
    Ok(json!({
        "type": "video_url",
        "video_url": { "url": data_url }
    }))
}
fn user_message_with_content(mut content: Vec<Value>, prompt: &str) -> Value {
content.push(json!({ "type": "text", "text": prompt }));
json!({ "role": "user", "content": content })
}
/// Sends one non-streaming chat-completion request to the z.ai endpoint and returns the
/// text found at `choices[0].message.content`.
///
/// The request uses the `glm-4.6v` model with thinking enabled. It carries a system message
/// (`system_prompt`) and one user message assembled from `user_content` plus `prompt`.
///
/// # Errors
///
/// Returns a message when the request cannot be sent, the response status is not a success
/// (the status code and response body are included), the body is not valid JSON, or the
/// expected content string is absent.
async fn vision_chat_completion(
    client: &reqwest::Client,
    api_key: &str,
    system_prompt: &str,
    user_content: Vec<Value>,
    prompt: &str,
) -> Result<String, String> {
    let payload = json!({
        "model": "glm-4.6v",
        "messages": [
            { "role": "system", "content": system_prompt },
            user_message_with_content(user_content, prompt),
        ],
        "thinking": { "type": "enabled" },
        "stream": false,
        "temperature": 0.8,
        "top_p": 0.6,
        "max_tokens": 32768
    });

    let request = client
        .post(ZAI_PAAZ_CHAT_COMPLETIONS_URL)
        .bearer_auth(api_key)
        .header("X-Title", "Vision MCP Local")
        .header("Accept-Language", "en-US,en")
        .json(&payload);

    let response = match request.send().await {
        Ok(response) => response,
        Err(e) => return Err(format!("Upstream request failed: {}", e)),
    };

    let status = response.status();
    if !status.is_success() {
        // Best effort: an unreadable error body is reported as an empty string.
        let error_body = response.text().await.unwrap_or_default();
        return Err(format!("HTTP {}: {}", status.as_u16(), error_body));
    }

    let parsed: Value = match response.json().await {
        Ok(parsed) => parsed,
        Err(e) => return Err(format!("Invalid JSON response: {}", e)),
    };

    // Immutable indexing into a `Value` yields `Null` for any missing or mistyped step, so a
    // malformed response ends up in the `None` arm rather than panicking.
    match parsed["choices"][0]["message"]["content"].as_str() {
        Some(text) => Ok(text.to_owned()),
        None => Err("Invalid API response: missing choices[0].message.content".to_string()),
    }
}
/// Returns the descriptors of every tool handled by `call_tool`.
///
/// Each descriptor is a JSON object with `name`, `description` and an `inputSchema` (an
/// object schema listing the tool's `properties` and `required` argument names).
pub fn tool_specs() -> Vec<Value> {
    // Wraps a tool's property schema in the envelope shared by all descriptors.
    fn spec(name: &str, description: &str, properties: Value, required: &[&str]) -> Value {
        json!({
            "name": name,
            "description": description,
            "inputSchema": {
                "type": "object",
                "properties": properties,
                "required": required
            }
        })
    }

    vec![
        spec(
            "ui_to_artifact",
            "Convert UI screenshots into artifacts (code/prompt/spec/description).",
            json!({
                "image_source": { "type": "string", "description": "Local file path or remote URL to the image" },
                "output_type": { "type": "string", "enum": ["code","prompt","spec","description"] },
                "prompt": { "type": "string" }
            }),
            &["image_source", "output_type", "prompt"],
        ),
        spec(
            "extract_text_from_screenshot",
            "Extract text/code from screenshots (OCR-like).",
            json!({
                "image_source": { "type": "string" },
                "prompt": { "type": "string" },
                "language_hint": { "type": "string" }
            }),
            &["image_source", "prompt"],
        ),
        spec(
            "diagnose_error_screenshot",
            "Diagnose error screenshots (stack traces, logs, runtime errors).",
            json!({
                "image_source": { "type": "string" },
                "prompt": { "type": "string" },
                "context": { "type": "string" }
            }),
            &["image_source", "prompt"],
        ),
        spec(
            "understand_technical_diagram",
            "Analyze architecture/flow/UML/ER diagrams.",
            json!({
                "image_source": { "type": "string" },
                "prompt": { "type": "string" },
                "diagram_type": { "type": "string" }
            }),
            &["image_source", "prompt"],
        ),
        spec(
            "analyze_data_visualization",
            "Analyze charts/dashboards to extract insights and trends.",
            json!({
                "image_source": { "type": "string" },
                "prompt": { "type": "string" },
                "analysis_focus": { "type": "string" }
            }),
            &["image_source", "prompt"],
        ),
        spec(
            "ui_diff_check",
            "Compare two UI screenshots and report visual differences.",
            json!({
                "expected_image_source": { "type": "string" },
                "actual_image_source": { "type": "string" },
                "prompt": { "type": "string" }
            }),
            &["expected_image_source", "actual_image_source", "prompt"],
        ),
        spec(
            "analyze_image",
            "General-purpose image analysis.",
            json!({
                "image_source": { "type": "string" },
                "prompt": { "type": "string" }
            }),
            &["image_source", "prompt"],
        ),
        spec(
            "analyze_video",
            "Analyze video content.",
            json!({
                "video_source": { "type": "string" },
                "prompt": { "type": "string" }
            }),
            &["video_source", "prompt"],
        ),
    ]
}
pub async fn call_tool(
zai: &ZaiConfig,
upstream_proxy: UpstreamProxyConfig,
timeout_secs: u64,
tool_name: &str,
arguments: &Value,
) -> Result<Value, String> {
let api_key = zai.api_key.trim();
if api_key.is_empty() {
return Err("z.ai api_key is missing".to_string());
}
let client = build_client(upstream_proxy, timeout_secs)?;
let tool_result = match tool_name {
"ui_to_artifact" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let output_type = arguments
.get("output_type")
.and_then(|v| v.as_str())
.ok_or("Missing output_type")?;
let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
let system_prompt = match output_type {
"code" => "You are a frontend engineer. Generate clean, accessible, responsive frontend code from the UI screenshot.",
"prompt" => "You generate precise prompts to recreate UI screenshots.",
"spec" => "You are a design systems architect. Produce a detailed UI specification from the screenshot.",
"description" => "You describe UI screenshots clearly and completely in natural language.",
_ => return Err("Invalid output_type".to_string()),
};
let image = image_source_to_content(image_source, 5)?;
vision_chat_completion(&client, api_key, system_prompt, vec![image], prompt).await?
}
"extract_text_from_screenshot" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
if let Some(lang) = arguments.get("language_hint").and_then(|v| v.as_str()) {
if !lang.trim().is_empty() {
prompt.push_str(&format!("\n\nLanguage hint: {}", lang.trim()));
}
}
let image = image_source_to_content(image_source, 5)?;
let system_prompt = "Extract text from the screenshot accurately. Preserve code formatting. If unsure, say what is uncertain.";
vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
}
"diagnose_error_screenshot" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
if let Some(ctx) = arguments.get("context").and_then(|v| v.as_str()) {
if !ctx.trim().is_empty() {
prompt.push_str(&format!("\n\nContext: {}", ctx.trim()));
}
}
let image = image_source_to_content(image_source, 5)?;
let system_prompt = "Diagnose the error shown in the screenshot. Identify root cause, propose fixes and verification steps.";
vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
}
"understand_technical_diagram" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
if let Some(diagram_type) = arguments.get("diagram_type").and_then(|v| v.as_str()) {
if !diagram_type.trim().is_empty() {
prompt.push_str(&format!("\n\nDiagram type: {}", diagram_type.trim()));
}
}
let image = image_source_to_content(image_source, 5)?;
let system_prompt = "Explain the technical diagram. Describe components, relationships, data flows, and key assumptions.";
vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
}
"analyze_data_visualization" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
if let Some(focus) = arguments.get("analysis_focus").and_then(|v| v.as_str()) {
if !focus.trim().is_empty() {
prompt.push_str(&format!("\n\nFocus: {}", focus.trim()));
}
}
let image = image_source_to_content(image_source, 5)?;
let system_prompt = "Analyze the chart/dashboard and extract insights, trends, anomalies, and recommendations.";
vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
}
"ui_diff_check" => {
let expected = arguments
.get("expected_image_source")
.and_then(|v| v.as_str())
.ok_or("Missing expected_image_source")?;
let actual = arguments
.get("actual_image_source")
.and_then(|v| v.as_str())
.ok_or("Missing actual_image_source")?;
let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
let expected_img = image_source_to_content(expected, 5)?;
let actual_img = image_source_to_content(actual, 5)?;
let system_prompt = "Compare the two UI screenshots and report differences grouped by severity. Include actionable fix suggestions.";
vision_chat_completion(
&client,
api_key,
system_prompt,
vec![expected_img, actual_img],
prompt,
)
.await?
}
"analyze_image" => {
let image_source = arguments
.get("image_source")
.and_then(|v| v.as_str())
.ok_or("Missing image_source")?;
let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
let image = image_source_to_content(image_source, 5)?;
let system_prompt = "Analyze the image. Be precise and include relevant details.";
vision_chat_completion(&client, api_key, system_prompt, vec![image], prompt).await?
}
"analyze_video" => {
let video_source = arguments
.get("video_source")
.and_then(|v| v.as_str())
.ok_or("Missing video_source")?;
let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
let video = video_source_to_content(video_source, 8)?;
let system_prompt = "Analyze the video content according to the user's request.";
vision_chat_completion(&client, api_key, system_prompt, vec![video], prompt).await?
}
_ => return Err("Unknown tool".to_string()),
};
Ok(json!({
"content": [
{ "type": "text", "text": tool_result }
]
}))
}