Spaces:

AZILS
/

app

Paused

File size: 17,101 Bytes

a21c316

use base64::Engine;
use serde_json::{json, Value};
use tokio::time::Duration;

use crate::proxy::config::UpstreamProxyConfig;
use crate::proxy::ZaiConfig;

const ZAI_PAAZ_CHAT_COMPLETIONS_URL: &str = "https://api.z.ai/api/paas/v4/chat/completions";

fn build_client(upstream_proxy: UpstreamProxyConfig, timeout_secs: u64) -> Result<reqwest::Client, String> {
    let mut builder = reqwest::Client::builder()
        .timeout(Duration::from_secs(timeout_secs.max(5)));

    if upstream_proxy.enabled && !upstream_proxy.url.is_empty() {
        let url = crate::proxy::config::normalize_proxy_url(&upstream_proxy.url);
        let proxy = reqwest::Proxy::all(&url)
            .map_err(|e| format!("Invalid upstream proxy url: {}", e))?;
        builder = builder.proxy(proxy);
    }

    builder.build().map_err(|e| format!("Failed to build HTTP client: {}", e))
}

fn is_http_url(value: &str) -> bool {
    let v = value.trim();
    v.starts_with("http://") || v.starts_with("https://")
}

fn mime_for_image_extension(ext: &str) -> Option<&'static str> {
    match ext.to_ascii_lowercase().as_str() {
        "png" => Some("image/png"),
        "jpg" | "jpeg" => Some("image/jpeg"),
        _ => None,
    }
}

fn mime_for_video_extension(ext: &str) -> Option<&'static str> {
    match ext.to_ascii_lowercase().as_str() {
        "mp4" => Some("video/mp4"),
        "mov" => Some("video/quicktime"),
        "m4v" => Some("video/x-m4v"),
        _ => None,
    }
}

fn file_ext(path: &std::path::Path) -> Option<String> {
    path.extension()
        .and_then(|s| s.to_str())
        .map(|s| s.to_string())
}

fn encode_file_as_data_url(path: &std::path::Path, mime: &str) -> Result<String, String> {
    let bytes = std::fs::read(path).map_err(|e| format!("Failed to read file: {}", e))?;
    let encoded = base64::engine::general_purpose::STANDARD.encode(bytes);
    Ok(format!("data:{};base64,{}", mime, encoded))
}

fn image_source_to_content(image_source: &str, max_size_mb: u64) -> Result<Value, String> {
    if is_http_url(image_source) {
        return Ok(json!({
            "type": "image_url",
            "image_url": { "url": image_source }
        }));
    }

    let path = std::path::Path::new(image_source);
    let meta = std::fs::metadata(path).map_err(|_| "Image file not found".to_string())?;
    let max_size = max_size_mb * 1024 * 1024;
    if meta.len() > max_size {
        return Err(format!(
            "Image file too large ({} bytes), max {} MB",
            meta.len(),
            max_size_mb
        ));
    }

    let ext = file_ext(path).ok_or("Unsupported image format".to_string())?;
    let mime = mime_for_image_extension(&ext).ok_or("Unsupported image format".to_string())?;
    let data_url = encode_file_as_data_url(path, mime)?;
    Ok(json!({
        "type": "image_url",
        "image_url": { "url": data_url }
    }))
}

fn video_source_to_content(video_source: &str, max_size_mb: u64) -> Result<Value, String> {
    if is_http_url(video_source) {
        return Ok(json!({
            "type": "video_url",
            "video_url": { "url": video_source }
        }));
    }

    let path = std::path::Path::new(video_source);
    let meta = std::fs::metadata(path).map_err(|_| "Video file not found".to_string())?;
    let max_size = max_size_mb * 1024 * 1024;
    if meta.len() > max_size {
        return Err(format!(
            "Video file too large ({} bytes), max {} MB",
            meta.len(),
            max_size_mb
        ));
    }

    let ext = file_ext(path).ok_or("Unsupported video format".to_string())?;
    let mime = mime_for_video_extension(&ext).ok_or("Unsupported video format".to_string())?;
    let data_url = encode_file_as_data_url(path, mime)?;
    Ok(json!({
        "type": "video_url",
        "video_url": { "url": data_url }
    }))
}

fn user_message_with_content(mut content: Vec<Value>, prompt: &str) -> Value {
    content.push(json!({ "type": "text", "text": prompt }));
    json!({ "role": "user", "content": content })
}

async fn vision_chat_completion(
    client: &reqwest::Client,
    api_key: &str,
    system_prompt: &str,
    user_content: Vec<Value>,
    prompt: &str,
) -> Result<String, String> {
    let body = json!({
        "model": "glm-4.6v",
        "messages": [
            { "role": "system", "content": system_prompt },
            user_message_with_content(user_content, prompt),
        ],
        "thinking": { "type": "enabled" },
        "stream": false,
        "temperature": 0.8,
        "top_p": 0.6,
        "max_tokens": 32768
    });

    let resp = client
        .post(ZAI_PAAZ_CHAT_COMPLETIONS_URL)
        .bearer_auth(api_key)
        .header("X-Title", "Vision MCP Local")
        .header("Accept-Language", "en-US,en")
        .json(&body)
        .send()
        .await
        .map_err(|e| format!("Upstream request failed: {}", e))?;

    if !resp.status().is_success() {
        let status = resp.status().as_u16();
        let text = resp.text().await.unwrap_or_default();
        return Err(format!("HTTP {}: {}", status, text));
    }

    let v: Value = resp.json().await.map_err(|e| format!("Invalid JSON response: {}", e))?;
    let content = v
        .get("choices")
        .and_then(|c| c.get(0))
        .and_then(|c| c.get("message"))
        .and_then(|m| m.get("content"))
        .and_then(|c| c.as_str())
        .ok_or_else(|| "Invalid API response: missing choices[0].message.content".to_string())?;

    Ok(content.to_string())
}

pub fn tool_specs() -> Vec<Value> {
    vec![
        json!({
            "name": "ui_to_artifact",
            "description": "Convert UI screenshots into artifacts (code/prompt/spec/description).",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string", "description": "Local file path or remote URL to the image" },
                    "output_type": { "type": "string", "enum": ["code","prompt","spec","description"] },
                    "prompt": { "type": "string" }
                },
                "required": ["image_source","output_type","prompt"]
            }
        }),
        json!({
            "name": "extract_text_from_screenshot",
            "description": "Extract text/code from screenshots (OCR-like).",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string" },
                    "prompt": { "type": "string" },
                    "language_hint": { "type": "string" }
                },
                "required": ["image_source","prompt"]
            }
        }),
        json!({
            "name": "diagnose_error_screenshot",
            "description": "Diagnose error screenshots (stack traces, logs, runtime errors).",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string" },
                    "prompt": { "type": "string" },
                    "context": { "type": "string" }
                },
                "required": ["image_source","prompt"]
            }
        }),
        json!({
            "name": "understand_technical_diagram",
            "description": "Analyze architecture/flow/UML/ER diagrams.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string" },
                    "prompt": { "type": "string" },
                    "diagram_type": { "type": "string" }
                },
                "required": ["image_source","prompt"]
            }
        }),
        json!({
            "name": "analyze_data_visualization",
            "description": "Analyze charts/dashboards to extract insights and trends.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string" },
                    "prompt": { "type": "string" },
                    "analysis_focus": { "type": "string" }
                },
                "required": ["image_source","prompt"]
            }
        }),
        json!({
            "name": "ui_diff_check",
            "description": "Compare two UI screenshots and report visual differences.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "expected_image_source": { "type": "string" },
                    "actual_image_source": { "type": "string" },
                    "prompt": { "type": "string" }
                },
                "required": ["expected_image_source","actual_image_source","prompt"]
            }
        }),
        json!({
            "name": "analyze_image",
            "description": "General-purpose image analysis.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "image_source": { "type": "string" },
                    "prompt": { "type": "string" }
                },
                "required": ["image_source","prompt"]
            }
        }),
        json!({
            "name": "analyze_video",
            "description": "Analyze video content.",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "video_source": { "type": "string" },
                    "prompt": { "type": "string" }
                },
                "required": ["video_source","prompt"]
            }
        }),
    ]
}

pub async fn call_tool(
    zai: &ZaiConfig,
    upstream_proxy: UpstreamProxyConfig,
    timeout_secs: u64,
    tool_name: &str,
    arguments: &Value,
) -> Result<Value, String> {
    let api_key = zai.api_key.trim();
    if api_key.is_empty() {
        return Err("z.ai api_key is missing".to_string());
    }

    let client = build_client(upstream_proxy, timeout_secs)?;

    let tool_result = match tool_name {
        "ui_to_artifact" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let output_type = arguments
                .get("output_type")
                .and_then(|v| v.as_str())
                .ok_or("Missing output_type")?;
            let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;

            let system_prompt = match output_type {
                "code" => "You are a frontend engineer. Generate clean, accessible, responsive frontend code from the UI screenshot.",
                "prompt" => "You generate precise prompts to recreate UI screenshots.",
                "spec" => "You are a design systems architect. Produce a detailed UI specification from the screenshot.",
                "description" => "You describe UI screenshots clearly and completely in natural language.",
                _ => return Err("Invalid output_type".to_string()),
            };

            let image = image_source_to_content(image_source, 5)?;
            vision_chat_completion(&client, api_key, system_prompt, vec![image], prompt).await?
        }
        "extract_text_from_screenshot" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
            if let Some(lang) = arguments.get("language_hint").and_then(|v| v.as_str()) {
                if !lang.trim().is_empty() {
                    prompt.push_str(&format!("\n\nLanguage hint: {}", lang.trim()));
                }
            }
            let image = image_source_to_content(image_source, 5)?;
            let system_prompt = "Extract text from the screenshot accurately. Preserve code formatting. If unsure, say what is uncertain.";
            vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
        }
        "diagnose_error_screenshot" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
            if let Some(ctx) = arguments.get("context").and_then(|v| v.as_str()) {
                if !ctx.trim().is_empty() {
                    prompt.push_str(&format!("\n\nContext: {}", ctx.trim()));
                }
            }
            let image = image_source_to_content(image_source, 5)?;
            let system_prompt = "Diagnose the error shown in the screenshot. Identify root cause, propose fixes and verification steps.";
            vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
        }
        "understand_technical_diagram" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
            if let Some(diagram_type) = arguments.get("diagram_type").and_then(|v| v.as_str()) {
                if !diagram_type.trim().is_empty() {
                    prompt.push_str(&format!("\n\nDiagram type: {}", diagram_type.trim()));
                }
            }
            let image = image_source_to_content(image_source, 5)?;
            let system_prompt = "Explain the technical diagram. Describe components, relationships, data flows, and key assumptions.";
            vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
        }
        "analyze_data_visualization" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let mut prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?.to_string();
            if let Some(focus) = arguments.get("analysis_focus").and_then(|v| v.as_str()) {
                if !focus.trim().is_empty() {
                    prompt.push_str(&format!("\n\nFocus: {}", focus.trim()));
                }
            }
            let image = image_source_to_content(image_source, 5)?;
            let system_prompt = "Analyze the chart/dashboard and extract insights, trends, anomalies, and recommendations.";
            vision_chat_completion(&client, api_key, system_prompt, vec![image], &prompt).await?
        }
        "ui_diff_check" => {
            let expected = arguments
                .get("expected_image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing expected_image_source")?;
            let actual = arguments
                .get("actual_image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing actual_image_source")?;
            let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;

            let expected_img = image_source_to_content(expected, 5)?;
            let actual_img = image_source_to_content(actual, 5)?;
            let system_prompt = "Compare the two UI screenshots and report differences grouped by severity. Include actionable fix suggestions.";
            vision_chat_completion(
                &client,
                api_key,
                system_prompt,
                vec![expected_img, actual_img],
                prompt,
            )
            .await?
        }
        "analyze_image" => {
            let image_source = arguments
                .get("image_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing image_source")?;
            let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
            let image = image_source_to_content(image_source, 5)?;
            let system_prompt = "Analyze the image. Be precise and include relevant details.";
            vision_chat_completion(&client, api_key, system_prompt, vec![image], prompt).await?
        }
        "analyze_video" => {
            let video_source = arguments
                .get("video_source")
                .and_then(|v| v.as_str())
                .ok_or("Missing video_source")?;
            let prompt = arguments.get("prompt").and_then(|v| v.as_str()).ok_or("Missing prompt")?;
            let video = video_source_to_content(video_source, 8)?;
            let system_prompt = "Analyze the video content according to the user's request.";
            vision_chat_completion(&client, api_key, system_prompt, vec![video], prompt).await?
        }
        _ => return Err("Unknown tool".to_string()),
    };

    Ok(json!({
        "content": [
            { "type": "text", "text": tool_result }
        ]
    }))
}