tgetsov committed on
Commit
a804ded
·
verified ·
1 Parent(s): a411975

Upload validate.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. validate.sh +61 -0
validate.sh ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# marvy-14B quick validation harness.
#
# Runs the task-coverage probes (Test 2 in VALIDATION.md) against any
# OpenAI-compatible endpoint — LM Studio, mlx_lm server, vLLM, etc. — and prints
# each artifact plus a lightweight heuristic PASS/REVIEW on domain keywords.
#
# Usage:
#   bash validate.sh # defaults to LM Studio
#   BASE_URL=http://localhost:8080/v1 MODEL=marvy-14B bash validate.sh
#   API_KEY=xxx BASE_URL=https://... MODEL=MainStack/marvy-14B bash validate.sh
#
# Environment (all optional):
#   BASE_URL  API root; "/chat/completions" is appended to it
#   MODEL     model name sent in the request body
#   API_KEY   bearer token sent in the Authorization header
#   TEMP      sampling temperature; must be a JSON number
#   MAXTOK    max_tokens per completion; must be a JSON number
#
# Requires: jq, curl.

# No -e on purpose: one failed probe must not abort the remaining probes.
set -uo pipefail

BASE_URL="${BASE_URL:-http://localhost:1234/v1}" # LM Studio default
MODEL="${MODEL:-marvy-14B}"
API_KEY="${API_KEY:-lm-studio}"
TEMP="${TEMP:-0.4}"
MAXTOK="${MAXTOK:-700}"

# Strip one trailing slash so BASE_URL=".../v1/" does not yield "//chat/completions".
endpoint="${BASE_URL%/}/chat/completions"

SYSTEM="You are a senior ServiceNow delivery consultant. You produce precise, implementation-grade artifacts: business analyses, requirements, solution design documents, user stories with acceptance criteria, test cases, and validation reviews. You favor out-of-the-box capabilities, cite concrete tables/plugins/sys_ids when relevant, and write in clear professional English."

# probe | expected-keyword-regex (case-insensitive) for a heuristic pass
# Everything before the FIRST "|" is the prompt; the remainder is an extended
# regex, so further "|" characters are regex alternation (any keyword matches).
PROMPTS=(
  "Write a user story with acceptance criteria for auto-escalating P1 incidents that breach a 15-minute response SLA.|as a.*i want.*so that|acceptance|sla"
  "Draft the Incident Management section of an SDD for a greenfield ITSM implementation. Include assignment rules and SLA design.|assignment|sla|incident"
  "Extract structured requirements (id, category, priority, target phase, success metric) from: replace email-based access requests with a catalog item routed for manager approval.|priority|requirement|catalog"
  "Write a test case for the story: Restrict the Assignment Group field on incidents to groups with the itil role.|pre-condition|step|expected|itil"
  "Validate this requirement against best practice and list follow-up questions: All incidents must auto-close after 3 days.|follow-up|risk|question"
)

# Both external tools are needed; report on stderr so stdout stays clean.
for tool in jq curl; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "ERROR: $tool is required (brew install $tool)" >&2
    exit 1
  fi
done

echo "Endpoint: $BASE_URL Model: $MODEL Temp: $TEMP"
echo "============================================================"
pass=0
total=0
for entry in "${PROMPTS[@]}"; do
  total=$((total + 1))
  prompt="${entry%%|*}"
  regex="${entry#*|}"

  # --argjson parses TEMP/MAXTOK as JSON, so a non-numeric value makes jq fail.
  # Stop here rather than POST an empty body for every probe.
  if ! payload=$(jq -n --arg m "$MODEL" --arg s "$SYSTEM" --arg p "$prompt" \
    --argjson t "$TEMP" --argjson mx "$MAXTOK" \
    '{model:$m,temperature:$t,max_tokens:$mx,messages:[{role:"system",content:$s},{role:"user",content:$p}]}'); then
    echo "ERROR: could not build request payload; TEMP ('$TEMP') and MAXTOK ('$MAXTOK') must be JSON numbers" >&2
    exit 1
  fi

  # -sS: no progress meter, but transport errors are still shown on stderr.
  resp=$(curl -sS "$endpoint" -H "Content-Type: application/json" \
    -H "Authorization: Bearer $API_KEY" -d "$payload")
  curl_status=$?

  # Only genuine model output is scored. An API error message or a transport
  # failure is displayed but never matched against the keyword regex, so an
  # error text that happens to contain a keyword cannot count as a PASS.
  answered=0
  if ((curl_status != 0)); then
    content="<<no response>>"
  elif content=$(jq -er '.choices[0].message.content // empty' <<<"$resp" 2>/dev/null); then
    answered=1
  else
    # Accept both {"error":{"message":...}} and {"error":"..."} shapes.
    content=$(jq -r '(.error | if type == "object" then .message else . end) // "<<no response>>"' \
      <<<"$resp" 2>/dev/null) || content="<<no response>>"
  fi

  echo ""
  echo "### Probe $total: $prompt"
  echo "------------------------------------------------------------"
  # Here-strings instead of "echo | cmd": under pipefail an early-exiting
  # reader (head, grep -q) could otherwise turn a SIGPIPE into a failure.
  head -n 40 <<<"$content"
  if ((answered == 0)); then
    echo ">>> heuristic: REVIEW (no model output to check)"
  elif grep -iqE -- "$regex" <<<"$content"; then
    echo ">>> heuristic: PASS"
    pass=$((pass + 1))
  else
    echo ">>> heuristic: REVIEW (expected pattern not matched: $regex)"
  fi
  echo "============================================================"
done
echo ""
echo "Heuristic result: $pass/$total probes matched domain patterns."
echo "Pass threshold: >= 4/5 with implementation-grade, ServiceNow-specific content."
echo "Note: heuristics are a sanity check — read the outputs to judge true quality."