File size: 11,844 Bytes
5df55ff
 
 
 
 
8fab536
5df55ff
 
 
8fab536
5df55ff
 
8fab536
 
 
 
 
 
5df55ff
 
8fab536
 
 
 
 
 
 
5df55ff
8fab536
 
 
 
 
 
5df55ff
 
 
 
 
 
 
 
 
8fab536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5df55ff
 
 
 
 
 
 
 
 
 
8fab536
 
5df55ff
8fab536
5df55ff
8fab536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5df55ff
8fab536
5df55ff
8fab536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5df55ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""JSON parsing utilities with safe fallbacks."""

from __future__ import annotations

import json
import logging
import re
from typing import Any

logger = logging.getLogger(__name__)


def strip_markdown_fences(text: str) -> str:
    """Return *text* without its markdown code fence, trimmed of whitespace.

    If a complete fenced region (optionally tagged ``json``, matched
    case-insensitively) is present, the content of the first such region is
    returned. Otherwise a leading line that starts with a fence and a
    trailing line that is exactly a fence are dropped individually.
    Falsy input yields an empty string.
    """
    body = text.strip() if text else ""
    if not body:
        return ""

    match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", body, re.IGNORECASE)
    if match is not None:
        return match.group(1).strip()

    # No complete fence pair: peel off a dangling opener and/or closer line.
    rows = body.splitlines()
    if rows and rows[0].strip().startswith("```"):
        del rows[0]
    if rows and rows[-1].strip() == "```":
        rows.pop()
    return "\n".join(rows).strip()


def _find_block_end(
    text: str,
    start: int,
    opener: str,
    closer: str,
    skip_strings: bool,
) -> int | None:
    """Return the index of the closer balancing ``text[start]``, or None.

    When *skip_strings* is true, characters inside double-quoted string
    literals (honouring backslash escapes) are ignored for nesting purposes.
    """
    depth = 0
    in_string = False
    escaped = False
    for index in range(start, len(text)):
        char = text[index]
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if skip_strings and char == '"':
            in_string = True
        elif char == opener:
            depth += 1
        elif char == closer:
            depth -= 1
            if depth == 0:
                return index
    return None


def _balanced_blocks(text: str, opener: str, closer: str) -> list[str]:
    """Return all balanced opener/closer blocks found in text.

    Every occurrence of *opener* is tried as a starting point, so nested
    blocks are reported as well as the blocks that enclose them, in order of
    their start position.

    Delimiters that appear inside double-quoted string literals are not
    counted, so a value such as ``{"a": "}"}`` is returned whole instead of
    being cut at the quoted closer. If the string-aware scan cannot balance a
    block (for example because a string is unterminated in truncated output),
    the plain character-counting scan is used for that start position so that
    malformed input still yields the same candidates as before.
    """
    blocks: list[str] = []
    # String tracking is meaningless if a delimiter is itself the quote char.
    string_aware = '"' not in (opener, closer)
    for start, char in enumerate(text):
        if char != opener:
            continue
        end = None
        if string_aware:
            end = _find_block_end(text, start, opener, closer, True)
        if end is None:
            end = _find_block_end(text, start, opener, closer, False)
        if end is not None:
            blocks.append(text[start : end + 1])
    return blocks


def extract_largest_json_object(text: str) -> str | None:
    """Return the biggest ``{...}`` block in *text*, preferring parseable ones.

    Markdown fences are stripped first. Balanced brace blocks are tried from
    longest to shortest and the first one that decodes to a JSON object is
    returned. If none decodes, the longest balanced block is returned anyway.
    Returns None for empty input or when no balanced block exists.
    """
    if not text:
        return None

    blocks = _balanced_blocks(strip_markdown_fences(text), "{", "}")
    if not blocks:
        return None

    by_size = sorted(blocks, key=len, reverse=True)
    for candidate in by_size:
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(value, dict):
            return candidate

    # Nothing decoded cleanly; hand back the longest block regardless.
    return max(blocks, key=len)


def extract_json_block(text: str) -> str | None:
    """Legacy alias that delegates to ``extract_largest_json_object``.

    Returns None for empty input without calling the delegate.
    """
    return extract_largest_json_object(text) if text else None


def sanitize_for_log(text: str, limit: int = 200) -> str:
    """Build a compact single-line preview of *text* for debug logs.

    Markdown fences are removed, every whitespace run is collapsed to one
    space, and the result is cut to at most *limit* characters. Falsy input
    is treated as an empty string. The content itself is not redacted.
    """
    unfenced = strip_markdown_fences(text or "")
    single_line = re.sub(r"\s+", " ", unfenced).strip()
    return single_line[:limit]


def safe_json_parse(text: str, default: Any = None) -> Any:
    """Decode JSON from *text*, returning a fallback instead of raising.

    The fence-stripped text is decoded directly first. If that fails, the
    largest extractable JSON object block is decoded instead. The fallback is
    *default*, or a new empty dict when *default* is None; it is returned for
    empty input and whenever both decoding attempts fail.
    """
    fallback = {} if default is None else default
    if not text:
        return fallback

    payload = strip_markdown_fences(text)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass  # fall through to block extraction

    candidate = extract_largest_json_object(payload)
    if candidate:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass
    return fallback


def ends_abruptly(text: str) -> bool:
    """Heuristically decide whether *text* was cut off before finishing.

    Empty or whitespace-only text counts as cut off. Text ending in ``.``,
    ``!`` or ``?`` does not. Otherwise text shorter than 50 characters counts
    as cut off, and longer text counts only when it is under 80 characters
    and its final word has at most two characters.
    """
    trimmed = (text or "").strip()
    if not trimmed:
        return True
    if trimmed.endswith((".", "!", "?")):
        return False

    size = len(trimmed)
    if size < 50:
        return True

    words = trimmed.split()
    tail = words[-1] if words else ""
    return size < 80 and len(tail) <= 2


def normalize_parsed_root(parsed: Any) -> dict[str, Any] | None:
    """Reduce a decoded JSON value to a single object, if one is available.

    A dict is returned unchanged (even when empty). For a list, the first
    element that is a non-empty dict is returned. Any other input, or a list
    without such an element, yields None.
    """
    if isinstance(parsed, dict):
        return parsed
    if not isinstance(parsed, list):
        return None
    return next(
        (entry for entry in parsed if isinstance(entry, dict) and entry),
        None,
    )


def extract_partial_string_fields(text: str, keys: list[str]) -> dict[str, str]:
    """Recover ``"key": "value"`` string pairs from possibly broken JSON.

    For each name in *keys*, the first occurrence of a complete quoted string
    value in the fence-stripped text is taken. The value is decoded as a JSON
    string; if that fails, escaped quotes are unescaped and the raw text is
    trimmed. Keys with no match are omitted from the result.
    """
    if not text:
        return {}

    haystack = strip_markdown_fences(text)
    values: dict[str, str] = {}
    for name in keys:
        hit = re.search(
            rf'"{re.escape(name)}"\s*:\s*"((?:[^"\\]|\\.)*)"',
            haystack,
            re.DOTALL,
        )
        if hit is None:
            continue
        raw = hit.group(1)
        try:
            values[name] = json.loads(f'"{raw}"')
        except json.JSONDecodeError:
            # e.g. a raw control character inside the value
            values[name] = raw.replace('\\"', '"').strip()
    return values


def extract_partial_string_list(text: str, key: str, min_items: int = 1) -> list[str]:
    """Recover the string items of the array stored under *key*.

    The array is located by regex in the fence-stripped text and must have a
    closing ``]``. Each quoted item is decoded as a JSON string, falling back
    to unescaping quotes and trimming when decoding fails. Empty items are
    dropped and at most ``max(min_items, 8)`` items are returned. Returns an
    empty list for empty input or when the array is not found.
    """
    if not text:
        return []

    haystack = strip_markdown_fences(text)
    array = re.search(rf'"{re.escape(key)}"\s*:\s*\[([\s\S]*?)\]', haystack)
    if array is None:
        return []

    def _decode(raw: str) -> str:
        try:
            return json.loads(f'"{raw}"')
        except json.JSONDecodeError:
            return raw.replace('\\"', '"').strip()

    decoded = [
        _decode(hit.group(1))
        for hit in re.finditer(r'"((?:[^"\\]|\\.)*)"', array.group(1))
    ]
    cap = max(min_items, 8)
    return [value for value in decoded if value][:cap]


def parse_json_object(
    text: str,
    reasoning_fallback: str | None = None,
    string_fields: list[str] | None = None,
) -> dict[str, Any]:
    """Turn model output into a dict, trying progressively looser strategies.

    Order of attempts:
      1. ``parse_model_json`` (optionally consulting *reasoning_fallback*),
         normalised to a single object.
      2. Regex recovery of the *string_fields* keys from the raw text.
      3. ``safe_json_parse`` on the raw text, normalised to a single object.

    Returns an empty dict when every attempt comes back empty.
    """
    value, _repair = parse_model_json(text, reasoning_fallback=reasoning_fallback)
    obj = normalize_parsed_root(value)
    if obj:
        return obj

    salvaged = extract_partial_string_fields(text, string_fields or [])
    if salvaged:
        return salvaged

    return normalize_parsed_root(safe_json_parse(text)) or {}


def parse_model_json(
    text: str,
    reasoning_fallback: str | None = None,
) -> tuple[Any, bool]:
    """Decode model JSON output, falling back to block extraction.

    Returns ``(parsed_value, repair_needed)``.

    Strategy, in order:
      1. Decode the fence-stripped *text* directly. A dict is returned with
         ``repair_needed=False``; a list is returned with ``True`` (a list
         holding exactly one dict is unwrapped to that dict).
      2. Decode the largest JSON object block found in *text*. The flag is
         True only if the direct decode in step 1 raised.
      3. Decode the largest JSON object block found in *reasoning_fallback*
         (flag True).

    If both inputs are empty the result is ``({}, False)``; if every strategy
    fails it is ``({}, True)``.
    """
    empty: dict[str, Any] = {}
    if not (text or reasoning_fallback):
        return empty, False

    body = strip_markdown_fences(text or "")
    direct_failed = False

    if body:
        try:
            value = json.loads(body)
        except json.JSONDecodeError:
            direct_failed = True
        else:
            if isinstance(value, dict):
                return value, False
            if isinstance(value, list):
                lone_dict = len(value) == 1 and isinstance(value[0], dict)
                return (value[0] if lone_dict else value), True

        candidate = extract_largest_json_object(body)
        if candidate:
            try:
                value = json.loads(candidate)
            except json.JSONDecodeError:
                value = None
            if isinstance(value, (dict, list)):
                return value, direct_failed

    if reasoning_fallback:
        reasoning = strip_markdown_fences(reasoning_fallback)
        candidate = extract_largest_json_object(reasoning)
        if candidate:
            try:
                value = json.loads(candidate)
            except json.JSONDecodeError:
                value = None
            if isinstance(value, (dict, list)):
                logger.info(
                    "json_utils: parsed JSON from reasoning_content fallback (len=%d)",
                    len(reasoning),
                )
                return value, True

    return empty, True


def fallback_scorecard() -> dict[str, Any]:
    """Build the placeholder scorecard used when no model scorecard parses.

    A new dict is created on every call, with a zero overall score, no
    dimension scores, a fixed ``best_answer`` message, empty text fields and
    an empty question list.
    """
    card: dict[str, Any] = {"overall": 0, "scores": {}}
    card["best_answer"] = "No scorecard could be generated."
    for blank_field in ("weakest_answer", "improved_answer", "improved_pitch"):
        card[blank_field] = ""
    card["top_3_questions"] = []
    return card


# Dimension names that parse_scorecard_json always emits under "scores";
# dimensions missing from the model output are filled with a zero-score entry.
# NOTE: this is a set, so iterating it yields the names in no guaranteed order.
_REQUIRED_SCORECARD_DIMS = {
    "clarity",
    "problem_understanding",
    "market_awareness",
    "differentiation",
    "business_model",
    "objection_handling",
}


def _coerce_score(value: Any) -> int:
    """Clamp a raw score value to an integer in the range 0–100.

    The value is converted through ``float`` and truncated towards zero, so
    numeric strings such as ``"72.9"`` are accepted. Anything that cannot be
    converted yields 0: wrong types, non-numeric strings, NaN, and values
    that overflow (infinities, or integers too large for a float).
    ``json.loads`` can produce infinities from input such as ``1e999`` or
    ``Infinity``, so overflow must not propagate as an exception.
    """
    try:
        return max(0, min(100, int(float(value))))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(inf) or float(<huge int>); ValueError covers NaN.
        return 0


def _score_label(score: int) -> str:
    """Translate a 0–100 score into its band label.

    Phase 5C bands (claim-based calibration), upper bounds inclusive:
      0–30:   Not addressed
      31–50:  Developing
      51–70:  Solid
      71–85:  Strong
      86–100: Excellent
    """
    bands = (
        (30, "Not addressed"),
        (50, "Developing"),
        (70, "Solid"),
        (85, "Strong"),
    )
    for upper_bound, label in bands:
        if score <= upper_bound:
            return label
    return "Excellent"


def _validate_dim(raw: Any) -> dict[str, Any]:
    """Normalise a raw score dimension into {score, label, reason, quote, signals_used}.

    A non-dict input produces a zero-score placeholder entry. Otherwise:
      - ``score`` is coerced and clamped to 0–100 and ``label`` derived from it;
      - ``reason`` and ``quote`` are stringified and trimmed, with a missing,
        null or blank ``reason`` replaced by "No reasoning provided.";
      - ``signals_used`` keeps at most 8 non-blank, non-null entries as
        strings, and is empty when the raw value is not a list.

    JSON ``null`` values are treated as absent rather than being rendered as
    the literal text "None".
    """
    if not isinstance(raw, dict):
        return {
            "score": 0,
            "label": _score_label(0),
            "reason": "No data.",
            "quote": "",
            "signals_used": [],
        }

    def _text(key: str) -> str:
        value = raw.get(key)
        # str(None) would leak the word "None" into user-facing output.
        return "" if value is None else str(value).strip()

    score = _coerce_score(raw.get("score", 0))
    raw_signals = raw.get("signals_used", [])
    signals = (
        [str(s).strip() for s in raw_signals if s is not None and str(s).strip()]
        if isinstance(raw_signals, list)
        else []
    )
    return {
        "score": score,
        "label": _score_label(score),
        "reason": _text("reason") or "No reasoning provided.",
        "quote": _text("quote"),
        "signals_used": signals[:8],
    }


def parse_scorecard_json(raw_text: str) -> dict[str, Any] | None:
    """Parse and validate model scorecard JSON.

    Parsing is delegated to ``safe_json_parse``, which decodes the
    fence-stripped text directly and, failing that, the largest JSON object
    block it can extract.

    Returns a validated dict containing ``overall``, ``overall_label``,
    ``scores`` (one normalised entry per required dimension), the text fields
    and exactly three ``top_3_questions``. Returns None when nothing parses
    or the result is empty or not a JSON object, so the caller can substitute
    a fallback scorecard.

    JSON ``null`` text fields and list items are treated as missing rather
    than being rendered as the literal text "None".

    Voice mode note:
      This function is input-source agnostic — it receives only the text
      output from the model and does not need to change for voice mode.
    """
    parsed = safe_json_parse(raw_text)
    if not parsed or not isinstance(parsed, dict):
        return None

    # Validate and normalise scores dict
    raw_scores = parsed.get("scores", {})
    if not isinstance(raw_scores, dict):
        raw_scores = {}

    # Sorted so the key order of "scores" is stable across runs; iterating
    # the set directly depends on string hash randomisation.
    dims = sorted(_REQUIRED_SCORECARD_DIMS)
    scores: dict[str, Any] = {dim: _validate_dim(raw_scores.get(dim)) for dim in dims}

    # overall: prefer explicit field, else average of dimension scores
    if parsed.get("overall") is not None:
        overall = _coerce_score(parsed["overall"])
    else:
        dim_scores = [scores[dim]["score"] for dim in dims]
        overall = round(sum(dim_scores) / len(dim_scores)) if dim_scores else 0

    def _str(key: str, default: str = "") -> str:
        value = parsed.get(key)
        if value is None:
            return default
        return str(value).strip() or default

    def _list_of_str(key: str) -> list[str]:
        val = parsed.get(key, [])
        if not isinstance(val, list):
            return []
        return [str(v).strip() for v in val if v is not None and str(v).strip()]

    top_3 = _list_of_str("top_3_questions")[:3]
    # Pad to 3 if model returned fewer
    while len(top_3) < 3:
        top_3.append("What concrete evidence do you have to support this claim?")

    return {
        "overall": overall,
        "overall_label": _score_label(overall),
        "scores": scores,
        "best_answer": _str("best_answer", "Not identified."),
        "weakest_answer": _str("weakest_answer", "Not identified."),
        "why_weak": _str("why_weak", ""),
        "improved_answer": _str("improved_answer", ""),
        "improved_pitch": _str("improved_pitch", ""),
        "top_3_questions": top_3,
    }