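"""Confidence analysis pipeline.

Given a prompt, this module samples a model multiple times, re-samples
under prompt perturbations, extracts factual claims from every output,
and aggregates robustness, verifiability, calibration, and contradiction
signals into a task-type-aware confidence score, risk label, claim-level
diagnostics, and a structured explanation.
"""
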
from sampling import generate_samples
from perturbation import generate_perturbed_samples
from claims import extract_claims, overlap_score, contradiction_penalty
from verification import self_verify
from calibration import calibration_score
from scoring import final_confidence, get_confidence_level_and_reason
from risk import classify_risk
from diagnostics import evaluate_claims, highlight_text
from explainer import generate_explanation
from task_classifier import classify_prompt
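
# Assumed helper signatures, inferred from the call sites below; the
# authoritative definitions live in the imported modules:
#   generate_samples(prompt) -> list[str]            (sampling.py)
#   generate_perturbed_samples(prompt) -> list[str]  (perturbation.py)
#   extract_claims(text) -> list[str]                (claims.py)
#   overlap_score(claim_sets) -> float in [0, 1]
#   contradiction_penalty(claims) -> float in [0, 1]
#   self_verify(text) -> float; calibration_score(text) -> float
#   evaluate_claims(claims) -> list[dict] with "claim"/"status" keys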

def analyze(prompt: str) -> dict:
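    """Run the full confidence pipeline on `prompt` and return a report.

    The returned dict contains the primary answer, a claim-highlighted
    variant, the task type, confidence level and score, risk label, the
    raw signals, per-claim diagnostics, and human-readable reasons.
    """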
    # 0. Classify task type BEFORE scoring
    task_type = classify_prompt(prompt)
    print(f"[DEBUG] Task type classified as: {task_type}")

    # 1. Multi-Sampling
    samples = generate_samples(prompt)

    # 2. Prompt Perturbation
    perturbed = generate_perturbed_samples(prompt)

    all_outputs = samples + perturbed

    # 3. Extract Claims (from all outputs for robustness/contradiction)
    claim_sets = [extract_claims(o) for o in all_outputs]
    all_claims_flat = [c for claims in claim_sets for c in claims]

    # Aggregate signals
    robustness = overlap_score(claim_sets)
    contradiction = contradiction_penalty(all_claims_flat)
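
    # Hypothetical reading of these two signals, inferred from how they
    # are used here (the actual definitions are in claims.py):
    #   - overlap_score: agreement across claim sets, e.g. something like
    #     mean pairwise Jaccard, jaccard(A, B) = |A & B| / |A | B|
    #   - contradiction_penalty: grows toward 1 when claims negate each
    #     other across samples, 0 when outputs are mutually consistent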

    # 4. Primary answer = first sample (fall back to an error string if
    #    sampling returned nothing)
    primary_answer = samples[0] if samples else "Error processing."

    # 5. Verification (whole-answer score)
    verification = self_verify(primary_answer)

    # 6. Calibration
    calibration = calibration_score(primary_answer)

    # 7. Aggregated confidence — task-type aware
    confidence = final_confidence(
        robustness=robustness,
        verifiability=verification,
        calibration=calibration,
        contradiction=contradiction,
        task_type=task_type
    )
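    # Illustrative only, not the actual scoring.py implementation: a
    # plausible aggregation is a task-weighted blend such as
    #   confidence = w_r*robustness + w_v*verifiability + w_c*calibration
    #                - w_x*contradiction
    # with the weights w_* chosen per task_type (e.g. verifiability
    # weighted down for creative prompts).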
    level, reason = get_confidence_level_and_reason(confidence, contradiction, task_type)

    # 8. Risk classification — skip for creative (not meaningful)
    if task_type == "creative":
        risk = "CREATIVE"
    else:
        risk = classify_risk(confidence, contradiction, verification)

    # 9. Claim-level diagnostics on the primary answer (skip for creative);
    #    cap at 6 claims to keep per-claim verification cheap
    primary_claims = extract_claims(primary_answer)[:6]
    if task_type == "creative":
        # Claim verification doesn't apply here; tag each claim with a
        # neutral "creative" status instead of a verification verdict
        claim_results = [{"claim": c, "status": "creative"} for c in primary_claims]
    else:
        claim_results = evaluate_claims(primary_claims)

    # 10. Highlighted answer (skip for creative)
    if task_type == "creative":
        highlighted_answer = primary_answer
    else:
        highlighted_answer = highlight_text(primary_answer, claim_results)

    # 11. Structured explanation
    reasons = generate_explanation(confidence, risk, claim_results, robustness)

    # 12. Serialize claim_results for API
    claims_analysis = [
        {"text": c["claim"], "status": c["status"]}
        for c in claim_results
    ]

    return {
        "answer": primary_answer,
        "highlighted_answer": highlighted_answer,
        "task_type": task_type,
        "confidence_level": level,
        "risk": risk,
        "score": round(confidence, 4),
        "signals": {
            "robustness": round(robustness, 4),
            "verifiability": round(verification, 4),
            "calibration": round(calibration, 4),
            "contradiction": round(contradiction, 4)
        },
        "claims_analysis": claims_analysis,
        "reasons": reasons
    }
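

# Minimal usage sketch. The prompt below is a hypothetical example, and
# running it assumes the helper modules above are present and configured.
if __name__ == "__main__":
    import json

    report = analyze("When did Apollo 11 land on the Moon?")
    # The report is plain dicts/lists/strings/floats, so it serializes
    # directly (assuming generate_explanation returns JSON-friendly data).
    print(json.dumps(report, indent=2))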