File size: 3,087 Bytes
d4b664a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from datetime import datetime, timezone

from sqlalchemy import Column, Integer, String, Float, Boolean, DateTime, Text, JSON
from sqlalchemy.ext.declarative import declarative_base

# Shared declarative base class; the ORM models in this module subclass it.
Base = declarative_base()


class ConversationLog(Base):
    """ORM model mapped to the ``conversation_logs`` table.

    Each row stores one query/response pair together with bookkeeping
    columns: session id, category, prompt version, retry count and the
    creation timestamp.
    """

    __tablename__ = "conversation_logs"

    id = Column(Integer, primary_key=True, index=True)
    session_id = Column(String(64), index=True)
    # Both sides of the exchange are mandatory (NOT NULL).
    query = Column(Text, nullable=False)
    response = Column(Text, nullable=False)
    category = Column(String(64))
    prompt_version = Column(String(32), default="standard")
    retry_count = Column(Integer, default=0)
    # Naive UTC timestamp, the same value datetime.utcnow() produced.
    # utcnow() is deprecated since Python 3.12, so derive it from an aware
    # "now" and strip tzinfo to keep the stored representation unchanged.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )


class EvaluationResult(Base):
    """ORM model mapped to the ``evaluation_results`` table.

    A row groups the evaluation output recorded for one query/response
    pair: code-eval result, LLM-judge scores, hallucination flags, trust
    score with final verdict, retry bookkeeping and a deployment flag.
    """

    __tablename__ = "evaluation_results"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): presumably the id of a conversation_logs row, but no
    # ForeignKey constraint is declared -- confirm against callers.
    conversation_id = Column(Integer, index=True)
    session_id = Column(String(64), index=True)
    query = Column(Text, nullable=False)
    response = Column(Text, nullable=False)

    # Code eval
    code_eval_result = Column(String(16))  # PASS/FAIL
    code_eval_details = Column(Text)

    # LLM Judge scores (each defaults to 0.0 when not supplied)
    policy_compliance = Column(Float, default=0.0)
    faithfulness = Column(Float, default=0.0)
    relevance = Column(Float, default=0.0)
    tone = Column(Float, default=0.0)
    correctness = Column(Float, default=0.0)
    judge_verdict = Column(String(16))
    judge_reasoning = Column(Text)

    # Hallucination
    hallucination_detected = Column(Boolean, default=False)
    hallucination_details = Column(Text)
    hallucination_severity = Column(String(16), default="none")

    # Trust score
    trust_score = Column(Float, default=0.0)
    final_verdict = Column(String(16))  # PASS/FAIL

    # Retry
    is_retry = Column(Boolean, default=False)
    # NOTE(review): looks like the id of the evaluation being retried; it is
    # a plain nullable integer with no ForeignKey -- confirm.
    original_eval_id = Column(Integer, nullable=True)
    retry_trust_score = Column(Float, nullable=True)
    score_improvement = Column(Float, nullable=True)

    # Deployment
    deployment_ready = Column(Boolean, default=False)

    # Naive UTC timestamp, the same value datetime.utcnow() produced.
    # utcnow() is deprecated since Python 3.12, so derive it from an aware
    # "now" and strip tzinfo to keep the stored representation unchanged.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )


class FailureLog(Base):
    """ORM model mapped to the ``failure_logs`` table.

    A row records one failure: the query/response text, the primary
    failure reason, JSON-valued detail columns, a corrected response and
    a severity label.
    """

    __tablename__ = "failure_logs"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): presumably the id of an evaluation_results row; no
    # ForeignKey constraint is declared -- confirm against callers.
    eval_id = Column(Integer, index=True)
    session_id = Column(String(64))
    query = Column(Text)
    response = Column(Text)
    primary_failure_reason = Column(Text)
    # JSON-typed columns; the structure of their payloads is not fixed here.
    policy_violations = Column(JSON)
    hallucinations = Column(JSON)
    improvement_suggestions = Column(JSON)
    corrected_response = Column(Text)
    severity = Column(String(16))
    # Naive UTC timestamp, the same value datetime.utcnow() produced.
    # utcnow() is deprecated since Python 3.12, so derive it from an aware
    # "now" and strip tzinfo to keep the stored representation unchanged.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )


class PromptOptimizationLog(Base):
    """ORM model mapped to the ``prompt_optimization_logs`` table.

    A row pairs an original response and its trust score with an
    optimized response and its trust score, plus the recorded score
    improvement and the prompt version used.
    """

    __tablename__ = "prompt_optimization_logs"

    id = Column(Integer, primary_key=True, index=True)
    session_id = Column(String(64))
    query = Column(Text)
    original_response = Column(Text)
    original_trust_score = Column(Float)
    optimized_response = Column(Text)
    optimized_trust_score = Column(Float)
    # Stored as supplied by the writer; this model does not compute it.
    score_improvement = Column(Float)
    prompt_version_used = Column(String(32))
    # Naive UTC timestamp, the same value datetime.utcnow() produced.
    # utcnow() is deprecated since Python 3.12, so derive it from an aware
    # "now" and strip tzinfo to keep the stored representation unchanged.
    created_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )