File size: 1,425 Bytes
390cebe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Data models for dataset entries and field mappings.

These are plain dataclasses so they can live inside a gr.State without any
serialization step - Gradio keeps server-side state as real Python objects
per session, so we just mutate them in place.
"""
from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from typing import Literal, Optional

# Values accepted by ``FieldMapping.kind``.
MappingKind = Literal[
    "conversation_list",
    "flat_pair",
    "unmapped",
]
# Values accepted by ``DatasetEntry.status``.
EntryStatus = Literal[
    "empty",
    "detecting",
    "needs_mapping",
    "ready",
    "error",
]


@dataclass
class FieldMapping:
    """Recipe for extracting a (system, user, assistant) triplet from a raw row.

    Attributes:
        kind: The extraction strategy; one of the ``MappingKind`` values.
        config: Strategy-specific settings, empty by default. The keys it is
            expected to carry depend on ``kind``:

            * ``conversation_list``: ``list_field``, ``role_key``,
              ``content_key``, ``human_tag``, ``gpt_tag``
            * ``flat_pair``: ``user_field``, ``assistant_field``
    """

    kind: MappingKind
    # default_factory gives every instance its own dict rather than a shared one.
    config: dict = field(default_factory=dict)


def _new_uid() -> str:
    """Return a short random id: the first 8 hex characters of a UUID4."""
    return uuid.uuid4().hex[:8]


@dataclass
class DatasetEntry:
    """A single entry (row) of the dataset-builder list.

    Every field has a default, so ``DatasetEntry()`` produces a blank entry
    with a freshly generated ``uid`` and ``status == "empty"``.
    """

    # Per-entry identifier, generated at construction time.
    uid: str = field(default_factory=_new_uid)

    # What to load and how much of it.
    # NOTE(review): presumably a dataset repo / config name / split and a
    # row cap - confirm against the code that consumes these.
    repo_id: str = ""
    subset: str = ""
    split: str = "train"
    limit: int = 1000
    system_prompt: str = ""

    # Mapping and inspection results; unset/empty until something fills them.
    # NOTE(review): presumably populated by a column-detection step - confirm.
    mapping: Optional[FieldMapping] = None
    detected_columns: list = field(default_factory=list)
    detected_list_info: Optional[dict] = None
    sample_rows: list = field(default_factory=list)

    # Current state (one of the ``EntryStatus`` values) plus a free-form
    # message; presumably only meaningful when status is "error".
    status: EntryStatus = "empty"
    error_message: str = ""