Ram Narayanan committed on
Commit
cfeeaa8
·
1 Parent(s): 9953400

Added some generated minimal sft dataset and updated dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +6 -74
  2. generate_sft_content.py +151 -0
  3. sft_data.json +0 -0
Dockerfile CHANGED
@@ -1,80 +1,12 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- # Multi-stage build using openenv-base
8
- # This Dockerfile is flexible and works for both:
9
- # - In-repo environments (with local OpenEnv sources)
10
- # - Standalone environments (with openenv from PyPI/Git)
11
- # The build script (openenv build) handles context detection and sets appropriate build args.
12
-
13
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
- FROM ${BASE_IMAGE} AS builder
15
 
16
  WORKDIR /app
17
 
18
- # Ensure git is available (required for installing dependencies from VCS)
19
- RUN apt-get update && \
20
- apt-get install -y --no-install-recommends git && \
21
- rm -rf /var/lib/apt/lists/*
22
-
23
- # Build argument to control whether we're building standalone or in-repo
24
- ARG BUILD_MODE=in-repo
25
- ARG ENV_NAME=customer_env
26
-
27
- # Copy environment code (always at root of build context)
28
- COPY . /app/env
29
-
30
- # For in-repo builds, openenv is already vendored in the build context
31
- # For standalone builds, openenv will be installed via pyproject.toml
32
- WORKDIR /app/env
33
-
34
- # Ensure uv is available (for local builds where base image lacks it)
35
- RUN if ! command -v uv >/dev/null 2>&1; then \
36
- curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
- mv /root/.local/bin/uv /usr/local/bin/uv && \
38
- mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
- fi
40
-
41
- # Install dependencies using uv sync
42
- # If uv.lock exists, use it; otherwise resolve on the fly
43
- RUN --mount=type=cache,target=/root/.cache/uv \
44
- if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
- else \
47
- uv sync --no-install-project --no-editable; \
48
- fi
49
-
50
- RUN --mount=type=cache,target=/root/.cache/uv \
51
- if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
- else \
54
- uv sync --no-editable; \
55
- fi
56
-
57
- # Final runtime stage
58
- FROM ${BASE_IMAGE}
59
-
60
- WORKDIR /app
61
-
62
- # Copy the virtual environment from builder
63
- COPY --from=builder /app/env/.venv /app/.venv
64
-
65
- # Copy the environment code
66
- COPY --from=builder /app/env /app/env
67
-
68
- # Set PATH to use the virtual environment
69
- ENV PATH="/app/.venv/bin:$PATH"
70
 
71
- # Set PYTHONPATH so imports work correctly
72
- ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
 
74
- # Health check
75
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
- CMD curl -f http://localhost:8000/health || exit 1
77
 
78
- # Run the FastAPI server
79
- # The module path is constructed to work with the /app/env structure
80
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
1
FROM python:3.10-slim

WORKDIR /app

# COPY requires an explicit destination; the bare "COPY pyproject.toml" form
# is invalid and the image build fails at this step. Copy it into /app.
# Copying only the manifest first keeps the dependency layer cacheable.
COPY pyproject.toml .

# Pinned lower bounds for the runtime stack; --no-cache-dir keeps the image slim.
RUN pip install --no-cache-dir "openenv-core[core]>=0.1.0" "fastapi>=0.115.0" "pydantic>=2.0.0" "uvicorn>=0.24.0" "requests>=2.31.0"

# Copy the application source after installing dependencies.
COPY . .
EXPOSE 7860

# Run the FastAPI server on the port exposed above.
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
generate_sft_content.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import random
import google.generativeai as genai

# 1. Configure Gemini API
# Key is read from the environment; defaults to "" when unset, so a missing
# key fails at request time (inside generate_content), not at import time.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)

# Single shared model handle used by generate_raw_transcript().
model = genai.GenerativeModel('gemini-2.5-flash')

# 2. Define the Intents
# "name: scenario" seed strings; build_dataset() picks one at random per
# sample and feeds it into the transcript-generation prompt.
INTENTS = [
    "dispute_charge: $50 at CoffeeCloud",
    "travel_notice: going to Japan",
    "card_replacement: lost at gym",
    "check_balance: current checking account",
    "increase_limit: needs $5000 for wedding",
    "reset_password: locked out of app",
    "stealth_dispute: Customer sees a $215.50 charge from 'TechStore Online', but they still possess their physical card. They suspect details were stolen online.",
    "urgent_freeze: Customer lost their wallet on the subway 10 minutes ago. They are panicking and need all cards frozen immediately.",
    "app_glitch: Customer is locked out of the mobile app because it keeps crashing on startup. Their account is actually fine, but they think they were hacked.",
    "fee_negotiation: Customer is furious about a $35 overdraft fee because their direct deposit was 1 day late. They are threatening to close the account if it isn't waived.",
    "out_of_bounds: Customer wants to negotiate a lower interest rate on their mortgage because a competitor offered them 6.2%. The AI is not authorized to do this.",
    "bounced_rent: Customer tried to wire $1,200 to their landlord for rent, but the recipient claims they never got it. Customer is extremely stressed about eviction."
]
27
+
28
def generate_raw_transcript(intent: str) -> str:
    """Forces Gemini to write a perfect, successful support call using complex tools.

    Args:
        intent: A "name: scenario" seed string (one of INTENTS) describing
            the customer's goal for the call.

    Returns:
        The raw transcript text returned by the model — speaker-tagged
        lines (Agent_Speak / Agent_Tool / Customer / System).

    NOTE(review): relies on the module-level `model` and the GEMINI_API_KEY
    configured at import; an invalid/missing key surfaces here as an SDK error.
    """
    # The prompt pins an exact line format so parse_transcript_to_sharegpt()
    # can deterministically parse the output. Doubled braces ({{ }}) render
    # as literal braces inside this f-string.
    prompt = f"""Write a highly realistic, multi-turn call center transcript for a banking customer with this intent: {intent}.

RULES:
1. Format for Agent speaking: Agent_Speak: <text>
2. Format for Agent tool use: Agent_Tool: <tool_name> | <json_args>
3. Format for Customer: Customer: <text>
4. Format for System: System: <text> (Show the mock JSON result of a tool call)
5. THE KYC BOTTLENECK: The Agent MUST ask for a phone number to use `fetch_account_info`, AND THEN ask for a security PIN to use `verify_kyc` BEFORE making any account changes or revealing sensitive data!
6. Do NOT solve the issue in one turn. The Agent must probe and investigate.
7. DO NOT use markdown code blocks. Return ONLY raw text.

AVAILABLE TOOLS:
- fetch_account_info | {{"phone_number": "string"}}
- verify_kyc | {{"account_id": "string", "security_pin": "string"}}
- query_transactions | {{"account_id": "string", "days_back": "integer"}}
- execute_account_action | {{"account_id": "string", "action_type": "string", "amount": "float", "notes": "string"}}
- update_card_status | {{"account_id": "string", "status": "string"}}
- escalate_to_human | {{"department": "string", "summary": "string"}}

EXAMPLE WORKFLOW:
Agent_Speak: Welcome to the bank. Can I get your phone number?
Customer: It's 555-0192.
Agent_Tool: fetch_account_info | {{"phone_number": "555-0192"}}
System: {{"account_id": "ACC-778", "name": "Jane Doe"}}
Agent_Speak: Thanks Jane. Could you verify your 4-digit security PIN?
Customer: It is 1234.
Agent_Tool: verify_kyc | {{"account_id": "ACC-778", "security_pin": "1234"}}
System: {{"kyc_status": "passed"}}
Agent_Speak: Thank you. How can I help you today?

Now, write a successful 6-12 turn transcript for the intent: {intent}.
Start with "System: Call connected."
"""

    # Moderate temperature keeps variety without breaking the strict line
    # format; 1500 output tokens comfortably fits a 6-12 turn call.
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            temperature=0.6,
            max_output_tokens=1500,
        )
    )
    return response.text
72
+
73
def parse_transcript_to_sharegpt(transcript: str) -> dict:
    """Converts the text transcript into the JSON format needed for Unsloth SFT.

    Consecutive ``Customer:`` / ``System:`` lines are merged into a single
    "user" turn; each ``Agent_Speak:`` / ``Agent_Tool:`` line becomes an
    "assistant" turn whose content is a JSON-encoded action object with keys
    ``action_type``, ``content`` and ``tool_args``.

    Args:
        transcript: Raw transcript text, one speaker-tagged utterance per line.

    Returns:
        ``{"conversations": [...]}`` — a list of ``{"role", "content"}`` dicts.
        Lines with no recognized speaker tag are silently dropped.
    """
    conversation = []

    # Strip any markdown code fences the model may emit despite the prompt.
    clean_transcript = transcript.replace("```text", "").replace("```", "").strip()
    lines = clean_transcript.split('\n')

    # Buffer for consecutive Customer/System lines -> one user message.
    current_user_msg = ""

    def _flush_user():
        # Emit the buffered user message (if any) before an assistant turn.
        nonlocal current_user_msg
        if current_user_msg:
            conversation.append({"role": "user", "content": current_user_msg.strip()})
            current_user_msg = ""

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if line.startswith("Customer:") or line.startswith("System:"):
            current_user_msg += line + "\n"

        elif line.startswith("Agent_Speak:"):
            _flush_user()
            content = line.replace("Agent_Speak:", "").strip()
            action_json = {
                "action_type": "speak",
                "content": content,
                "tool_args": {}
            }
            conversation.append({"role": "assistant", "content": json.dumps(action_json)})

        elif line.startswith("Agent_Tool:"):
            _flush_user()
            # Split only on the FIRST "|": JSON arguments may legitimately
            # contain "|" inside string values; an unbounded split would
            # truncate them and silently degrade tool_args to {}.
            parts = line.replace("Agent_Tool:", "").split("|", 1)
            tool_name = parts[0].strip()
            # Safely parse JSON arguments; fall back to {} on malformed output.
            try:
                tool_args = json.loads(parts[1].strip()) if len(parts) > 1 else {}
            except json.JSONDecodeError:
                tool_args = {}

            action_json = {
                "action_type": "tool_call",
                "content": tool_name,
                "tool_args": tool_args
            }
            conversation.append({"role": "assistant", "content": json.dumps(action_json)})

    # Trailing Customer/System lines become a final user turn.
    _flush_user()

    return {"conversations": conversation}
126
+
127
def build_dataset(num_samples: int = 50):
    """Generate synthetic SFT trajectories and write them to sft_data.json.

    For each sample, a random intent from INTENTS is turned into a raw
    transcript via generate_raw_transcript(), parsed into ShareGPT format,
    and kept only if it has more than two turns. Generation failures are
    logged and skipped so one bad call never aborts the run.
    """
    examples = []
    print(f"Generating {num_samples} synthetic trajectories with Gemini. This will be fast...")

    for idx in range(num_samples):
        intent = random.choice(INTENTS)
        print(f"[{idx + 1}/{num_samples}] Generating: {intent}")

        try:
            record = parse_transcript_to_sharegpt(generate_raw_transcript(intent))
        except Exception as err:
            # Best-effort: log and move on to the next sample.
            print(f"Skipping failed generation: {err}")
            continue

        # Keep only multi-turn conversations; trivial ones add no signal.
        if len(record["conversations"]) > 2:
            examples.append(record)

    with open("sft_data.json", "w") as f:
        json.dump(examples, f, indent=2)

    print(f"✅ Successfully saved {len(examples)} examples to sft_data.json")
148
+
149
# Script entry point: generate the dataset and write sft_data.json.
if __name__ == "__main__":
    # 50 to 100 is plenty for Unsloth to learn the JSON format!
    build_dataset(num_samples=100)
sft_data.json ADDED
The diff for this file is too large to render. See raw diff