# voice_agent / generate_sft_content.py
# Ram Narayanan
# Added some generated minimal sft dataset and updated dockerfile
# cfeeaa8
import os
import json
import random
import google.generativeai as genai
# 1. Configure Gemini API
# The key is read from the GEMINI_API_KEY environment variable; when the
# variable is unset an empty string is passed to the SDK instead.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)
# Shared model handle used by generate_raw_transcript().
model = genai.GenerativeModel("gemini-2.5-flash")
# 2. Define the Intents
# Each entry is a "<label>: <scenario>" string. One entry is picked at random
# per sample and interpolated verbatim into the generation prompt.
INTENTS = [
    # Short, single-fact scenarios.
    "dispute_charge: $50 at CoffeeCloud",
    "travel_notice: going to Japan",
    "card_replacement: lost at gym",
    "check_balance: current checking account",
    "increase_limit: needs $5000 for wedding",
    "reset_password: locked out of app",
    # Longer scenarios that also describe the customer's situation and mood.
    "stealth_dispute: Customer sees a $215.50 charge from 'TechStore Online', but they still possess their physical card. They suspect details were stolen online.",
    "urgent_freeze: Customer lost their wallet on the subway 10 minutes ago. They are panicking and need all cards frozen immediately.",
    "app_glitch: Customer is locked out of the mobile app because it keeps crashing on startup. Their account is actually fine, but they think they were hacked.",
    "fee_negotiation: Customer is furious about a $35 overdraft fee because their direct deposit was 1 day late. They are threatening to close the account if it isn't waived.",
    "out_of_bounds: Customer wants to negotiate a lower interest rate on their mortgage because a competitor offered them 6.2%. The AI is not authorized to do this.",
    "bounced_rent: Customer tried to wire $1,200 to their landlord for rent, but the recipient claims they never got it. Customer is extremely stressed about eviction.",
]
def generate_raw_transcript(intent: str) -> str:
    """Ask Gemini for one scripted banking support call and return its text.

    The prompt embeds ``intent`` together with the line-format rules, the
    list of available tools and a short example workflow, then is sent to
    the module-level ``model``.

    Args:
        intent: Scenario description interpolated into the prompt.

    Returns:
        The ``text`` attribute of the model response, unmodified.
    """
    # Sampling settings: moderate temperature, output capped at 1500 tokens.
    sampling = genai.types.GenerationConfig(
        temperature=0.6,
        max_output_tokens=1500,
    )

    # Literal braces in the JSON examples are doubled because this is an
    # f-string; only {intent} is substituted.
    instructions = f"""Write a highly realistic, multi-turn call center transcript for a banking customer with this intent: {intent}.
RULES:
1. Format for Agent speaking: Agent_Speak: <text>
2. Format for Agent tool use: Agent_Tool: <tool_name> | <json_args>
3. Format for Customer: Customer: <text>
4. Format for System: System: <text> (Show the mock JSON result of a tool call)
5. THE KYC BOTTLENECK: The Agent MUST ask for a phone number to use `fetch_account_info`, AND THEN ask for a security PIN to use `verify_kyc` BEFORE making any account changes or revealing sensitive data!
6. Do NOT solve the issue in one turn. The Agent must probe and investigate.
7. DO NOT use markdown code blocks. Return ONLY raw text.
AVAILABLE TOOLS:
- fetch_account_info | {{"phone_number": "string"}}
- verify_kyc | {{"account_id": "string", "security_pin": "string"}}
- query_transactions | {{"account_id": "string", "days_back": "integer"}}
- execute_account_action | {{"account_id": "string", "action_type": "string", "amount": "float", "notes": "string"}}
- update_card_status | {{"account_id": "string", "status": "string"}}
- escalate_to_human | {{"department": "string", "summary": "string"}}
EXAMPLE WORKFLOW:
Agent_Speak: Welcome to the bank. Can I get your phone number?
Customer: It's 555-0192.
Agent_Tool: fetch_account_info | {{"phone_number": "555-0192"}}
System: {{"account_id": "ACC-778", "name": "Jane Doe"}}
Agent_Speak: Thanks Jane. Could you verify your 4-digit security PIN?
Customer: It is 1234.
Agent_Tool: verify_kyc | {{"account_id": "ACC-778", "security_pin": "1234"}}
System: {{"kyc_status": "passed"}}
Agent_Speak: Thank you. How can I help you today?
Now, write a successful 6-12 turn transcript for the intent: {intent}.
Start with "System: Call connected."
"""

    reply = model.generate_content(instructions, generation_config=sampling)
    return reply.text
def parse_transcript_to_sharegpt(transcript: str) -> dict:
    """Convert a plain-text transcript into a ShareGPT-style conversation.

    The transcript is read line by line:

    * ``Customer:`` and ``System:`` lines are buffered and emitted as a single
      ``user`` message just before the next agent line (or at the end).
    * ``Agent_Speak: <text>`` becomes an ``assistant`` message whose content
      is the JSON string of
      ``{"action_type": "speak", "content": <text>, "tool_args": {}}``.
    * ``Agent_Tool: <name> | <json>`` becomes an ``assistant`` message whose
      content is the JSON string of
      ``{"action_type": "tool_call", "content": <name>, "tool_args": <json>}``.
    * Blank lines and lines with any other prefix are ignored.

    Args:
        transcript: Raw transcript text; markdown code fences are removed
            before parsing.

    Returns:
        ``{"conversations": [...]}`` where each item is a dict with ``role``
        and ``content`` keys.
    """
    speak_prefix = "Agent_Speak:"
    tool_prefix = "Agent_Tool:"

    conversation = []
    pending_user_lines = []

    def flush_user_turn():
        # Consecutive customer/system lines form one user message.
        if pending_user_lines:
            conversation.append(
                {"role": "user", "content": "\n".join(pending_user_lines)}
            )
            pending_user_lines.clear()

    def add_assistant_action(action_type, content, tool_args):
        action = {
            "action_type": action_type,
            "content": content,
            "tool_args": tool_args,
        }
        conversation.append({"role": "assistant", "content": json.dumps(action)})

    clean_transcript = transcript.replace("```text", "").replace("```", "").strip()

    for raw_line in clean_transcript.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(("Customer:", "System:")):
            pending_user_lines.append(line)

        elif line.startswith(speak_prefix):
            flush_user_turn()
            # Drop only the leading marker; a later mention of the marker
            # inside the utterance must stay intact.
            add_assistant_action("speak", line[len(speak_prefix):].strip(), {})

        elif line.startswith(tool_prefix):
            flush_user_turn()
            # Split on the FIRST pipe only: the JSON arguments may legitimately
            # contain "|" inside string values.
            name_part, separator, args_part = line[len(tool_prefix):].partition("|")
            tool_args = {}
            if separator:
                try:
                    parsed_args = json.loads(args_part.strip())
                except json.JSONDecodeError:
                    # Best effort: keep the tool call, drop unreadable arguments.
                    parsed_args = {}
                # Tool arguments are JSON objects; any other JSON value is
                # treated like unreadable arguments.
                if isinstance(parsed_args, dict):
                    tool_args = parsed_args
            add_assistant_action("tool_call", name_part.strip(), tool_args)

    flush_user_turn()
    return {"conversations": conversation}
def build_dataset(num_samples: int = 50, output_path: str = "sft_data.json"):
    """Generate synthetic conversations and write them to a JSON file.

    For each sample a random entry of ``INTENTS`` is turned into a transcript
    by ``generate_raw_transcript`` and converted with
    ``parse_transcript_to_sharegpt``. Conversations with two or fewer
    messages are discarded. A failure while generating or parsing one sample
    is printed and that sample is skipped; the run continues.

    Args:
        num_samples: Number of generation attempts (not the guaranteed number
            of saved examples, since failed or short samples are dropped).
        output_path: Destination file for the JSON list. Defaults to
            ``sft_data.json`` in the current working directory.
    """
    dataset = []
    print(f"Generating {num_samples} synthetic trajectories with Gemini. This will be fast...")

    for i in range(num_samples):
        intent = random.choice(INTENTS)
        print(f"[{i+1}/{num_samples}] Generating: {intent}")
        try:
            raw_text = generate_raw_transcript(intent)
            sharegpt_format = parse_transcript_to_sharegpt(raw_text)
        except Exception as e:
            # Batch boundary: one bad API response must not abort the run.
            print(f"Skipping failed generation: {e}")
            continue
        # Keep only conversations with more than two messages.
        if len(sharegpt_format["conversations"]) > 2:
            dataset.append(sharegpt_format)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)
    print(f"✅ Successfully saved {len(dataset)} examples to {output_path}")
if __name__ == "__main__":
    # Script entry point. The author's sizing note: 50 to 100 examples is
    # plenty for Unsloth to learn the JSON format, so request 100.
    build_dataset(num_samples=100)