Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import random | |
| import google.generativeai as genai | |
# 1. Gemini client setup.
# The API key comes from the GEMINI_API_KEY environment variable; when the
# variable is unset the key falls back to an empty string.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)

# Shared model handle used by generate_raw_transcript().
model = genai.GenerativeModel("gemini-2.5-flash")
# 2. Scenario catalogue.
# Every entry has the form "<intent_label>: <scenario description>". The
# complete string is what gets interpolated into the generation prompt.
INTENTS = [
    # Short, single-issue requests.
    "dispute_charge: $50 at CoffeeCloud",
    "travel_notice: going to Japan",
    "card_replacement: lost at gym",
    "check_balance: current checking account",
    "increase_limit: needs $5000 for wedding",
    "reset_password: locked out of app",
    # Longer scenarios that also describe the customer's situation and mood.
    (
        "stealth_dispute: Customer sees a $215.50 charge from 'TechStore Online', "
        "but they still possess their physical card. "
        "They suspect details were stolen online."
    ),
    (
        "urgent_freeze: Customer lost their wallet on the subway 10 minutes ago. "
        "They are panicking and need all cards frozen immediately."
    ),
    (
        "app_glitch: Customer is locked out of the mobile app because it keeps "
        "crashing on startup. "
        "Their account is actually fine, but they think they were hacked."
    ),
    (
        "fee_negotiation: Customer is furious about a $35 overdraft fee because "
        "their direct deposit was 1 day late. "
        "They are threatening to close the account if it isn't waived."
    ),
    (
        "out_of_bounds: Customer wants to negotiate a lower interest rate on their "
        "mortgage because a competitor offered them 6.2%. "
        "The AI is not authorized to do this."
    ),
    (
        "bounced_rent: Customer tried to wire $1,200 to their landlord for rent, "
        "but the recipient claims they never got it. "
        "Customer is extremely stressed about eviction."
    ),
]
def generate_raw_transcript(intent: str) -> str:
    """Ask Gemini for one scripted banking-support call and return its text.

    The prompt embeds ``intent`` twice, spells out the line formats
    (``Agent_Speak:``, ``Agent_Tool:``, ``Customer:``, ``System:``), lists the
    tools the agent may call and shows a short example exchange.

    Args:
        intent: Scenario description interpolated into the prompt.

    Returns:
        The ``text`` attribute of the model response.
    """
    # Doubled braces ({{ }}) are f-string escapes for the literal JSON braces.
    instructions = f"""Write a highly realistic, multi-turn call center transcript for a banking customer with this intent: {intent}.
RULES:
1. Format for Agent speaking: Agent_Speak: <text>
2. Format for Agent tool use: Agent_Tool: <tool_name> | <json_args>
3. Format for Customer: Customer: <text>
4. Format for System: System: <text> (Show the mock JSON result of a tool call)
5. THE KYC BOTTLENECK: The Agent MUST ask for a phone number to use `fetch_account_info`, AND THEN ask for a security PIN to use `verify_kyc` BEFORE making any account changes or revealing sensitive data!
6. Do NOT solve the issue in one turn. The Agent must probe and investigate.
7. DO NOT use markdown code blocks. Return ONLY raw text.
AVAILABLE TOOLS:
- fetch_account_info | {{"phone_number": "string"}}
- verify_kyc | {{"account_id": "string", "security_pin": "string"}}
- query_transactions | {{"account_id": "string", "days_back": "integer"}}
- execute_account_action | {{"account_id": "string", "action_type": "string", "amount": "float", "notes": "string"}}
- update_card_status | {{"account_id": "string", "status": "string"}}
- escalate_to_human | {{"department": "string", "summary": "string"}}
EXAMPLE WORKFLOW:
Agent_Speak: Welcome to the bank. Can I get your phone number?
Customer: It's 555-0192.
Agent_Tool: fetch_account_info | {{"phone_number": "555-0192"}}
System: {{"account_id": "ACC-778", "name": "Jane Doe"}}
Agent_Speak: Thanks Jane. Could you verify your 4-digit security PIN?
Customer: It is 1234.
Agent_Tool: verify_kyc | {{"account_id": "ACC-778", "security_pin": "1234"}}
System: {{"kyc_status": "passed"}}
Agent_Speak: Thank you. How can I help you today?
Now, write a successful 6-12 turn transcript for the intent: {intent}.
Start with "System: Call connected."
"""
    # Sampling settings are built separately so the request call stays short.
    sampling = genai.types.GenerationConfig(
        temperature=0.6,
        max_output_tokens=1500,
    )
    reply = model.generate_content(instructions, generation_config=sampling)
    return reply.text
def parse_transcript_to_sharegpt(transcript: str) -> dict:
    """Convert a plain-text transcript into a ShareGPT-style conversation.

    Lines starting with ``Customer:`` or ``System:`` are buffered (prefix
    included) and emitted as a single ``user`` message right before the next
    agent line, or at the end of the transcript. Each ``Agent_Speak:`` or
    ``Agent_Tool:`` line becomes one ``assistant`` message whose content is a
    JSON string with the keys ``action_type``, ``content`` and ``tool_args``.
    Blank lines and lines with any other prefix are ignored.

    Args:
        transcript: Raw transcript text. Stray Markdown code fences
            (```` ```text ```` / ```` ``` ````) are removed before parsing.

    Returns:
        ``{"conversations": [{"role": ..., "content": ...}, ...]}``.
    """
    speak_prefix = "Agent_Speak:"
    tool_prefix = "Agent_Tool:"

    conversation = []
    pending_user_lines = []

    def flush_pending_user() -> None:
        """Emit buffered Customer/System lines as one user message."""
        if pending_user_lines:
            conversation.append(
                {"role": "user", "content": "\n".join(pending_user_lines)}
            )
            pending_user_lines.clear()

    cleaned = transcript.replace("```text", "").replace("```", "").strip()
    for raw_line in cleaned.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(("Customer:", "System:")):
            pending_user_lines.append(line)
            continue

        if line.startswith(speak_prefix):
            flush_pending_user()
            # Drop only the leading marker; a later "Agent_Speak:" inside the
            # spoken text belongs to the content and must survive.
            action = {
                "action_type": "speak",
                "content": line[len(speak_prefix):].strip(),
                "tool_args": {},
            }
        elif line.startswith(tool_prefix):
            flush_pending_user()
            # Split on the FIRST pipe only: the JSON arguments may legitimately
            # contain "|" inside string values (e.g. free-text notes).
            name_part, separator, args_part = line[len(tool_prefix):].partition("|")
            tool_args = {}
            if separator:
                try:
                    parsed = json.loads(args_part.strip())
                except json.JSONDecodeError:
                    # Best effort: keep the tool call, drop unparseable args.
                    parsed = None
                # Keep only JSON objects so tool_args always has one shape.
                if isinstance(parsed, dict):
                    tool_args = parsed
            action = {
                "action_type": "tool_call",
                "content": name_part.strip(),
                "tool_args": tool_args,
            }
        else:
            # Unknown prefix (narration, stray fence language tag, ...): skip.
            continue

        conversation.append({"role": "assistant", "content": json.dumps(action)})

    flush_pending_user()
    return {"conversations": conversation}
def build_dataset(num_samples: int = 50, output_path: str = "sft_data.json"):
    """Generate synthetic transcripts and save them as a ShareGPT JSON file.

    For each sample a random entry of ``INTENTS`` is passed to
    ``generate_raw_transcript`` and the reply is converted with
    ``parse_transcript_to_sharegpt``. Conversations with two or fewer messages
    are discarded. A generation that raises is reported and skipped so that a
    single bad sample does not abort the whole run.

    Args:
        num_samples: Number of generation attempts. The number of saved
            examples can be lower because failures and too-short
            conversations are dropped.
        output_path: Destination JSON file. Defaults to ``sft_data.json``
            relative to the current working directory.

    Returns:
        The list of conversations that was written to ``output_path``.
    """
    dataset = []
    print(f"Generating {num_samples} synthetic trajectories with Gemini. This will be fast...")
    for i in range(num_samples):
        intent = random.choice(INTENTS)
        print(f"[{i+1}/{num_samples}] Generating: {intent}")
        try:
            raw_text = generate_raw_transcript(intent)
            sharegpt_format = parse_transcript_to_sharegpt(raw_text)
        except Exception as e:
            # Per-sample boundary: API/network/parsing errors must not stop
            # the remaining samples from being generated.
            print(f"Skipping failed generation: {e}")
            continue

        turns = sharegpt_format["conversations"]
        if len(turns) > 2:
            dataset.append(sharegpt_format)
        else:
            # Report the drop so the final count is not a surprise.
            print(f"Skipping too-short transcript ({len(turns)} messages)")

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)
    print(f"✅ Successfully saved {len(dataset)} examples to {output_path}")
    return dataset
if __name__ == "__main__":
    # Script entry point: request 100 samples. The original author's note says
    # 50 to 100 examples are enough for Unsloth to learn the JSON format.
    build_dataset(100)