ashish-sarvam's picture
Upload folder using huggingface_hub
fc1a684 verified
from __future__ import annotations
from collections import defaultdict
import csv
import json
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from conv_data_gen.logger import setup_logger
# Module-level logger, obtained from the project's ``setup_logger`` helper
# and keyed by this module's name.
logger = setup_logger(__name__)
def append_kv(parts: List[str], key: str, value: str, desc: str = "") -> None:
    """Append a ``- key: value`` bullet line to *parts*, in place.

    A non-empty *desc* is glued directly after the value; no separator is
    inserted here, so any spacing or punctuation has to be part of *desc*.
    """
    suffix = desc if desc else ""
    parts.append(f"- {key}: {value}{suffix}")
def compose_user_message_for_proxy(
    user_goal: str,
    user_personality: str,
    meta_data: Any,
    user_knobs_dict: Any,
    available_tools: Optional[List[Dict[str, Any]]] = None,
    available_knowledge_bases: Optional[List[Dict[str, str]]] = None,
    agent_variables: Optional[Dict[str, Any]] = None,
) -> str:
    """Compose the plain-text briefing message for the user proxy.

    The message is assembled section by section (user, task, knobs,
    language, demographics, interaction tier, agent details, tools,
    knowledge bases) and the lines are joined with newlines.

    Args:
        user_goal: Text placed after ``USER GOAL:``.
        user_personality: Text placed after ``USER DESCRIPTION:``.
        meta_data: Mapping read with ``.get`` for ``user_type``, ``company``,
            ``use_case``, ``agent_type``, ``conversation_direction`` and
            ``bot_prompt``; missing keys render as empty strings.
        user_knobs_dict: Mapping read with ``.get`` for the sections
            ``knobs``, ``language_style``, ``demographics`` and
            ``interaction`` plus their description companions. A section
            that is missing or ``None`` is treated as empty.
        available_tools: Optional list of tool dicts; each entry's ``name``
            and ``description`` are listed when the list is non-empty.
        available_knowledge_bases: Optional list of knowledge-base dicts,
            listed the same way as tools.
        agent_variables: Rendered with its default string form (``None``
            is rendered as ``None``).

    Returns:
        The assembled message as a single string.
    """

    def section(name: str) -> Dict[str, Any]:
        # A section that is absent or explicitly None behaves as empty
        # instead of failing on the first ``.get``/``.items`` call.
        return user_knobs_dict.get(name) or {}

    parts: List[str] = []

    # --- User ---
    parts.append("## ABOUT THE USER ##")
    parts.append(f"USER ROLE: {meta_data.get('user_type', '')}")
    parts.append(f"USER DESCRIPTION: {user_personality}")
    parts.append(f"USER GOAL: {user_goal}")

    # --- Task ---
    agent_type = meta_data.get("agent_type", "")
    direction = meta_data.get("conversation_direction", "")
    parts.append("## ABOUT THE TASK ##")
    parts.append(f"COMPANY: {meta_data.get('company', '')}")
    parts.append(f"USE CASE: {meta_data.get('use_case', '')}")
    parts.append(f"TYPE OF PERSON YOU WILL BE TALKING TO: {agent_type}")
    parts.append(f"DIRECTION OF THE CONVERSATION: {direction}")

    # --- Knobs ---
    parts.append("\nKNOBS:")
    knob_descriptions = section("knob_descriptions")
    for name, value in section("knobs").items():
        append_kv(parts, name, value, knob_descriptions.get(name, ""))

    # --- Language ---
    parts.append("\nLANGUAGE:")
    lang = section("language_style")
    lang_descriptions = section("language_descriptions")
    # (value key, description key) pairs; note the code-switch description
    # key does not follow the "<value key>_desc" pattern.
    for value_key, desc_key in (
        ("language", "language_desc"),
        ("formality", "formality_desc"),
        ("code_switch_ratio", "code_switch_desc"),
    ):
        append_kv(
            parts,
            value_key,
            lang.get(value_key, ""),
            lang_descriptions.get(desc_key, ""),
        )
    # Formatted rather than concatenated with "+", so a non-string value
    # (for example a list) is rendered instead of raising TypeError. The
    # leading space of this bullet is kept exactly as it was emitted before.
    parts.append(f" - regionalisms: {lang.get('regionalisms', '')}")

    # --- Demographics ---
    parts.append("\nDEMOGRAPHICS:")
    demographic_descriptions = section("demographic_descriptions")
    for name, value in section("demographics").items():
        append_kv(parts, name, value, demographic_descriptions.get(name, ""))

    # --- Interaction complexity tier ---
    parts.append("\nINTERACTION (complexity tier constraints):")
    interaction = section("interaction")
    # NOTE(review): tier details are emitted only when a tier_name is set;
    # confirm the three range lines are meant to share that guard.
    if interaction.get("tier_name"):
        parts.append(f"- tier_name: {interaction.get('tier_name', '')}")
        for label, low_key, high_key in (
            ("turn_range", "turn_min", "turn_max"),
            ("tool_calls_budget", "tool_calls_min", "tool_calls_max"),
            ("kb_queries_budget", "kb_queries_min", "kb_queries_max"),
        ):
            low = interaction.get(low_key, "")
            high = interaction.get(high_key, "")
            parts.append(f"- {label}: [{low}, {high}]")

    # --- Agent details ---
    bot_prompt = meta_data.get("bot_prompt", "")
    parts.append(f"YOUR VARIABLES AVAILABLE TO THE AGENT: {agent_variables}")
    parts.append(f"PROMPT OF THE AGENT YOU WILL BE TALKING TO: {bot_prompt}")

    if available_tools:
        parts.append("\nAVAILABLE_TOOLS:")
        for tool in available_tools:
            tool_name = tool.get("name", "unknown_tool")
            tool_desc = tool.get("description", "No description available")
            parts.append(f"- {tool_name}: {tool_desc}")

    if available_knowledge_bases:
        parts.append("\nAVAILABLE_KNOWLEDGE_BASES:")
        for kb in available_knowledge_bases:
            kb_name = kb.get("name", "unknown_kb")
            kb_desc = kb.get("description", "No description available")
            parts.append(f"- {kb_name}: {kb_desc}")

    return "\n".join(parts)
def save_persona_text(base_dir: Path, text: str, index: int) -> str:
    """Write *text* to ``persona_<index>.txt`` under *base_dir*.

    The file is written as UTF-8, the destination is logged, and the
    destination path is returned as a string.
    """
    destination = base_dir / f"persona_{index}.txt"
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(text)
    logger.info("Saved persona to %s", destination)
    return str(destination)
def append_persona_csv(csv_path: Path, rows: List[Dict[str, str]]) -> str:
    """Append *rows* to the CSV file at *csv_path*.

    Columns are the sorted union of the keys found in *rows*. A header row
    is written when the file does not exist yet or exists but is empty.
    When there are no columns to write (no rows, or only empty rows) the
    file is still created but nothing is written, so a later call can
    still emit a proper header.

    NOTE: the column set is derived from the current *rows* only; when
    appending to a file that already has a header, callers must pass rows
    with the same key set for the columns to line up.

    Args:
        csv_path: Destination CSV file; opened in append mode.
        rows: Dicts to write, one CSV row each.

    Returns:
        ``str(csv_path)``.
    """
    fieldnames = sorted({key for row in rows for key in row})

    # A header is needed for a missing file and for an existing empty one.
    try:
        needs_header = csv_path.stat().st_size == 0
    except FileNotFoundError:
        needs_header = True

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        # Without columns a header would be a blank line that later blocks
        # the real header from ever being written.
        if fieldnames:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            writer.writerows(rows)

    logger.info("Appended personas CSV at %s", csv_path)
    return str(csv_path)
def read_proxy_rows(path: str) -> List[Dict[str, Any]]:
    """Read the JSON-object lines of a JSON Lines file.

    Each line is parsed independently. Blank lines, lines that are not
    valid JSON, and lines whose JSON value is not an object are skipped.

    Args:
        path: Path of the file to read (decoded as UTF-8).

    Returns:
        One dict per JSON-object line, in file order, with keys coerced
        to ``str``.
    """
    items: List[Dict[str, Any]] = []
    with Path(path).open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            # Only the parse is guarded, and only for the errors it can
            # raise: malformed JSON (JSONDecodeError is a ValueError) or
            # pathologically deep nesting.
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                continue
            if isinstance(obj, dict):
                items.append({str(k): v for k, v in obj.items()})
    return items
def group_by_key(
    rows: List[Dict[str, Any]],
) -> Dict[Tuple[str, str, str], List[Dict[str, Any]]]:
    """Bucket *rows* by their ``(company, agent_type, use_case)`` triple.

    Each key component is the ``str()`` of the row's value, with a missing
    field counting as the empty string. Rows keep their input order inside
    each bucket. The result is a ``defaultdict(list)``.
    """
    buckets: Dict[Tuple[str, str, str], List[Dict[str, Any]]] = defaultdict(
        list
    )
    for row in rows:
        key = (
            str(row.get("company", "")),
            str(row.get("agent_type", "")),
            str(row.get("use_case", "")),
        )
        buckets[key].append(row)
    return buckets
def select_rows_for_personas(
    groups: Dict[Tuple[str, str, str], List[Dict[str, Any]]],
    per_group: int,
) -> List[Dict[str, Any]]:
    """Take the leading rows of every group and return them as one list.

    *per_group* is converted with ``int()`` and clamped to at least 1, so
    every non-empty group contributes at least one row. Groups are visited
    in the mapping's iteration order.
    """
    limit = max(1, int(per_group))
    return [row for bucket in groups.values() for row in bucket[:limit]]
def extract_flat_fields(
    spec: Dict[str, Any],
) -> Tuple[str, str, Dict[str, Any]]:
    """Return ``(user_description, goal_in_conversation, big_five_traits)``.

    The two text fields are passed through ``str()`` and default to the
    empty string. The traits entry is returned as-is when it is a dict and
    replaced by an empty dict otherwise.
    """
    description = str(spec.get("user_description", ""))
    goal = str(spec.get("goal_in_conversation", ""))
    traits = spec.get("big_five_traits", {})
    if not isinstance(traits, dict):
        traits = {}
    return description, goal, traits
def extract_user_goal(spec: Dict[str, Any]) -> str:
    """Return the ``goal_in_conversation`` entry of *spec* as a string.

    A missing entry yields the empty string.
    """
    goal = spec.get("goal_in_conversation", "")
    return str(goal)
def extract_user_personality_description(spec: Dict[str, Any]) -> str:
    """Return the ``user_description`` entry of *spec* as a string.

    A missing entry yields the empty string.
    """
    description = spec.get("user_description", "")
    return str(description)
def extract_user_knobs(spec: Dict[str, Any]) -> str:
    """Return the string form of the ``knobs`` entry of *spec*.

    Whatever value is stored is converted with ``str()``; a missing entry
    yields the empty string.
    """
    knobs = spec.get("knobs", "")
    return str(knobs)
def extract_meta_data(spec: Dict[str, Any]) -> Dict[str, Any]:
    """Collect the conversation metadata fields of *spec* into a new dict.

    The result always has the keys ``company``, ``use_case``,
    ``conversation_direction``, ``agent_type``, ``user_type`` and
    ``bot_prompt``; a field missing from *spec* maps to the empty string.

    Args:
        spec: Source mapping to read the fields from.

    Returns:
        A new dict holding exactly the six metadata keys. (The return
        annotation used to say ``str`` although a dict was returned.)
    """
    meta_keys = (
        "company",
        "use_case",
        "conversation_direction",
        "agent_type",
        "user_type",
        "bot_prompt",
    )
    return {key: spec.get(key, "") for key in meta_keys}
def user_knobs_to_dict(user_knobs: Any) -> Dict[str, Any]:
    """Copy the knob-related attributes of *user_knobs* into a plain dict.

    Each key of the result carries the value of the attribute with the
    same name; values are referenced, not copied.
    """
    attribute_names = (
        "knobs",
        "knob_descriptions",
        "language_style",
        "language_descriptions",
        "demographics",
        "demographic_descriptions",
        "interaction",
    )
    return {name: getattr(user_knobs, name) for name in attribute_names}