Spaces:
Running
Running
File size: 6,877 Bytes
# data/prepare_kaggle_data.py
# NOTE(review): "4a384ab" appeared next to this header in the scraped page; presumably the file's revision id.
# Combines Kaggle customer support tickets with synthetic tickets for the categories the Kaggle data does not cover.
import os
import csv
import random
# Seed the global RNG once so every random draw made by this script is repeatable.
random.seed(42)

# Filesystem layout, derived from this file's own location.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(BASE_DIR)
# The raw Kaggle export is looked up one directory above ROOT_DIR.
KAGGLE_CSV = os.path.join(ROOT_DIR, '..', 'customer_support_tickets.csv')
PROC_DIR = os.path.join(BASE_DIR, 'processed')

# Target classes; a category's position in this list is its integer label.
CATEGORIES = [
    'billing',
    'technical_support',
    'account_management',
    'feature_request',
    'compliance_legal',
    'onboarding',
    'general_inquiry',
    'churn_risk',
]
CATEGORY_MAP = dict(zip(CATEGORIES, range(len(CATEGORIES))))

# Kaggle 'Ticket Type' value -> one of CATEGORIES.
KAGGLE_MAPPING = {
    'Billing inquiry': 'billing',
    'Refund request': 'billing',
    'Technical issue': 'technical_support',
    'Cancellation request': 'churn_risk',
    'Product inquiry': 'general_inquiry',
}
def load_kaggle_data(csv_path=None):
    """Load Kaggle support tickets and map them onto the project's categories.

    A row is kept only when its 'Ticket Description' has at least 10
    characters after stripping and its 'Ticket Type' is a key of
    KAGGLE_MAPPING. Rows with a missing description are skipped.

    Args:
        csv_path: Optional path of the CSV file to read. Defaults to
            KAGGLE_CSV when omitted.

    Returns:
        A list of dicts with the keys 'text', 'label', 'category' and
        'source' (always 'kaggle'). The list is empty when the file does
        not exist; in that case a message is printed.
    """
    path = KAGGLE_CSV if csv_path is None else csv_path
    tickets = []
    if not os.path.exists(path):
        print(f"Kaggle CSV not found at {path}")
        return tickets

    # newline='' hands line-ending handling to the csv module, so quoted
    # descriptions that span several lines are read back unchanged.
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None,
            # so normalise to '' before calling string methods.
            ttype = row.get('Ticket Type') or ''
            desc = (row.get('Ticket Description') or '').strip()
            if len(desc) < 10:
                continue
            cat = KAGGLE_MAPPING.get(ttype)
            if cat is None:
                # Ticket type we have no category for.
                continue
            tickets.append({
                'text': desc,
                'label': CATEGORY_MAP[cat],
                'category': cat,
                'source': 'kaggle',
            })
    return tickets
def get_synthetic_data_for_missing():
    """Return synthetic tickets for the categories the Kaggle data lacks.

    Calls generate_synthetic_data() from the sibling preprocess module,
    keeps only the tickets whose 'category' is one of the four classes
    listed below, and tags each kept ticket with source='synthetic'.

    Returns:
        The list of kept ticket dicts, or an empty list (after printing a
        message) when an ImportError is raised.
    """
    import sys

    # Put this file's directory on the import path so preprocess.py resolves.
    sys.path.append(BASE_DIR)
    # Classes that the Kaggle dataset does not cover well.
    wanted = ('account_management', 'feature_request', 'compliance_legal', 'onboarding')
    try:
        from preprocess import generate_synthetic_data

        generated = generate_synthetic_data()
        selected = []
        for ticket in generated:
            if ticket['category'] in wanted:
                selected.append(ticket)
        for ticket in selected:
            ticket['source'] = 'synthetic'
    except ImportError:
        print("Could not import preprocess.py")
        return []
    return selected
def extract_sla_kaggle_data(csv_path=None):
    """Extract rows for SLA predictor training from the Kaggle dataset.

    Each usable CSV row becomes one feature dict. Some features are derived
    from the row (description length, priority, age, satisfaction rating,
    status); 'hour_of_day', 'day_of_week', 'similar_ticket_avg_hrs' and
    'repeat_issue' are drawn from the global ``random`` generator.

    Rows whose 'Customer Age' or 'Customer Satisfaction Rating' cannot be
    parsed as a number are skipped. This includes truncated rows where the
    age field is missing.

    Args:
        csv_path: Optional path of the CSV file to read. Defaults to
            KAGGLE_CSV when omitted.

    Returns:
        A list of feature dicts; empty when the file does not exist.
    """
    path = KAGGLE_CSV if csv_path is None else csv_path
    sla_data = []
    if not os.path.exists(path):
        return sla_data

    # Queue-depth heuristic per priority; any other value falls back to 10.
    queue_depth_by_priority = {'Critical': 20, 'High': 15, 'Low': 5}

    # newline='' hands line-ending handling to the csv module, so quoted
    # descriptions that span several lines are read back unchanged.
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # Parse the numeric fields first. DictReader yields None for
            # fields missing from a short row, which raises TypeError
            # rather than ValueError, so both are treated as a bad row.
            try:
                age = int(row.get('Customer Age', 30))
                sat_rating = row.get('Customer Satisfaction Rating') or ''
                # Maps a 1-5 rating to -1.0..1.0; no rating means neutral.
                se = (float(sat_rating) - 3.0) / 2.0 if sat_rating else 0.0
            except (TypeError, ValueError):
                continue

            # Text complexity score (heuristic: length / 20, clamped to 3..18).
            desc = row.get('Ticket Description') or ''
            tc = max(3.0, min(len(desc) / 20.0, 18.0))

            priority = row.get('Ticket Priority', 'Medium')
            qd = queue_depth_by_priority.get(priority, 10)

            # Tier based on age (arbitrary, for demo purposes).
            if age > 50:
                ct = 4
            elif age > 35:
                ct = 3
            elif age > 25:
                ct = 2
            else:
                ct = 1

            # Breach heuristic: urgent ticket still waiting on the customer.
            status = row.get('Ticket Status', '')
            urgent = priority in ('Critical', 'High')
            breach = 1 if urgent and status == 'Pending Customer Response' else 0

            sla_data.append({
                'text_complexity_score': round(tc, 2),
                'agent_queue_depth': qd,
                'customer_tier': ct,
                'hour_of_day': random.randint(8, 18),
                'day_of_week': random.randint(0, 4),
                'similar_ticket_avg_hrs': round(random.uniform(1.0, 24.0), 2),
                'sentiment_score': round(se, 2),
                'repeat_issue': random.randint(0, 1),
                'escalated_before': 1 if priority == 'Critical' else 0,
                'sla_breached': breach,
            })
    return sla_data
def main():
    """Build the classifier train/val CSVs and the SLA training CSV.

    Steps, in order:
      1. Create PROC_DIR if needed.
      2. Load Kaggle tickets and synthetic tickets, concatenate and shuffle.
      3. Keep at most 5000 tickets and print the per-category counts.
      4. Write the first 80% to train.csv and the rest to val.csv.
      5. Extract SLA rows, sample at most 10000 of them, and write them to
         sla_train.csv (skipped when no SLA rows were produced).
    """
    os.makedirs(PROC_DIR, exist_ok=True)

    print("Loading Kaggle data...")
    kaggle_tickets = load_kaggle_data()
    print(f"Loaded {len(kaggle_tickets)} valid Kaggle tickets.")

    print("Loading Synthetic data for missing categories...")
    synth_tickets = get_synthetic_data_for_missing()
    print(f"Loaded {len(synth_tickets)} synthetic tickets.")

    pool = kaggle_tickets + synth_tickets
    random.shuffle(pool)

    # Subsample to keep training fast but effective (~5000 tickets).
    max_tickets = 5000
    if len(pool) > max_tickets:
        pool = pool[:max_tickets]
        print(f"Subsampled down to {len(pool)} tickets for efficient training.")

    print("\nCategory distribution for training:")
    tally = dict.fromkeys(CATEGORIES, 0)
    for ticket in pool:
        tally[ticket['category']] += 1
    for name, count in tally.items():
        print(f" {name:20s}: {count}")

    # 80/20 split on the already-shuffled pool.
    cut = int(len(pool) * 0.8)
    splits = (
        (pool[:cut], 'train.csv'),
        (pool[cut:], 'val.csv'),
    )
    for rows, filename in splits:
        out_path = os.path.join(PROC_DIR, filename)
        with open(out_path, 'w', newline='', encoding='utf-8') as out:
            writer = csv.DictWriter(out, fieldnames=['text', 'label', 'category', 'source'])
            writer.writeheader()
            writer.writerows(rows)
        print(f"Saved {len(rows)} rows to {filename}")

    # SLA predictor data.
    print("\nProcessing Kaggle SLA data...")
    sla_rows = extract_sla_kaggle_data()
    if len(sla_rows) > 10000:
        sla_rows = random.sample(sla_rows, 10000)
    if not sla_rows:
        return

    sla_path = os.path.join(PROC_DIR, 'sla_train.csv')
    with open(sla_path, 'w', newline='', encoding='utf-8') as out:
        # Column order follows the key order of the first row.
        writer = csv.DictWriter(out, fieldnames=list(sla_rows[0]))
        writer.writeheader()
        writer.writerows(sla_rows)
    print(f"Saved {len(sla_rows)} SLA rows to sla_train.csv")
# Run the preparation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|