File size: 3,287 Bytes
a516256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import sys
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

def train_baseline():
    """
    Train a TF-IDF + CalibratedClassifierCV(LogisticRegression) pipeline
    and save it as models/ticket_classifier/sklearn_router.pkl.

    Data source: data/raw/support_tickets.csv (columns 'text' and 'category').
    If the CSV is absent or unreadable, falls back to synthetic data from
    get_synthetic_data().

    Side effects: changes the process working directory to the project root,
    creates models/ticket_classifier/ if needed, and writes the pickled
    pipeline there.
    """
    print("Training TF-IDF + Logistic Regression baseline pipeline...")
    
    # Ensure working directory is project root (two levels above this file)
    # so the relative data/ and models/ paths below resolve consistently.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.chdir(project_root)
    
    data_path = os.path.join("data", "raw", "support_tickets.csv")
    
    # Generate some fallback synthetic data if CSV is not present
    if os.path.exists(data_path):
        print(f"Loading data from {data_path}...")
        try:
            df = pd.read_csv(data_path)
            if 'text' in df.columns and 'category' in df.columns:
                # BUG FIX: drop rows where EITHER column is missing, so texts
                # and labels stay aligned row-for-row. The previous code
                # called .dropna() on each column independently, which could
                # produce lists of different lengths with mismatched
                # (text, label) pairs whenever NaNs appeared in only one column.
                df = df.dropna(subset=['text', 'category'])
                texts = df['text'].astype(str).tolist()
                labels = df['category'].astype(str).tolist()
            else:
                # Raised inside the try so the except below downgrades a
                # malformed CSV to the synthetic-data fallback.
                raise ValueError("CSV missing 'text' or 'category' columns.")
        except Exception as e:
            print(f"Error reading CSV: {e}. Falling back to synthetic data.")
            texts, labels = get_synthetic_data()
    else:
        print(f"{data_path} not found. Generating synthetic baseline data...")
        texts, labels = get_synthetic_data()

    print(f"Training on {len(texts)} samples...")
    
    # TF-IDF (uni+bigrams, capped vocab) feeding a calibrated, class-balanced
    # logistic regression so predict_proba outputs are usable as confidences.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))),
        ('clf', CalibratedClassifierCV(LogisticRegression(class_weight='balanced', max_iter=1000))),
    ])

    # Fit the pipeline
    pipeline.fit(texts, labels)

    # Save the model
    out_dir = os.path.join("models", "ticket_classifier")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "sklearn_router.pkl")
    
    with open(out_path, 'wb') as f:
        pickle.dump(pipeline, f)
    
    print(f"Baseline model successfully saved to {out_path}")

def get_synthetic_data():
    """Return (texts, labels) lists of synthetic tickets for fallback training.

    Each category's five seed phrases are repeated 50 times, yielding
    1000 aligned samples (250 per category) in a fixed, deterministic order.
    """
    base_texts = {
        'billing': ["invoice is wrong", "charge on my card", "cancel subscription", "refund request", "pricing plan"],
        'technical_support': ["server is down", "cannot login", "getting 500 error", "bug in the app", "export failing"],
        'sales': ["want to upgrade", "talk to sales", "enterprise pricing", "demo request", "more seats"],
        'account_management': ["change password", "update email", "delete account", "add user", "role permissions"]
    }
    categories = ['billing', 'technical_support', 'sales', 'account_management']

    texts = []
    labels = []
    # Repeat each category's seed phrases 50 times to get a dataset large
    # enough for the baseline fit; order matches iterating seeds per pass.
    for cat in categories:
        seeds = base_texts[cat]
        texts.extend(seeds * 50)
        labels.extend([cat] * (len(seeds) * 50))

    return texts, labels

# Script entry point: train and persist the baseline router model.
if __name__ == "__main__":
    train_baseline()