import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
def train_baseline():
    """
    Trains a TF-IDF + CalibratedClassifierCV(LogisticRegression) pipeline
    and saves it as models/ticket_classifier/sklearn_router.pkl.
    """
    print("Training TF-IDF + Logistic Regression baseline pipeline...")

    # Ensure the working directory is the project root
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.chdir(project_root)

    data_path = os.path.join("data", "raw", "support_tickets.csv")

    # Fall back to synthetic data if the CSV is not present
    if os.path.exists(data_path):
        print(f"Loading data from {data_path}...")
        try:
            df = pd.read_csv(data_path)
            if 'text' in df.columns and 'category' in df.columns:
                # Drop rows where either column is missing, so texts and
                # labels stay aligned row-for-row.
                df = df.dropna(subset=['text', 'category'])
                texts = df['text'].astype(str).tolist()
                labels = df['category'].astype(str).tolist()
            else:
                raise ValueError("CSV missing 'text' or 'category' columns.")
        except Exception as e:
            print(f"Error reading CSV: {e}. Falling back to synthetic data.")
            texts, labels = get_synthetic_data()
    else:
        print(f"{data_path} not found. Generating synthetic baseline data...")
        texts, labels = get_synthetic_data()

    print(f"Training on {len(texts)} samples...")

    # TF-IDF features feed a class-balanced logistic regression; wrapping it
    # in CalibratedClassifierCV yields calibrated predict_proba outputs.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))),
        ('clf', CalibratedClassifierCV(LogisticRegression(class_weight='balanced', max_iter=1000))),
    ])

    pipeline.fit(texts, labels)

    # Save the fitted pipeline
    out_dir = os.path.join("models", "ticket_classifier")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "sklearn_router.pkl")
    with open(out_path, 'wb') as f:
        pickle.dump(pipeline, f)

    print(f"Baseline model successfully saved to {out_path}")
def get_synthetic_data():
    """Returns synthetic (text, label) data for fallback training."""
    categories = ['billing', 'technical_support', 'sales', 'account_management']
    base_texts = {
        'billing': ["invoice is wrong", "charge on my card", "cancel subscription", "refund request", "pricing plan"],
        'technical_support': ["server is down", "cannot login", "getting 500 error", "bug in the app", "export failing"],
        'sales': ["want to upgrade", "talk to sales", "enterprise pricing", "demo request", "more seats"],
        'account_management': ["change password", "update email", "delete account", "add user", "role permissions"]
    }
    texts = []
    labels = []
    # Repeat each seed phrase 50 times per category to create a reasonably
    # sized synthetic dataset (4 categories x 5 phrases x 50 = 1000 samples)
    for cat in categories:
        for _ in range(50):
            for text in base_texts[cat]:
                texts.append(text)
                labels.append(cat)
    return texts, labels
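
# A minimal usage sketch (assumptions: train_baseline() above has already
# written the pickle, and the sample tickets plus the demo_predict name are
# illustrative, not part of the original project).
def demo_predict():
    """Loads the saved pipeline and prints calibrated routing predictions."""
    model_path = os.path.join("models", "ticket_classifier", "sklearn_router.pkl")
    with open(model_path, 'rb') as f:
        router = pickle.load(f)
    tickets = ["my invoice shows the wrong amount", "the app crashes on login"]
    # Because the classifier is wrapped in CalibratedClassifierCV, the
    # predict_proba scores are calibrated, so a downstream router can apply
    # a confidence threshold before auto-routing a ticket.
    for ticket, probs in zip(tickets, router.predict_proba(tickets)):
        label = router.classes_[probs.argmax()]
        print(f"{ticket!r} -> {label} (confidence {probs.max():.2f})")
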
if __name__ == "__main__":
    train_baseline()