import re

import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Download the NLTK data needed for stopword removal and lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


def load_data(file_path):
    """Load the ticket dataset from an Excel file and report its shape."""
    df = pd.read_excel(file_path)
    print(f"Loaded data shape: {df.shape}")
    return df


def clean_text(text):
    """Lowercase text and strip everything except letters, digits, and whitespace."""
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^a-z0-9\s]', '', text.lower())


# Build these once at import time; reconstructing the lemmatizer and stopword
# set on every call (once per row under DataFrame.apply) is needlessly slow.
LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))


def tokenize_lemmatize(text):
    """Split on whitespace, drop English stopwords, and lemmatize each token."""
    return [LEMMATIZER.lemmatize(word) for word in text.split() if word not in STOP_WORDS]
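
# Example behavior (assuming the NLTK data downloaded above is available):
#   tokenize_lemmatize("the cables were broken")  ->  ['cable', 'broken']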


def preprocess_data(df):
    """Fill missing fields within each product group, then clean and lemmatize text."""
    # Forward- then back-fill missing values within each product group, so a
    # ticket inherits values only from other tickets for the same product.
    df[['ticket_text', 'issue_type', 'urgency_level']] = (
        df.groupby('product')[['ticket_text', 'issue_type', 'urgency_level']]
        .transform(lambda group: group.ffill().bfill())
    )
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(lambda x: ' '.join(tokenize_lemmatize(x)))
    return df


def simple_sentiment(text):
    """Tiny lexicon-based sentiment score: (positive hits - negative hits) / tokens."""
    pos = {'good', 'great', 'excellent', 'thanks'}
    neg = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    tokens = text.split()
    # `len(tokens) or 1` avoids division by zero on empty input.
    return (sum(w in pos for w in tokens) - sum(w in neg for w in tokens)) / (len(tokens) or 1)


def feature_engineering(df):
    """Add simple numeric features: character length, word count, and sentiment."""
    df['ticket_length'] = df['clean_text'].apply(len)
    df['word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
    df['sentiment'] = df['clean_text'].apply(simple_sentiment)
    return df


def train_models(df):
    """Train one random forest for issue type and one for urgency level."""
    X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']

    # One shared split keeps both targets aligned with the same feature rows.
    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    # TF-IDF over the lemmatized text; the numeric features pass through unchanged.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(max_features=500), 'processed_text'),
        ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
    ])

    issue_model = Pipeline([
        ('pre', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    # clone() gives the urgency model its own unfitted copy; reusing the same
    # ColumnTransformer instance in two pipelines would let the second fit
    # silently refit (and mutate) the first model's preprocessor.
    urgency_model = clone(issue_model)

    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)

    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))

    return issue_model, urgency_model
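
# For a more robust estimate than a single 80/20 split, the same pipelines
# could be cross-validated (a sketch, not part of this script's flow; X and
# y_issue refer to the frames built inside train_models):
#
#   from sklearn.model_selection import cross_val_score
#   scores = cross_val_score(issue_model, X, y_issue, cv=5)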


def predict_ticket(ticket_text, issue_model, urgency_model):
    """Run one raw ticket through the training-time preprocessing and predict."""
    cleaned = clean_text(ticket_text)
    processed = ' '.join(tokenize_lemmatize(cleaned))
    # A single-row DataFrame with the same columns the pipelines were fit on.
    features = pd.DataFrame([{
        'processed_text': processed,
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned)
    }])
    return issue_model.predict(features)[0], urgency_model.predict(features)[0]
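
# Quick smoke test (hypothetical sample text; assumes the models returned by
# train_models are in scope):
#   issue, urgency = predict_ticket("order arrived broken, please help",
#                                   issue_model, urgency_model)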


def create_gradio_interface(issue_model, urgency_model):
    """Build a Gradio UI that predicts a ticket's issue type and urgency level."""
    def wrapped(ticket_text):
        # Surface prediction errors in the UI instead of crashing the app.
        try:
            issue, urgency = predict_ticket(ticket_text, issue_model, urgency_model)
            return issue, urgency
        except Exception as e:
            return f"Error: {e}", ""

    return gr.Interface(
        fn=wrapped,
        inputs=gr.Textbox(label="Ticket Text", lines=4),
        outputs=[
            gr.Textbox(label="Predicted Issue Type"),
            gr.Textbox(label="Predicted Urgency Level")
        ],
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=[
            ["payment issue with smartwatch v2, underbilled order 29224"],
            ["Router stopped working after update, need immediate help"],
            ["Received damaged headphones in shipment, request refund"],
            ["ordered smartwatch v2 got protab x1 instead order number 76301"],
            ["cant log account keep showing error help"],
            ["tell ecobreeze ac warranty also available black"]
        ]
    )


if __name__ == "__main__":
    # End-to-end flow: load, fill/clean, featurize, train, then serve a demo UI.
    df = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    df = preprocess_data(df)
    df = feature_engineering(df)
    issue_model, urgency_model = train_models(df)
    iface = create_gradio_interface(issue_model, urgency_model)
    # share=True also exposes a temporary public Gradio link, not just localhost.
    iface.launch(share=True)
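
# To reuse the trained pipelines without retraining, they could be persisted
# with joblib (a sketch; joblib ships with scikit-learn but is not used above):
#
#   import joblib
#   joblib.dump(issue_model, "issue_model.joblib")
#   issue_model = joblib.load("issue_model.joblib")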