File size: 5,481 Bytes
4833083
76d9143
4833083
 
 
 
 
 
 
 
 
 
 
76d9143
4833083
76d9143
4833083
 
 
 
fa34829
76d9143
 
 
 
4833083
 
 
 
76d9143
4833083
06cf98c
76d9143
 
 
06cf98c
76d9143
4833083
76d9143
 
4833083
 
76d9143
 
 
fa34829
 
76d9143
 
 
 
 
 
 
4833083
 
 
76d9143
4833083
fa34829
 
 
 
76d9143
4833083
 
 
06cf98c
4833083
 
 
06cf98c
76d9143
4833083
 
06cf98c
76d9143
 
 
 
06cf98c
76d9143
 
4833083
 
06cf98c
76d9143
 
8c58d07
4833083
06cf98c
76d9143
 
06cf98c
76d9143
 
4833083
76d9143
4833083
fa34829
76d9143
 
 
06cf98c
 
76d9143
 
 
4833083
76d9143
 
fa34829
76d9143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3590e1
 
 
 
 
a341e82
 
 
f3590e1
76d9143
 
fa34829
 
76d9143
 
 
 
 
 
4833083
76d9143
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gradio as gr
import json

# Download NLTK resources required by stopword filtering and lemmatisation.
# quiet=True suppresses the per-run progress output when the corpora are
# already cached locally (nltk.download is a no-op for cached resources).
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

# Data loading and preprocessing
def load_data(file_path):
    """Read the ticket dataset from an Excel workbook.

    Prints the resulting shape as a quick sanity check and returns the frame.
    """
    frame = pd.read_excel(file_path)
    print(f"Loaded data shape: {frame.shape}")
    return frame

def clean_text(text):
    """Lowercase *text* and strip everything except letters, digits and whitespace.

    Non-string input (e.g. NaN coming out of pandas) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

# Hoisted out of tokenize_lemmatize: the original rebuilt the lemmatizer and
# the stop-word set on EVERY call, which is wasted work when applied row-wise
# over a whole DataFrame.  WordNetLemmatizer() is lazy, so constructing it
# here does not require the wordnet corpus to be present at import time.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = None  # built lazily so the stopwords corpus need not exist at import


def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stop words, and lemmatize.

    Returns the surviving tokens as a list of lemmas.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Lazy one-time initialisation; frozenset gives O(1) membership tests.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return [_LEMMATIZER.lemmatize(word) for word in text.split() if word not in _STOP_WORDS]

def preprocess_data(df):
    """Fill missing ticket fields within each product group, then normalise text.

    Adds 'clean_text' (lowercased, punctuation-stripped) and 'processed_text'
    (stop-word-filtered, lemmatized) columns; returns the mutated frame.
    """
    fill_cols = ['ticket_text', 'issue_type', 'urgency_level']
    # Forward- then backward-fill gaps, but only using rows of the same product.
    grouped = df.groupby('product')[fill_cols]
    df[fill_cols] = grouped.transform(lambda g: g.ffill().bfill())

    df['clean_text'] = df['ticket_text'].map(clean_text)
    df['processed_text'] = df['clean_text'].map(lambda t: ' '.join(tokenize_lemmatize(t)))
    return df

# Feature Engineering
# Crude keyword-based sentiment score built from indicator words like bad, good, late, etc.
# Module-level frozensets: the original rebuilt two lists per call and did
# O(len(list)) membership tests per token; sets give O(1) lookups.
_POSITIVE_WORDS = frozenset({'good', 'great', 'excellent', 'thanks'})
_NEGATIVE_WORDS = frozenset({'bad', 'broken', 'late', 'error', 'issue', 'problem'})


def simple_sentiment(text):
    """Return a crude sentiment score in [-1, 1] for whitespace-split *text*.

    Score = (positive-word count - negative-word count) / token count;
    an empty string scores 0.0 (the `or 1` guards the zero-token case).
    """
    tokens = text.split()
    score = sum(w in _POSITIVE_WORDS for w in tokens) - sum(w in _NEGATIVE_WORDS for w in tokens)
    return score / (len(tokens) or 1)

def feature_engineering(df):
    """Derive numeric features from 'clean_text'.

    Adds 'ticket_length' (character count), 'word_count' (token count) and
    'sentiment' (simple_sentiment score); returns the mutated frame.
    """
    text_col = df['clean_text']
    df['ticket_length'] = text_col.map(len)
    df['word_count'] = text_col.map(lambda t: len(t.split()))
    df['sentiment'] = text_col.map(simple_sentiment)
    return df

# Train Models
# Trains one RandomForest model per target: issue_type and urgency_level.
# Model quality is reported via a classification report on a held-out split.
def train_models(df):
    """Train two RandomForest classifiers (issue type and urgency level).

    Splits the data 80/20, fits an independent TF-IDF + numeric-feature
    pipeline per target, prints a classification report for each on the
    held-out split, and returns (issue_model, urgency_model).
    """
    feature_cols = ['processed_text', 'ticket_length', 'word_count', 'sentiment']
    X = df[feature_cols]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']

    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    def _build_model():
        # Each model gets its OWN preprocessor.  The original shared a single
        # ColumnTransformer object between both pipelines; sklearn fits step
        # objects in place, so fitting the second model silently re-fitted the
        # first model's transformer.  (Both fit on the same X_train here, so
        # the outputs happened to match, but the shared state is a latent bug.)
        text_pipe = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=500))
        ])
        preprocessor = ColumnTransformer([
            ('text', text_pipe, 'processed_text'),
            ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
        ])
        return Pipeline([
            ('pre', preprocessor),
            ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

    issue_model = _build_model()
    urgency_model = _build_model()

    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)

    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))

    return issue_model, urgency_model

# Predict Single Ticket
def predict_ticket(ticket_text, issue_model, urgency_model):
    """Classify one raw ticket string.

    Re-applies the training-time text pipeline (clean -> lemmatize) and the
    derived numeric features, then returns (issue_type, urgency_level).
    """
    cleaned = clean_text(ticket_text)
    lemmas = tokenize_lemmatize(cleaned)
    row = {
        'processed_text': ' '.join(lemmas),
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned),
    }
    frame = pd.DataFrame([row])
    issue = issue_model.predict(frame)[0]
    urgency = urgency_model.predict(frame)[0]
    return issue, urgency

# Gradio interface for interactive classification, as required by the task
def create_gradio_interface(issue_model, urgency_model):
    """Build a Gradio UI wrapping the two trained classifiers."""
    example_tickets = [
        ["payment issue with smartwatch v2, underbilled order 29224"],
        ["Router stopped working after update, need immediate help"],
        ["Received damaged headphones in shipment, request refund"],
        ["ordered smartwatch v2 got protab x1 instead order number 76301"],
        ["cant log account keep showing error help"],
        ["tell ecobreeze ac warranty also available black"]
    ]

    def classify(ticket_text):
        # Surface prediction failures in the UI instead of crashing the app.
        try:
            return predict_ticket(ticket_text, issue_model, urgency_model)
        except Exception as e:
            return f"Error: {e}", ""

    output_boxes = [
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level")
    ]
    return gr.Interface(
        fn=classify,
        inputs=gr.Textbox(label="Ticket Text", lines=4),
        outputs=output_boxes,
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=example_tickets
    )



if __name__ == "__main__":
    # End-to-end pipeline: load -> group-wise fill + text cleanup ->
    # derived numeric features -> train both classifiers.
    df = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    df = preprocess_data(df)
    df = feature_engineering(df)
    issue_model, urgency_model = train_models(df)
    iface = create_gradio_interface(issue_model, urgency_model)

    # Deploy via Gradio with a temporary public share link.
    iface.launch(share=True)