File size: 5,481 Bytes
4833083 76d9143 4833083 76d9143 4833083 76d9143 4833083 fa34829 76d9143 4833083 76d9143 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 76d9143 4833083 76d9143 fa34829 76d9143 4833083 76d9143 4833083 fa34829 76d9143 4833083 06cf98c 4833083 06cf98c 76d9143 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 06cf98c 76d9143 8c58d07 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 76d9143 4833083 fa34829 76d9143 06cf98c 76d9143 4833083 76d9143 fa34829 76d9143 f3590e1 a341e82 f3590e1 76d9143 fa34829 76d9143 4833083 76d9143 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gradio as gr
import json
# Download NLTK resources needed below: the English stop-word list, plus
# WordNet and its multilingual extension (omw-1.4) for lemmatization.
# nltk.download is a no-op when the corpus is already present locally.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Data loading and preprocessing
def load_data(file_path):
    """Read the support-ticket spreadsheet at *file_path* into a DataFrame.

    Prints the loaded shape as a quick sanity check and returns the frame.
    """
    frame = pd.read_excel(file_path)
    print(f"Loaded data shape: {frame.shape}")
    return frame
def clean_text(text):
    """Lowercase *text* and strip everything except a-z, 0-9 and whitespace.

    Non-string input (e.g. NaN coming out of pandas) yields "".
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.sub(r'[^a-z0-9\s]', '', lowered)
    return ""
def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stop words, lemmatize the rest.

    Args:
        text: a pre-cleaned (lowercased, punctuation-free) string.

    Returns:
        list of lemmatized, non-stop-word tokens.
    """
    # Constructing WordNetLemmatizer and rebuilding the stop-word set is
    # expensive (stopwords.words reads the corpus from disk) — the original
    # did both on every call, once per DataFrame row. Cache them once on
    # the function object instead.
    if not hasattr(tokenize_lemmatize, '_lemmatizer'):
        tokenize_lemmatize._lemmatizer = WordNetLemmatizer()
        tokenize_lemmatize._stop_words = set(stopwords.words('english'))
    lemmatize = tokenize_lemmatize._lemmatizer.lemmatize
    stop_words = tokenize_lemmatize._stop_words
    return [lemmatize(word) for word in text.split() if word not in stop_words]
def preprocess_data(df):
    """Fill missing ticket fields within each product group, then derive
    cleaned and lemmatized text columns.

    Adds 'clean_text' and 'processed_text' columns in place and returns *df*.
    """
    fill_cols = ['ticket_text', 'issue_type', 'urgency_level']
    # Forward- then backward-fill gaps using rows that share the same product.
    df[fill_cols] = df.groupby('product')[fill_cols].transform(
        lambda grp: grp.ffill().bfill()
    )
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(
        lambda txt: ' '.join(tokenize_lemmatize(txt))
    )
    return df
# Feature Engineering
# Simple lexicon-based sentiment score using indicator words such as bad, good, late, etc.
def simple_sentiment(text):
    """Crude lexicon sentiment in [-1, 1]: (#positive - #negative) / #tokens.

    Empty input scores 0.0 (the `or 1` guards against division by zero).
    """
    positive = {'good', 'great', 'excellent', 'thanks'}
    negative = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    words = text.split()
    score = 0
    for word in words:
        if word in positive:
            score += 1
        elif word in negative:
            score -= 1
    return score / (len(words) or 1)
def feature_engineering(df):
    """Append numeric features derived from the cleaned ticket text.

    Adds 'ticket_length' (chars), 'word_count', and 'sentiment' columns
    in place and returns *df*.
    """
    cleaned = df['clean_text']
    df['ticket_length'] = cleaned.map(len)
    df['word_count'] = cleaned.map(lambda txt: len(txt.split()))
    df['sentiment'] = cleaned.map(simple_sentiment)
    return df
# Train Models
# Train one RandomForest classifier per target: issue_type and urgency_level.
# Both models share a single train/test split so the held-out rows align.
# Performance for each is printed as a classification report.
def train_models(df):
    """Train two RandomForest classifiers: one for issue type, one for urgency.

    Args:
        df: preprocessed DataFrame with 'processed_text', 'ticket_length',
            'word_count', 'sentiment', 'issue_type', 'urgency_level' columns.

    Returns:
        (issue_model, urgency_model) — two fitted sklearn Pipelines.
        Also prints a classification report for each on the held-out split.
    """
    feature_cols = ['processed_text', 'ticket_length', 'word_count', 'sentiment']
    X = df[feature_cols]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']
    # Split once for both targets so the same rows are held out for each.
    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    def make_model():
        # Build a FRESH preprocessor per pipeline. sklearn fits pipeline
        # steps in place (no cloning), so the original code — which shared
        # one ColumnTransformer object between both pipelines — had the
        # urgency fit silently refit the transformer living inside the
        # already-fitted issue model.
        preprocessor = ColumnTransformer([
            ('text', Pipeline([('tfidf', TfidfVectorizer(max_features=500))]), 'processed_text'),
            ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
        ])
        return Pipeline([
            ('pre', preprocessor),
            ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

    issue_model = make_model()
    urgency_model = make_model()
    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)
    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))
    return issue_model, urgency_model
# Predict Single Ticket
def predict_ticket(ticket_text, issue_model, urgency_model):
    """Classify a single raw ticket string with both fitted models.

    Returns a (predicted_issue_type, predicted_urgency_level) pair.
    """
    cleaned = clean_text(ticket_text)
    # Mirror the feature-engineering step so the one-row frame matches
    # the columns the pipelines were trained on.
    row = {
        'processed_text': ' '.join(tokenize_lemmatize(cleaned)),
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned),
    }
    features = pd.DataFrame([row])
    issue = issue_model.predict(features)[0]
    urgency = urgency_model.predict(features)[0]
    return issue, urgency
# Generating the Gradio Interface as per task
def create_gradio_interface(issue_model, urgency_model):
    """Build (but do not launch) the Gradio UI wrapping the two models."""

    def classify(ticket_text):
        # Report any prediction failure in the first output box rather
        # than crashing the interface.
        try:
            return predict_ticket(ticket_text, issue_model, urgency_model)
        except Exception as e:
            return f"Error: {e}", ""

    example_tickets = [
        ["payment issue with smartwatch v2, underbilled order 29224"],
        ["Router stopped working after update, need immediate help"],
        ["Received damaged headphones in shipment, request refund"],
        ["ordered smartwatch v2 got protab x1 instead order number 76301"],
        ["cant log account keep showing error help"],
        ["tell ecobreeze ac warranty also available black"]
    ]
    output_boxes = [
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level")
    ]
    return gr.Interface(
        fn=classify,
        inputs=gr.Textbox(label="Ticket Text", lines=4),
        outputs=output_boxes,
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=example_tickets
    )
if __name__ == "__main__":
    # End-to-end run: load -> preprocess -> feature-engineer -> train -> serve.
    tickets = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    tickets = preprocess_data(tickets)
    tickets = feature_engineering(tickets)
    issue_model, urgency_model = train_models(tickets)
    app = create_gradio_interface(issue_model, urgency_model)
    # share=True publishes a temporary public Gradio link.
    app.launch(share=True)
|