jiekarl commited on
Commit
76d9143
·
verified ·
1 Parent(s): 06cf98c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -87
app.py CHANGED
@@ -1,4 +1,7 @@
 
 
1
  import pandas as pd
 
2
  import re
3
  import nltk
4
  from nltk.corpus import stopwords
@@ -10,67 +13,53 @@ from sklearn.metrics import classification_report
10
  from sklearn.pipeline import Pipeline
11
  from sklearn.compose import ColumnTransformer
12
  import gradio as gr
 
13
 
 
14
  nltk.download('stopwords')
15
  nltk.download('wordnet')
16
  nltk.download('omw-1.4')
17
 
18
- lemmatizer = WordNetLemmatizer()
19
- stop_words = set(stopwords.words('english'))
 
 
 
20
 
21
  def clean_text(text):
22
  if not isinstance(text, str):
23
  return ""
24
- text = text.lower()
25
- text = re.sub(r'[^a-z0-9\s]', '', text)
26
- return text
27
 
28
  def tokenize_lemmatize(text):
29
- tokens = text.split()
30
- return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
 
31
 
32
- def simple_sentiment(text):
33
- positive = ['good', 'great', 'excellent', 'thanks']
34
- negative = ['bad', 'broken', 'late', 'error', 'issue', 'problem']
35
- tokens = text.split()
36
- pos_count = sum(1 for word in tokens if word in positive)
37
- neg_count = sum(1 for word in tokens if word in negative)
38
- return (pos_count - neg_count) / len(tokens) if tokens else 0
39
-
40
def extract_entities(text):
    """Extract product mentions, dates and complaint keywords from raw ticket text.

    Returns a dict with keys 'product', 'dates' and 'complaint_keywords',
    each a (possibly empty) list. Product/complaint matching is
    case-insensitive and anchored on word boundaries.
    """
    entities = {"product": [], "dates": [], "complaint_keywords": []}

    product_list = ['phone', 'tablet', 'laptop', 'router', 'monitor', 'printer']
    for product in product_list:
        # BUG FIX: the original used rf"\\b{product}\\b", which in a raw
        # f-string is a literal backslash + 'b', so the regex required a
        # literal "\b" in the text and never matched. rf"\b{product}\b"
        # is the intended word-boundary pattern.
        if re.search(rf"\b{product}\b", text, re.IGNORECASE):
            entities["product"].append(product)

    date_patterns = [
        r'\d{1,2}/\d{1,2}/\d{2,4}',
        r'\d{1,2}-\d{1,2}-\d{2,4}',
        r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}\b'
    ]
    for pattern in date_patterns:
        entities["dates"].extend(re.findall(pattern, text))

    complaint_words = ['broken', 'damage', 'late', 'delay', 'error', 'fault',
                       'defect', 'issue', 'problem', 'not working', 'failed']
    for word in complaint_words:
        # Same word-boundary fix as above.
        if re.search(rf"\b{word}\b", text, re.IGNORECASE):
            entities["complaint_keywords"].append(word)
    return entities
61
-
62
- def load_and_train():
63
- df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")
64
  df[['ticket_text','issue_type','urgency_level']] = (
65
- df.groupby('product')[['ticket_text', 'issue_type','urgency_level']]
66
- .transform(lambda group: group.ffill().bfill())
67
  )
68
  df['clean_text'] = df['ticket_text'].apply(clean_text)
69
- df['processed_text'] = df['clean_text'].apply(tokenize_lemmatize).apply(' '.join)
 
 
 
 
 
 
 
 
 
 
70
  df['ticket_length'] = df['clean_text'].apply(len)
71
  df['word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
72
  df['sentiment'] = df['clean_text'].apply(simple_sentiment)
 
73
 
 
 
74
  X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
75
  y_issue = df['issue_type']
76
  y_urgency = df['urgency_level']
@@ -79,65 +68,72 @@ def load_and_train():
79
  X, y_issue, y_urgency, test_size=0.2, random_state=42
80
  )
81
 
82
- text_transformer = Pipeline([
83
  ('tfidf', TfidfVectorizer(max_features=500))
84
  ])
85
 
86
- preprocessor = ColumnTransformer(
87
- transformers=[
88
- ('text', text_transformer, 'processed_text'),
89
- ('num', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
90
- ])
91
 
92
- issue_pipe = Pipeline([
93
- ('preprocessor', preprocessor),
94
  ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
95
  ])
96
 
97
- urgency_pipe = Pipeline([
98
- ('preprocessor', preprocessor),
99
  ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
100
  ])
101
 
102
- issue_pipe.fit(X_train, y_issue_train)
103
- urgency_pipe.fit(X_train, y_urgency_train)
104
 
105
- return issue_pipe, urgency_pipe
 
106
 
107
- # Train the models at startup
108
- issue_model, urgency_model = load_and_train()
109
 
110
- def gradio_interface(ticket_text):
111
- clean = clean_text(ticket_text)
112
- tokens = tokenize_lemmatize(clean)
113
- processed = ' '.join(tokens)
114
  features = pd.DataFrame([{
115
  'processed_text': processed,
116
- 'ticket_length': len(clean),
117
- 'word_count': len(clean.split()),
118
- 'sentiment': simple_sentiment(clean)
119
  }])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- issue_pred = issue_model.predict(features)[0]
122
- urgency_pred = urgency_model.predict(features)[0]
123
- entities = extract_entities(ticket_text)
124
-
125
- return {
126
- "Predicted Issue Type": issue_pred,
127
- "Predicted Urgency Level": urgency_pred,
128
- "Extracted Entities": entities
129
- }
130
-
131
- iface = gr.Interface(
132
- fn=gradio_interface,
133
- inputs=gr.Textbox(lines=5, placeholder="Enter ticket text here..."),
134
- outputs=[
135
- gr.Textbox(label="Predicted Issue Type"),
136
- gr.Textbox(label="Predicted Urgency Level"),
137
- gr.JSON(label="Extracted Entities")
138
- ],
139
- title="Support Ticket Classifier",
140
- description="Classifies support ticket issue type and urgency, and extracts key entities."
141
- )
142
-
143
- iface.launch()
 
1
+ # ticket_classifier.py
2
+
3
  import pandas as pd
4
+ import numpy as np
5
  import re
6
  import nltk
7
  from nltk.corpus import stopwords
 
13
  from sklearn.pipeline import Pipeline
14
  from sklearn.compose import ColumnTransformer
15
  import gradio as gr
16
+ import json
17
 
18
+ # Download NLTK resources
19
  nltk.download('stopwords')
20
  nltk.download('wordnet')
21
  nltk.download('omw-1.4')
22
 
23
+ # -------------------- 1. Load and Preprocess --------------------
24
def load_data(file_path):
    """Read the ticket spreadsheet at *file_path* into a DataFrame.

    Prints the resulting shape as a quick sanity check before returning.
    """
    frame = pd.read_excel(file_path)
    print(f"Loaded data shape: {frame.shape}")
    return frame
28
 
29
def clean_text(text):
    """Lowercase *text* and strip everything that is not a-z, 0-9 or whitespace.

    Non-string input (None, NaN, numbers) yields "" so the function is safe
    to map over a raw DataFrame column.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)
 
 
33
 
34
# Lazily-initialised NLP resources. The original rebuilt the lemmatizer and
# re-read the full English stop-word list on EVERY call, which is needlessly
# expensive when mapped over a whole DataFrame column; build each exactly once.
_LEMMATIZER = None
_STOP_WORDS = None

def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stop words, lemmatize the rest.

    Expects text that is already lowercased/cleaned (see clean_text).
    Returns a list of lemmatized tokens.
    """
    global _LEMMATIZER, _STOP_WORDS
    if _LEMMATIZER is None:
        # Deferred so the NLTK downloads/imports are only touched on first use.
        _LEMMATIZER = WordNetLemmatizer()
        _STOP_WORDS = set(stopwords.words('english'))
    return [_LEMMATIZER.lemmatize(word)
            for word in text.split()
            if word not in _STOP_WORDS]
38
 
39
def preprocess_data(df):
    """Fill missing ticket fields within each product group, then clean the text.

    Mutates *df* in place and also returns it:
      - gaps in 'ticket_text', 'issue_type', 'urgency_level' are forward-
        then back-filled within each 'product' group;
      - 'clean_text' holds the normalised ticket text;
      - 'processed_text' holds the stop-word-free, lemmatized version.
    """
    fill_cols = ['ticket_text', 'issue_type', 'urgency_level']
    df[fill_cols] = (
        df.groupby('product')[fill_cols]
          .transform(lambda group: group.ffill().bfill())
    )
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(
        lambda cleaned: ' '.join(tokenize_lemmatize(cleaned))
    )
    return df
47
+
48
+ # -------------------- 2. Feature Engineering --------------------
49
def simple_sentiment(text):
    """Crude lexicon-based sentiment score in [-1, 1].

    Counts hits against tiny positive/negative word lists and normalises by
    token count; empty input scores 0.
    """
    positive_words = {'good', 'great', 'excellent', 'thanks'}
    negative_words = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    tokens = text.split()
    if not tokens:
        return 0.0
    score = 0
    for token in tokens:
        if token in positive_words:
            score += 1
        elif token in negative_words:
            score -= 1
    return score / len(tokens)
54
+
55
def feature_engineering(df):
    """Derive simple numeric features from the cleaned ticket text.

    Adds three columns to *df* (mutated in place, also returned):
    'ticket_length' (character count), 'word_count' (whitespace tokens)
    and 'sentiment' (lexicon score from simple_sentiment).
    """
    df['ticket_length'] = df['clean_text'].map(len)
    df['word_count'] = df['clean_text'].map(lambda t: len(t.split()))
    df['sentiment'] = df['clean_text'].map(simple_sentiment)
    return df
60
 
61
+ # -------------------- 3. Train Models --------------------
62
def _build_preprocessor():
    """Fresh feature-extraction stage: TF-IDF over the processed text plus
    the numeric columns passed through unchanged."""
    text_pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=500))
    ])
    return ColumnTransformer([
        ('text', text_pipe, 'processed_text'),
        ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
    ])

def train_models(df):
    """Train two RandomForest classifiers: issue type and urgency level.

    Splits 80/20 (fixed random_state for reproducibility), fits both models
    on the training portion, prints a classification report for each on the
    held-out portion, and returns (issue_model, urgency_model).
    """
    X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']

    (X_train, X_test,
     y_issue_train, y_issue_test,
     y_urgency_train, y_urgency_test) = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    # BUG FIX: the original built ONE ColumnTransformer and placed the same
    # instance in both pipelines, so the second .fit() re-fitted (clobbered)
    # the transformer state the first pipeline relied on. It only worked
    # because both fit on identical X_train. Each pipeline now owns its own
    # independent preprocessor.
    issue_model = Pipeline([
        ('pre', _build_preprocessor()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    urgency_model = Pipeline([
        ('pre', _build_preprocessor()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)

    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))

    return issue_model, urgency_model
 
97
 
98
+ # -------------------- 4. Predict Single Ticket --------------------
99
def predict_ticket(ticket_text, issue_model, urgency_model):
    """Run both fitted pipelines on one raw ticket string.

    Re-applies the same cleaning / lemmatisation / numeric-feature steps
    used at training time, then returns
    (issue_prediction, urgency_prediction).
    """
    cleaned = clean_text(ticket_text)
    processed = ' '.join(tokenize_lemmatize(cleaned))
    row = {
        'processed_text': processed,
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned),
    }
    features = pd.DataFrame([row])
    issue_pred = issue_model.predict(features)[0]
    urgency_pred = urgency_model.predict(features)[0]
    return issue_pred, urgency_pred
109
+
110
+ # -------------------- 5. Gradio Interface --------------------
111
def create_gradio_interface(issue_model, urgency_model):
    """Build (but do not launch) the Gradio UI around the two fitted models."""

    def wrapped(ticket_text):
        # Surface prediction errors in the UI instead of crashing the app.
        try:
            return predict_ticket(ticket_text, issue_model, urgency_model)
        except Exception as e:
            return f"Error: {e}", ""

    ticket_input = gr.Textbox(label="Ticket Text", lines=4)
    prediction_outputs = [
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level"),
    ]
    return gr.Interface(
        fn=wrapped,
        inputs=ticket_input,
        outputs=prediction_outputs,
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
    )
129
+
130
+ # -------------------- 6. Main --------------------
131
if __name__ == "__main__":
    # Full offline pipeline: load -> preprocess -> engineer features -> train.
    tickets = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    tickets = preprocess_data(tickets)
    tickets = feature_engineering(tickets)
    issue_model, urgency_model = train_models(tickets)
    app = create_gradio_interface(issue_model, urgency_model)

    # Deploy to public Gradio space (with temporary link).
    app.launch(share=True)