jiekarl commited on
Commit
06cf98c
·
verified ·
1 Parent(s): d1e437e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -224
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import pandas as pd
2
- import numpy as np
3
  import re
4
  import nltk
5
  from nltk.corpus import stopwords
@@ -10,281 +9,135 @@ from sklearn.model_selection import train_test_split
10
  from sklearn.metrics import classification_report
11
  from sklearn.pipeline import Pipeline
12
  from sklearn.compose import ColumnTransformer
13
- from sklearn.preprocessing import FunctionTransformer
14
- import json
15
  import gradio as gr
16
 
17
- # Download NLTK resources
18
  nltk.download('stopwords')
19
  nltk.download('wordnet')
20
  nltk.download('omw-1.4')
21
 
22
- # --------------------
23
- # 1. Data Preparation
24
- # --------------------
25
- def load_data(file_path):
26
- df = pd.read_excel('ai_dev_assignment_tickets_complex_1000.xls')
27
- print(f"Original data shape: {df.shape}")
28
- return df
29
 
30
  def clean_text(text):
31
  if not isinstance(text, str):
32
  return ""
33
- # Normalization
34
  text = text.lower()
35
  text = re.sub(r'[^a-z0-9\s]', '', text)
36
  return text
37
 
38
- def preprocess_data(df):
39
- # Handle missing data
40
- # df['ticket_text'] = df['ticket_text'].fillna('')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  df[['ticket_text','issue_type','urgency_level']] = (
42
- df.groupby('product')[['ticket_text', 'issue_type','urgency_level']]
43
- .transform(lambda group: group.ffill().bfill())
44
  )
45
-
46
- # Text cleaning
47
  df['clean_text'] = df['ticket_text'].apply(clean_text)
48
-
49
- # Tokenization and lemmatization
50
- lemmatizer = WordNetLemmatizer()
51
- stop_words = set(stopwords.words('english'))
52
-
53
- def tokenize_lemmatize(text):
54
- tokens = text.split()
55
- return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
56
-
57
  df['processed_text'] = df['clean_text'].apply(tokenize_lemmatize).apply(' '.join)
58
- return df
59
-
60
- # --------------------
61
- # 2. Feature Engineering
62
- # --------------------
63
- def feature_engineering(df):
64
- # Text-based features
65
  df['ticket_length'] = df['clean_text'].apply(len)
66
  df['word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
67
-
68
- # Sentiment score (simplified)
69
- def simple_sentiment(text):
70
- positive = ['good', 'great', 'excellent', 'thanks']
71
- negative = ['bad', 'broken', 'late', 'error', 'issue', 'problem']
72
- tokens = text.split()
73
- pos_count = sum(1 for word in tokens if word in positive)
74
- neg_count = sum(1 for word in tokens if word in negative)
75
- return (pos_count - neg_count) / len(tokens) if tokens else 0
76
-
77
  df['sentiment'] = df['clean_text'].apply(simple_sentiment)
78
- return df
79
 
80
- # --------------------
81
- # 3. Multi-Task Learning
82
- # --------------------
83
- def train_models(df):
84
- # Feature preparation
85
  X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
86
  y_issue = df['issue_type']
87
  y_urgency = df['urgency_level']
88
-
89
- # Train-test split
90
  X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
91
  X, y_issue, y_urgency, test_size=0.2, random_state=42
92
  )
93
-
94
- # Model pipelines
95
  text_transformer = Pipeline([
96
  ('tfidf', TfidfVectorizer(max_features=500))
97
  ])
98
-
99
  preprocessor = ColumnTransformer(
100
  transformers=[
101
  ('text', text_transformer, 'processed_text'),
102
  ('num', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
103
  ])
104
-
105
- # Issue type classifier
106
  issue_pipe = Pipeline([
107
  ('preprocessor', preprocessor),
108
  ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
109
  ])
110
-
111
- # Urgency classifier
112
  urgency_pipe = Pipeline([
113
  ('preprocessor', preprocessor),
114
  ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
115
  ])
116
-
117
- # Train models
118
  issue_pipe.fit(X_train, y_issue_train)
119
  urgency_pipe.fit(X_train, y_urgency_train)
120
-
121
- # Evaluate models
122
- print("Issue Type Classification Report:")
123
- print(classification_report(y_issue_test, issue_pipe.predict(X_test)))
124
-
125
- print("\nUrgency Level Classification Report:")
126
- print(classification_report(y_urgency_test, urgency_pipe.predict(X_test)))
127
-
128
  return issue_pipe, urgency_pipe
129
 
130
- # --------------------
131
- # 4. Entity Extraction
132
- # --------------------
133
- def extract_entities(text,product_list):
134
- # Initialize empty entities
135
- entities = {
136
- "product": [],
137
- "dates": [],
138
- "complaint_keywords": []
139
- }
140
-
141
- # Product extraction (simulated product list)
142
- product_list = ['phone', 'tablet', 'laptop', 'router', 'monitor', 'printer']
143
- for product in product_list:
144
- if re.search(rf"\b{product}\b", text, re.IGNORECASE):
145
- entities["product"].append(product)
146
-
147
- # Date extraction
148
- date_patterns = [
149
- r'\d{1,2}/\d{1,2}/\d{2,4}',
150
- r'\d{1,2}-\d{1,2}-\d{2,4}',
151
- r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}\b'
152
- ]
153
- for pattern in date_patterns:
154
- entities["dates"].extend(re.findall(pattern, text))
155
-
156
- # Complaint keywords
157
- complaint_words = ['broken', 'damage', 'late', 'delay', 'error', 'fault',
158
- 'defect', 'issue', 'problem', 'not working', 'failed']
159
- for word in complaint_words:
160
- if re.search(rf"\b{word}\b", text, re.IGNORECASE):
161
- entities["complaint_keywords"].append(word)
162
-
163
- return entities
164
 
165
- # --------------------
166
- # 5. Integration
167
- # --------------------
168
- def process_ticket(ticket_text, issue_model, urgency_model):
169
- # Create input dataframe
170
- input_df = pd.DataFrame([{
171
- 'ticket_text': ticket_text,
172
- 'clean_text': clean_text(ticket_text),
173
- 'processed_text': ' '.join(tokenize_lemmatize(clean_text(ticket_text))),
174
- 'ticket_length': len(ticket_text),
175
- 'word_count': len(ticket_text.split()),
176
- 'sentiment': simple_sentiment(clean_text(ticket_text))
177
  }])
178
-
179
- # Predictions
180
- issue_pred = issue_model.predict(input_df)[0]
181
- urgency_pred = urgency_model.predict(input_df)[0]
182
-
183
- # Entity extraction
184
- entities = extract_entities(ticket_text)
185
-
186
- return {
187
- "issue_type": issue_pred,
188
- "urgency_level": urgency_pred,
189
- "entities": entities
190
- }
191
- # --------------------
192
- # Gradio Interface Creation
193
- # --------------------
194
- def create_gradio_interface(predict_fn):
195
- def wrapper(ticket_text):
196
- try:
197
- result = predict_fn(ticket_text)
198
- return (
199
- result["issue_type"],
200
- result["urgency_level"],
201
- json.dumps(result["entities"], indent=2)
202
- )
203
- except Exception as e:
204
- return f"Error: {str(e)}", "", "{}"
205
-
206
- iface = gr.Interface(
207
- fn=wrapper,
208
- inputs=gr.Textbox(label="Ticket Text", lines=5),
209
- outputs=[
210
- gr.Textbox(label="Issue Type"),
211
- gr.Textbox(label="Urgency Level"),
212
- gr.JSON(label="Extracted Entities")
213
- ],
214
- title="Customer Support Ticket Analyzer",
215
- description="Classify ticket issue type and urgency level, extract key entities",
216
- examples=[
217
- ["payment issue with smartwatch v2, underbilled order 29224"],
218
- ["Router stopped working after update, need immediate help"],
219
- ["Received damaged headphones in shipment, request refund"]
220
- ]
221
- )
222
- return iface
223
 
224
- # --------------------
225
- # Main Execution (Corrected)
226
- # --------------------
227
- if __name__ == "__main__":
228
- # Load data
229
- df = load_data("ai_dev_assignment_tickets_complex_1000.xlsx")
230
-
231
- # Generate product list from data
232
- all_products = df['product'].dropna().unique()
233
- product_list = set()
234
- for product in all_products:
235
- # Split multi-word products and clean
236
- words = re.split(r'\W+', str(product).lower())
237
- product_list.update([w for w in words if w and len(w) > 1])
238
-
239
- # Add common tech products
240
- product_list.update(['smartwatch', 'v2', 'v3', 'headphones', 'camera',
241
- 'phone', 'tablet', 'laptop', 'router', 'monitor', 'printer'])
242
-
243
- # Preprocess data
244
- df = preprocess_data(df)
245
- df = feature_engineering(df)
246
-
247
- # Train models
248
- issue_model, urgency_model = train_models(df)
249
-
250
- # Create a function that takes ticket_text and returns the result
251
- def updated_process_ticket(ticket_text):
252
- return process_ticket(ticket_text, issue_model, urgency_model, product_list)
253
-
254
- # Create Gradio interface with the function
255
- iface = create_gradio_interface(updated_process_ticket)
256
- iface.launch(server_name="0.0.0.0", server_port=7866)
257
 
258
- # --------------------
259
- # Integration Function
260
- # --------------------
261
- def process_ticket(ticket_text, issue_model, urgency_model, product_list):
262
- # Preprocess input
263
- cleaned = clean_text(ticket_text)
264
- tokenized = tokenize_lemmatize(cleaned)
265
- processed_text = ' '.join(tokenized)
266
-
267
- # Create input features
268
- input_df = pd.DataFrame([{
269
- 'ticket_text': ticket_text,
270
- 'clean_text': cleaned,
271
- 'processed_text': processed_text,
272
- 'ticket_length': len(cleaned),
273
- 'word_count': len(cleaned.split()),
274
- 'sentiment': simple_sentiment(cleaned)
275
- }])
276
-
277
- # Predictions
278
- issue_pred = issue_model.predict(input_df)[0]
279
- urgency_pred = urgency_model.predict(input_df)[0]
280
-
281
- # Entity extraction with generated product list
282
- entities = extract_entities(ticket_text, product_list)
283
-
284
  return {
285
- "issue_type": issue_pred,
286
- "urgency_level": urgency_pred,
287
- "entities": entities
288
  }
289
 
290
- #https://a60c2c3e8e37afc8af.gradio.live/
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
 
2
  import re
3
  import nltk
4
  from nltk.corpus import stopwords
 
9
  from sklearn.metrics import classification_report
10
  from sklearn.pipeline import Pipeline
11
  from sklearn.compose import ColumnTransformer
 
 
12
  import gradio as gr
13
 
 
14
  nltk.download('stopwords')
15
  nltk.download('wordnet')
16
  nltk.download('omw-1.4')
17
 
18
# Shared NLP resources, built once at import time and reused by
# tokenize_lemmatize() for every ticket.
# NOTE(review): WordNetLemmatizer must be imported from nltk.stem at the top of
# the file — the import is not visible in this diff view; confirm it is present.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
 
 
 
 
 
20
 
21
def clean_text(text):
    """Normalize raw ticket text for downstream tokenization.

    Lowercases the input and strips every character that is not a lowercase
    letter, digit, or whitespace. Non-string input (e.g. NaN coming out of a
    pandas column) is mapped to the empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)
27
 
28
def tokenize_lemmatize(text):
    """Whitespace-tokenize *text*, drop English stop words, lemmatize the rest.

    Relies on the module-level `lemmatizer` and `stop_words` resources.
    Returns a list of lemmatized tokens.
    """
    return [
        lemmatizer.lemmatize(tok)
        for tok in text.split()
        if tok not in stop_words
    ]
31
+
32
def simple_sentiment(text):
    """Crude lexicon-based sentiment score in [-1, 1].

    Counts positive and negative keyword hits and returns
    (positives - negatives) / token_count; empty text scores 0.
    """
    positive = {'good', 'great', 'excellent', 'thanks'}
    negative = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    tokens = text.split()
    if not tokens:
        return 0
    score = 0
    for word in tokens:
        if word in positive:
            score += 1
        elif word in negative:
            score -= 1
    return score / len(tokens)
39
+
40
def extract_entities(text):
    """Extract products, dates, and complaint keywords from raw ticket text.

    Returns a dict with keys "product", "dates" and "complaint_keywords",
    each mapping to a (possibly empty) list of matched strings.
    """
    entities = {"product": [], "dates": [], "complaint_keywords": []}

    # Known product vocabulary, matched case-insensitively on word boundaries.
    # BUG FIX: the previous patterns used rf"\\b..." — in a raw string that is
    # a literal backslash followed by 'b', so the boundary never matched and no
    # product or complaint keyword was ever found. \b is the correct escape.
    # re.escape guards against regex metacharacters in future vocabulary terms.
    product_list = ['phone', 'tablet', 'laptop', 'router', 'monitor', 'printer']
    for product in product_list:
        if re.search(rf"\b{re.escape(product)}\b", text, re.IGNORECASE):
            entities["product"].append(product)

    # Numeric dates (slash / dash) and "mon DD, YYYY"-style dates.
    # NOTE(review): the month-name pattern matches lowercase only (findall is
    # case-sensitive here) — confirm whether "May 3, 2024" should also match.
    date_patterns = [
        r'\d{1,2}/\d{1,2}/\d{2,4}',
        r'\d{1,2}-\d{1,2}-\d{2,4}',
        r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}\b'
    ]
    for pattern in date_patterns:
        entities["dates"].extend(re.findall(pattern, text))

    # Complaint vocabulary; multi-word phrases ("not working") are bounded as a
    # whole phrase, which \b handles correctly.
    complaint_words = ['broken', 'damage', 'late', 'delay', 'error', 'fault',
                       'defect', 'issue', 'problem', 'not working', 'failed']
    for word in complaint_words:
        if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE):
            entities["complaint_keywords"].append(word)
    return entities
61
+
62
def load_and_train():
    """Load the ticket dataset, build features, and train the two classifiers.

    Returns a tuple ``(issue_pipe, urgency_pipe)`` of fitted sklearn Pipelines
    that predict ``issue_type`` and ``urgency_level`` respectively from the
    feature frame built below (processed text + three numeric features).
    """
    # NOTE(review): filename ends in .xls here while an earlier revision used
    # .xlsx — confirm which file actually ships with the Space.
    df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")
    # Impute missing text/label values within each product group by
    # forward-filling then back-filling along the group.
    df[['ticket_text','issue_type','urgency_level']] = (
        df.groupby('product')[['ticket_text', 'issue_type','urgency_level']]
        .transform(lambda group: group.ffill().bfill())
    )
    # Text preprocessing: normalize, then tokenize/lemmatize and re-join into a
    # single space-separated string for the TF-IDF vectorizer.
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(tokenize_lemmatize).apply(' '.join)
    # Simple numeric features derived from the cleaned text.
    df['ticket_length'] = df['clean_text'].apply(len)
    df['word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
    df['sentiment'] = df['clean_text'].apply(simple_sentiment)

    X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']

    # Single split shared by both tasks so train/test rows line up.
    # NOTE(review): X_test and the y_*_test splits are currently unused — the
    # evaluation/report step was removed in this commit.
    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    # TF-IDF over the processed text, capped at 500 features.
    text_transformer = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=500))
    ])

    # Combine the vectorized text with the three numeric columns unchanged.
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_transformer, 'processed_text'),
            ('num', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
        ])

    # One random-forest pipeline per prediction task; fixed seeds keep the
    # models reproducible across restarts.
    issue_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    urgency_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    issue_pipe.fit(X_train, y_issue_train)
    urgency_pipe.fit(X_train, y_urgency_train)

    return issue_pipe, urgency_pipe
106
 
107
# Train the models at startup.
# NOTE(review): this runs at import time, so app startup blocks until both
# random forests are fitted — acceptable for a demo, but worth caching the
# fitted pipelines to disk if startup time becomes a problem.
issue_model, urgency_model = load_and_train()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
def gradio_interface(ticket_text):
    """Gradio handler: classify one ticket and extract its entities.

    Returns three values, in the order the interface's output components are
    declared: predicted issue type, predicted urgency level, and the
    extracted-entities dict (rendered by the JSON component).
    """
    clean = clean_text(ticket_text)
    tokens = tokenize_lemmatize(clean)
    processed = ' '.join(tokens)
    # The column set must mirror the features used at training time in
    # load_and_train(), otherwise the pipelines' ColumnTransformer will fail.
    features = pd.DataFrame([{
        'processed_text': processed,
        'ticket_length': len(clean),
        'word_count': len(clean.split()),
        'sentiment': simple_sentiment(clean)
    }])

    issue_pred = issue_model.predict(features)[0]
    urgency_pred = urgency_model.predict(features)[0]
    entities = extract_entities(ticket_text)

    # BUG FIX: the interface declares three separate output components, so the
    # handler must return a 3-tuple. The previous code returned a plain dict,
    # which Gradio tries to interpret as a component→value mapping and rejects
    # at runtime because the keys are strings, not component objects.
    return issue_pred, urgency_pred, entities
130
 
131
# Wire the prediction handler into a three-output Gradio UI: two text fields
# for the class predictions and a JSON viewer for the extracted entities.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=5, placeholder="Enter ticket text here..."),
    outputs=[
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level"),
        gr.JSON(label="Extracted Entities")
    ],
    title="Support Ticket Classifier",
    description="Classifies support ticket issue type and urgency, and extracts key entities."
)

# Launch with defaults; on a HF Space the host/port are managed by the platform.
iface.launch()