# Task-1 / app.py
# Author: jiekarl — commit fa34829 (verified), "Update app.py"
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gradio as gr
import json
# Fetch the NLTK corpora needed for stopword filtering and lemmatization.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
# DataLoad and Preprocess
def load_data(file_path):
    """Read the ticket spreadsheet at *file_path* into a DataFrame.

    Logs the loaded shape to stdout and returns the frame unchanged.
    """
    frame = pd.read_excel(file_path)
    print(f"Loaded data shape: {frame.shape}")
    return frame
def clean_text(text):
    """Lowercase *text* and strip every character that is not a-z, 0-9 or whitespace.

    Non-string input (NaN, None, numbers) yields the empty string.
    """
    if isinstance(text, str):
        return re.sub(r'[^a-z0-9\s]', '', text.lower())
    return ""
def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stopwords, lemmatize the rest.

    Returns a list of lemmatized tokens.  The lemmatizer and stopword set
    are built once on first call and cached on the function object: this
    function is invoked per-row through ``df.apply``, and re-reading the
    stopword corpus and constructing a WordNetLemmatizer on every call is
    pure loop-invariant overhead.
    """
    if not hasattr(tokenize_lemmatize, "_lemmatizer"):
        tokenize_lemmatize._lemmatizer = WordNetLemmatizer()
        tokenize_lemmatize._stop_words = set(stopwords.words('english'))
    lemmatizer = tokenize_lemmatize._lemmatizer
    stop_words = tokenize_lemmatize._stop_words
    return [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
def preprocess_data(df):
    """Impute missing ticket fields within each product group and add text columns.

    Missing values in the three ticket columns are forward- then
    back-filled inside each 'product' group; 'clean_text' and
    'processed_text' columns are then derived from 'ticket_text'.
    Mutates and returns *df*.
    """
    cols = ['ticket_text', 'issue_type', 'urgency_level']
    filled = df.groupby('product')[cols].transform(lambda group: group.ffill().bfill())
    df[cols] = filled
    df['clean_text'] = df['ticket_text'].map(clean_text)
    df['processed_text'] = df['clean_text'].map(lambda text: ' '.join(tokenize_lemmatize(text)))
    return df
# Feature Engineering
# Checking the sentimenet analysis on basees of different word like bad, good, late etc.
def simple_sentiment(text):
    """Crude lexicon-based sentiment score for *text*.

    Counts positive and negative lexicon hits among the whitespace
    tokens and returns (pos - neg) / token_count, i.e. a value in
    [-1.0, 1.0]; empty input scores 0.0.  Sets (rather than lists)
    give O(1) membership tests, and the tiny lexicons are constants.
    """
    pos = {'good', 'great', 'excellent', 'thanks'}
    neg = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    tokens = text.split()
    score = sum(w in pos for w in tokens) - sum(w in neg for w in tokens)
    # `or 1` guards the empty-token case against ZeroDivisionError.
    return score / (len(tokens) or 1)
def feature_engineering(df):
    """Derive numeric features from the 'clean_text' column.

    Adds 'ticket_length' (character count), 'word_count' and
    'sentiment' columns in place and returns *df*.
    """
    clean = df['clean_text']
    df['ticket_length'] = clean.map(len)
    df['word_count'] = clean.map(lambda text: len(text.split()))
    df['sentiment'] = clean.map(simple_sentiment)
    return df
# Train Models
# so here we are train the randomforest model.
# and we need to train the model as per the requirement issue_type and uregency_level
# also calculating the model performance by Classification Report
def train_models(df):
    """Train two RandomForest classifiers (issue type, urgency level).

    Splits the data 80/20 with a fixed seed, fits one pipeline per
    target, prints a classification report for each on the held-out
    split, and returns ``(issue_model, urgency_model)``.

    NOTE: the original code shared a single ColumnTransformer instance
    between both pipelines; sklearn Pipelines do not clone their steps,
    so fitting the second model silently refit the transformer embedded
    in the first.  Each pipeline now gets its own preprocessor.
    """
    def _make_preprocessor():
        # Fresh TF-IDF + passthrough transformer per pipeline so the two
        # models never alias fitted state.
        text_pipe = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=500))
        ])
        return ColumnTransformer([
            ('text', text_pipe, 'processed_text'),
            ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
        ])

    X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']
    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )
    issue_model = Pipeline([
        ('pre', _make_preprocessor()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    urgency_model = Pipeline([
        ('pre', _make_preprocessor()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)
    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))
    return issue_model, urgency_model
# Predict Single Ticket
def predict_ticket(ticket_text, issue_model, urgency_model):
    """Classify one raw ticket string.

    Rebuilds the same features used at training time (processed text,
    length, word count, sentiment) for a single-row frame and returns
    ``(predicted_issue_type, predicted_urgency_level)``.
    """
    cleaned = clean_text(ticket_text)
    row = {
        'processed_text': ' '.join(tokenize_lemmatize(cleaned)),
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned),
    }
    features = pd.DataFrame([row])
    issue_pred = issue_model.predict(features)[0]
    urgency_pred = urgency_model.predict(features)[0]
    return issue_pred, urgency_pred
# Generating the Gradio Interface as per task
def create_gradio_interface(issue_model, urgency_model):
    """Build (without launching) the Gradio UI around the two trained models."""
    def wrapped(ticket_text):
        # Surface any prediction failure in the first output box rather
        # than letting the UI raise.
        try:
            return predict_ticket(ticket_text, issue_model, urgency_model)
        except Exception as e:
            return f"Error: {e}", ""

    ticket_input = gr.Textbox(label="Ticket Text", lines=4)
    issue_output = gr.Textbox(label="Predicted Issue Type")
    urgency_output = gr.Textbox(label="Predicted Urgency Level")
    sample_tickets = [
        ["payment issue with smartwatch v2, underbilled order 29224"],
        ["Router stopped working after update, need immediate help"],
        ["Received damaged headphones in shipment, request refund"],
        ["ordered smartwatch v2 got protab x1 instead order number 76301"],
        ["cant log account keep showing error help"],
        ["tell ecobreeze ac warranty also available black"]
    ]
    return gr.Interface(
        fn=wrapped,
        inputs=ticket_input,
        outputs=[issue_output, urgency_output],
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=sample_tickets
    )
if __name__ == "__main__":
df = load_data("ai_dev_assignment_tickets_complex_1000.xls")
df = preprocess_data(df)
df = feature_engineering(df)
issue_model, urgency_model = train_models(df)
iface = create_gradio_interface(issue_model, urgency_model)
#Deploy to public Gradio space (with temporary link)
iface.launch(share=True)