import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gradio as gr
import json

# Download NLTK resources (stopword list + WordNet data for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Hoisted out of tokenize_lemmatize: the lemmatizer and the stopword set were
# previously rebuilt on every call, i.e. once per ticket row under .apply().
# Building them once at import time (safely after the downloads above) avoids
# that per-row cost without changing the output.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = set(stopwords.words('english'))


# --- Data loading and preprocessing ---

def load_data(file_path):
    """Load the ticket dataset from an Excel file and report its shape."""
    df = pd.read_excel(file_path)
    print(f"Loaded data shape: {df.shape}")
    return df


def clean_text(text):
    """Lowercase *text* and strip everything except a-z, 0-9 and whitespace.

    Non-string input (e.g. NaN coming from pandas) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^a-z0-9\s]', '', text.lower())


def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stopwords, lemmatize the rest."""
    return [_LEMMATIZER.lemmatize(word)
            for word in text.split()
            if word not in _STOP_WORDS]


def preprocess_data(df):
    """Fill missing ticket fields within each product group, then clean text.

    Adds two columns:
      - 'clean_text': lowercased, punctuation-stripped ticket text
      - 'processed_text': stopword-filtered, lemmatized, space-joined tokens
    """
    # Forward- then backward-fill gaps within each product group so rows of
    # the same product inherit that product's known values.
    df[['ticket_text', 'issue_type', 'urgency_level']] = (
        df.groupby('product')[['ticket_text', 'issue_type', 'urgency_level']]
        .transform(lambda group: group.ffill().bfill())
    )
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(
        lambda x: ' '.join(tokenize_lemmatize(x)))
    return df


# --- Feature engineering ---

def simple_sentiment(text):
    """Crude lexicon sentiment: (#positive - #negative words) / word count.

    Checks the text against small positive/negative word lists (good, bad,
    late, error, ...). The `or 1` guards the division for empty text, which
    scores 0.0.
    """
    pos = ['good', 'great', 'excellent', 'thanks']
    neg = ['bad', 'broken', 'late', 'error', 'issue', 'problem']
    tokens = text.split()
    return (sum(w in pos for w in tokens)
            - sum(w in neg for w in tokens)) / (len(tokens) or 1)


def feature_engineering(df):
    """Add numeric features: character length, word count, sentiment score."""
    df['ticket_length'] = df['clean_text'].apply(len)
    df['word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
    df['sentiment'] = df['clean_text'].apply(simple_sentiment)
    return df
# --- Model training ---

def train_models(df):
    """Train two RandomForest classifiers: issue_type and urgency_level.

    Uses a single train/test split for both targets so the two models are
    evaluated on exactly the same held-out rows, and prints a classification
    report for each.

    Returns:
        (issue_model, urgency_model): two fitted sklearn Pipelines sharing
        the same TF-IDF + passthrough-numeric preprocessing.
    """
    X = df[['processed_text', 'ticket_length', 'word_count', 'sentiment']]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']

    # One split, both targets: keeps the evaluation rows aligned.
    (X_train, X_test,
     y_issue_train, y_issue_test,
     y_urgency_train, y_urgency_test) = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    text_pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=500))
    ])
    # TF-IDF features from the processed text, numeric columns passed through.
    preprocessor = ColumnTransformer([
        ('text', text_pipe, 'processed_text'),
        ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
    ])

    issue_model = Pipeline([
        ('pre', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    urgency_model = Pipeline([
        ('pre', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)

    print("Issue Classification:\n",
          classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n",
          classification_report(y_urgency_test, urgency_model.predict(X_test)))
    return issue_model, urgency_model


# --- Single-ticket prediction ---

def predict_ticket(ticket_text, issue_model, urgency_model):
    """Classify one raw ticket string.

    Reproduces the training-time preprocessing (clean, lemmatize, numeric
    features) on the single input and returns (issue_type, urgency_level).
    """
    cleaned = clean_text(ticket_text)
    processed = ' '.join(tokenize_lemmatize(cleaned))
    # One-row frame with exactly the feature columns used at training time.
    features = pd.DataFrame([{
        'processed_text': processed,
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned)
    }])
    return issue_model.predict(features)[0], urgency_model.predict(features)[0]


# --- Gradio interface ---

def create_gradio_interface(issue_model, urgency_model):
    """Build a Gradio UI that classifies a ticket with the two fitted models."""
    def wrapped(ticket_text):
        # Surface prediction failures in the UI instead of crashing the app.
        try:
            issue, urgency = predict_ticket(ticket_text, issue_model, urgency_model)
            return issue, urgency
        except Exception as e:
            return f"Error: {e}", ""

    return gr.Interface(
        fn=wrapped,
        inputs=gr.Textbox(label="Ticket Text", lines=4),
        outputs=[
            gr.Textbox(label="Predicted Issue Type"),
            gr.Textbox(label="Predicted Urgency Level")
        ],
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=[
            ["payment issue with smartwatch v2, underbilled order 29224"],
            ["Router stopped working after update, need immediate help"],
            ["Received damaged headphones in shipment, request refund"],
            ["ordered smartwatch v2 got protab x1 instead order number 76301"],
            ["cant log account keep showing error help"],
            ["tell ecobreeze ac warranty also available black"]
        ]
    )


if __name__ == "__main__":
    df = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    df = preprocess_data(df)
    df = feature_engineering(df)
    issue_model, urgency_model = train_models(df)
    iface = create_gradio_interface(issue_model, urgency_model)
    # share=True publishes a temporary public Gradio link.
    iface.launch(share=True)