File size: 5,481 Bytes
4833083 76d9143 4833083 76d9143 4833083 76d9143 4833083 fa34829 76d9143 4833083 76d9143 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 76d9143 4833083 76d9143 fa34829 76d9143 4833083 76d9143 4833083 fa34829 76d9143 4833083 06cf98c 4833083 06cf98c 76d9143 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 06cf98c 76d9143 8c58d07 4833083 06cf98c 76d9143 06cf98c 76d9143 4833083 76d9143 4833083 fa34829 76d9143 06cf98c 76d9143 4833083 76d9143 fa34829 76d9143 f3590e1 a341e82 f3590e1 76d9143 fa34829 76d9143 4833083 76d9143 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import gradio as gr
import json
# Download NLTK resources needed below: the English stop-word list, plus
# WordNet and its multilingual extension (omw-1.4) for lemmatization.
# nltk.download is a no-op when the corpus is already present locally.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Data loading and preprocessing
def load_data(file_path):
    """Read the support-ticket spreadsheet at *file_path* into a DataFrame.

    Prints the loaded shape as a quick sanity check and returns the frame.
    """
    frame = pd.read_excel(file_path)
    print(f"Loaded data shape: {frame.shape}")
    return frame
def clean_text(text):
    """Lowercase *text* and strip everything except a-z, 0-9 and whitespace.

    Non-string input (e.g. NaN coming out of pandas) yields "".
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.sub(r'[^a-z0-9\s]', '', lowered)
    return ""
def tokenize_lemmatize(text):
    """Split *text* on whitespace, drop English stop words, lemmatize the rest.

    Args:
        text: a pre-cleaned (lowercased, punctuation-free) string.

    Returns:
        list of lemmatized, non-stop-word tokens.
    """
    # Constructing WordNetLemmatizer and rebuilding the stop-word set is
    # expensive (stopwords.words reads the corpus from disk) — the original
    # did both on every call, once per DataFrame row. Cache them once on
    # the function object instead.
    if not hasattr(tokenize_lemmatize, '_lemmatizer'):
        tokenize_lemmatize._lemmatizer = WordNetLemmatizer()
        tokenize_lemmatize._stop_words = set(stopwords.words('english'))
    lemmatize = tokenize_lemmatize._lemmatizer.lemmatize
    stop_words = tokenize_lemmatize._stop_words
    return [lemmatize(word) for word in text.split() if word not in stop_words]
def preprocess_data(df):
    """Fill missing ticket fields within each product group, then derive
    cleaned and lemmatized text columns.

    Adds 'clean_text' and 'processed_text' columns in place and returns *df*.
    """
    fill_cols = ['ticket_text', 'issue_type', 'urgency_level']
    # Forward- then backward-fill gaps using rows that share the same product.
    df[fill_cols] = df.groupby('product')[fill_cols].transform(
        lambda grp: grp.ffill().bfill()
    )
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    df['processed_text'] = df['clean_text'].apply(
        lambda txt: ' '.join(tokenize_lemmatize(txt))
    )
    return df
# Feature Engineering
# Simple lexicon-based sentiment score using indicator words such as bad, good, late, etc.
def simple_sentiment(text):
    """Crude lexicon sentiment in [-1, 1]: (#positive - #negative) / #tokens.

    Empty input scores 0.0 (the `or 1` guards against division by zero).
    """
    positive = {'good', 'great', 'excellent', 'thanks'}
    negative = {'bad', 'broken', 'late', 'error', 'issue', 'problem'}
    words = text.split()
    score = 0
    for word in words:
        if word in positive:
            score += 1
        elif word in negative:
            score -= 1
    return score / (len(words) or 1)
def feature_engineering(df):
    """Append numeric features derived from the cleaned ticket text.

    Adds 'ticket_length' (chars), 'word_count', and 'sentiment' columns
    in place and returns *df*.
    """
    cleaned = df['clean_text']
    df['ticket_length'] = cleaned.map(len)
    df['word_count'] = cleaned.map(lambda txt: len(txt.split()))
    df['sentiment'] = cleaned.map(simple_sentiment)
    return df
# Train Models
# Train one RandomForest classifier per target: issue_type and urgency_level.
# Both models share a single train/test split so the held-out rows align.
# Performance for each is printed as a classification report.
def train_models(df):
    """Train two RandomForest classifiers: one for issue type, one for urgency.

    Args:
        df: preprocessed DataFrame with 'processed_text', 'ticket_length',
            'word_count', 'sentiment', 'issue_type', 'urgency_level' columns.

    Returns:
        (issue_model, urgency_model) — two fitted sklearn Pipelines.
        Also prints a classification report for each on the held-out split.
    """
    feature_cols = ['processed_text', 'ticket_length', 'word_count', 'sentiment']
    X = df[feature_cols]
    y_issue = df['issue_type']
    y_urgency = df['urgency_level']
    # Split once for both targets so the same rows are held out for each.
    X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
        X, y_issue, y_urgency, test_size=0.2, random_state=42
    )

    def make_model():
        # Build a FRESH preprocessor per pipeline. sklearn fits pipeline
        # steps in place (no cloning), so the original code — which shared
        # one ColumnTransformer object between both pipelines — had the
        # urgency fit silently refit the transformer living inside the
        # already-fitted issue model.
        preprocessor = ColumnTransformer([
            ('text', Pipeline([('tfidf', TfidfVectorizer(max_features=500))]), 'processed_text'),
            ('numeric', 'passthrough', ['ticket_length', 'word_count', 'sentiment'])
        ])
        return Pipeline([
            ('pre', preprocessor),
            ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

    issue_model = make_model()
    urgency_model = make_model()
    issue_model.fit(X_train, y_issue_train)
    urgency_model.fit(X_train, y_urgency_train)
    print("Issue Classification:\n", classification_report(y_issue_test, issue_model.predict(X_test)))
    print("Urgency Classification:\n", classification_report(y_urgency_test, urgency_model.predict(X_test)))
    return issue_model, urgency_model
# Predict Single Ticket
def predict_ticket(ticket_text, issue_model, urgency_model):
    """Classify a single raw ticket string with both fitted models.

    Returns a (predicted_issue_type, predicted_urgency_level) pair.
    """
    cleaned = clean_text(ticket_text)
    # Mirror the feature-engineering step so the one-row frame matches
    # the columns the pipelines were trained on.
    row = {
        'processed_text': ' '.join(tokenize_lemmatize(cleaned)),
        'ticket_length': len(cleaned),
        'word_count': len(cleaned.split()),
        'sentiment': simple_sentiment(cleaned),
    }
    features = pd.DataFrame([row])
    issue = issue_model.predict(features)[0]
    urgency = urgency_model.predict(features)[0]
    return issue, urgency
# Generating the Gradio Interface as per task
def create_gradio_interface(issue_model, urgency_model):
    """Build (but do not launch) the Gradio UI wrapping the two models."""

    def classify(ticket_text):
        # Report any prediction failure in the first output box rather
        # than crashing the interface.
        try:
            return predict_ticket(ticket_text, issue_model, urgency_model)
        except Exception as e:
            return f"Error: {e}", ""

    example_tickets = [
        ["payment issue with smartwatch v2, underbilled order 29224"],
        ["Router stopped working after update, need immediate help"],
        ["Received damaged headphones in shipment, request refund"],
        ["ordered smartwatch v2 got protab x1 instead order number 76301"],
        ["cant log account keep showing error help"],
        ["tell ecobreeze ac warranty also available black"]
    ]
    output_boxes = [
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency Level")
    ]
    return gr.Interface(
        fn=classify,
        inputs=gr.Textbox(label="Ticket Text", lines=4),
        outputs=output_boxes,
        title="Support Ticket Classifier",
        description="Enter a ticket to classify its issue type and urgency level.",
        examples=example_tickets
    )
if __name__ == "__main__":
    # End-to-end run: load -> preprocess -> feature-engineer -> train -> serve.
    tickets = load_data("ai_dev_assignment_tickets_complex_1000.xls")
    tickets = preprocess_data(tickets)
    tickets = feature_engineering(tickets)
    issue_model, urgency_model = train_models(tickets)
    app = create_gradio_interface(issue_model, urgency_model)
    # share=True publishes a temporary public Gradio link.
    app.launch(share=True)
|