Upload 4 files
- src/entity_extraction.py +56 -0
- src/features.py +18 -0
- src/model.py +21 -0
- src/preprocessing.py +31 -0
src/entity_extraction.py
ADDED
@@ -0,0 +1,56 @@
# entity_extraction.py

import re
import dateparser

# Extend your product list based on your dataset or domain
PRODUCT_LIST = [
    "productA", "productB", "productC", "laptop", "phone", "router", "headphones"
]

# Keywords indicating complaints or issues
COMPLAINT_KEYWORDS = [
    "broken", "late", "error", "delay", "fault", "not working", "slow", "missing", "haven’t received"
]

def extract_entities(text):
    """
    Extracts products, dates, and complaint keywords from the input text.

    Args:
        text (str): Customer support ticket text.

    Returns:
        dict: Dictionary with lists of extracted 'products', 'dates', and 'complaints'.
    """
    text_lower = text.lower()

    # Product extraction - check presence of product keywords
    products_found = [p for p in PRODUCT_LIST if p.lower() in text_lower]

    # Date extraction - exact dates and fuzzy relative dates
    date_phrases = re.findall(
        r'\b(?:last week|yesterday|today|on \w+ \d{1,2}|\d{2}/\d{2}/\d{4})\b',
        text_lower
    )
    # Filter only valid dates using dateparser
    dates_found = [d for d in date_phrases if dateparser.parse(d)]

    # Complaint extraction - check for complaint keywords
    complaints_found = [word for word in COMPLAINT_KEYWORDS if word in text_lower]

    return {
        'products': products_found,
        'dates': dates_found,
        'complaints': complaints_found
    }


# Example usage
if __name__ == "__main__":
    sample_text = (
        "I ordered a laptop last week but still haven’t received it. "
        "This delay is frustrating and I need help."
    )
    entities = extract_entities(sample_text)
    print("Extracted Entities:", entities)
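Run as a script, the example should print roughly: Extracted Entities: {'products': ['laptop'], 'dates': ['last week'], 'complaints': ['delay', 'haven’t received']} (exact output depends on the installed dateparser version). Note that the keyword "haven’t received" uses a curly apostrophe, so tickets typed with a straight apostrophe will not match that entry.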
src/features.py
ADDED
@@ -0,0 +1,18 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import numpy as np
from scipy.sparse import hstack

def create_features(df):
    # TF-IDF over the cleaned ticket text (top 1000 terms)
    tfidf = TfidfVectorizer(max_features=1000)
    X_tfidf = tfidf.fit_transform(df['clean_text'])

    # Hand-crafted features: ticket length (word count) and TextBlob sentiment polarity
    df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))
    df['sentiment'] = df['clean_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Stack the sparse TF-IDF matrix with the two dense numeric columns
    X_features = hstack([
        X_tfidf,
        np.array(df['ticket_length']).reshape(-1, 1),
        np.array(df['sentiment']).reshape(-1, 1)
    ])
    return X_features, tfidf
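For reference, a minimal usage sketch for create_features (the toy DataFrame below is illustrative, not part of the repository, and assumes src/ is on the import path):

import pandas as pd
from features import create_features

# Hypothetical two-ticket frame; real data comes from load_and_preprocess_data()
df = pd.DataFrame({'clean_text': ["laptop arrived broken", "order delayed last week"]})
X, tfidf = create_features(df)
print(X.shape)  # (2, n_tfidf_terms + 2): TF-IDF columns plus length and sentiment

Note that hstack returns a sparse matrix, which the logistic regression in model.py accepts directly.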
src/model.py
ADDED
@@ -0,0 +1,21 @@
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

def train_and_evaluate(X, y, test_size=0.2, random_state=42):
    # Hold out a test split, fit a logistic-regression baseline, and report metrics
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return model

def save_model(model, filename):
    joblib.dump(model, filename)

def load_model(filename):
    return joblib.load(filename)
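Taken together, the four modules suggest a pipeline along these lines (a sketch only: the input path "data/tickets.xlsx" and the choice of 'issue_type' as the target are assumptions based on the columns used in preprocessing.py):

from preprocessing import load_and_preprocess_data
from features import create_features
from model import train_and_evaluate, save_model

df = load_and_preprocess_data("data/tickets.xlsx")  # hypothetical path
X, tfidf = create_features(df)
model = train_and_evaluate(X, df['issue_type'])     # or df['urgency_level']
save_model(model, "issue_type_model.joblib")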
src/preprocessing.py
ADDED
@@ -0,0 +1,31 @@
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # word_tokenize requires this in newer NLTK releases
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # remove special chars
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

def load_and_preprocess_data(filepath):
    df = pd.read_excel(filepath)
    # Drop rows with missing critical labels
    df = df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'])
    df['clean_text'] = df['ticket_text'].apply(clean_text)
    # Fill missing product info with empty string
    df['product'] = df['product'].fillna('')
    return df
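A quick sanity check of clean_text (the input string is illustrative; exact tokens depend on the NLTK stopword list and lemmatizer, and the first import triggers the NLTK downloads):

from preprocessing import clean_text

print(clean_text("My router's firmware update FAILED on 03/05/2024!!"))
# roughly: "router firmware update failed 03 05 2024"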