Satyam0077 committed on
Commit
b5c1242
·
verified ·
1 Parent(s): 2c8827f

Upload 4 files

Browse files
Files changed (4) hide show
  1. src/entity_extraction.py +56 -0
  2. src/features.py +18 -0
  3. src/model.py +21 -0
  4. src/preprocessing.py +31 -0
src/entity_extraction.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # entity_extraction.py
2
+
3
+ import re
4
+ import dateparser
5
+
6
# Product names to look for in ticket text; extend this list to fit your
# dataset or domain.
PRODUCT_LIST = [
    "productA",
    "productB",
    "productC",
    "laptop",
    "phone",
    "router",
    "headphones",
]

# Words and phrases that indicate a complaint or an issue.
COMPLAINT_KEYWORDS = [
    "broken",
    "late",
    "error",
    "delay",
    "fault",
    "not working",
    "slow",
    "missing",
    "haven’t received",
]
15
+
16
def extract_entities(text):
    """
    Extracts products, dates, and complaint keywords from the input text.

    Products are matched as whole words (an optional plural "s" is allowed),
    so a ticket about "headphones" does not also report "phone". Complaint
    keywords must start at a word boundary, so "late" does not fire on
    "related" while "delayed" still matches "delay". Typographic apostrophes
    are folded to plain ones on both sides of the comparison, so "haven't"
    and "haven’t" are treated alike.

    Args:
        text (str): Customer support ticket text. ``None`` or any other
            non-string value is treated as an empty ticket.

    Returns:
        dict: Dictionary with lists of extracted 'products', 'dates', and
            'complaints'. Products and complaints are reported with the
            spelling used in PRODUCT_LIST / COMPLAINT_KEYWORDS; dates are the
            lower-cased phrases found in the text.
    """
    if not isinstance(text, str):
        text = ""

    # Lower-case once and fold the curly apostrophe so keyword comparison
    # does not depend on which apostrophe the customer's keyboard produced.
    text_lower = text.lower().replace("’", "'")

    # Product extraction - whole-word match, tolerating a plural "s"
    products_found = [
        product for product in PRODUCT_LIST
        if re.search(
            r'(?<!\w)' + re.escape(product.lower().replace("’", "'")) + r's?(?!\w)',
            text_lower
        )
    ]

    # Date extraction - exact dates and fuzzy relative dates
    date_phrases = re.findall(
        r'\b(?:last week|yesterday|today|on \w+ \d{1,2}|\d{2}/\d{2}/\d{4})\b',
        text_lower
    )
    # Keep only the phrases dateparser can turn into an actual date
    dates_found = [d for d in date_phrases if dateparser.parse(d)]

    # Complaint extraction - keyword must begin a word; suffixes such as
    # "delayed" or "errors" are still accepted
    complaints_found = [
        keyword for keyword in COMPLAINT_KEYWORDS
        if re.search(
            r'(?<!\w)' + re.escape(keyword.lower().replace("’", "'")),
            text_lower
        )
    ]

    return {
        'products': products_found,
        'dates': dates_found,
        'complaints': complaints_found
    }
47
+
48
+
49
# Example usage: run this module directly to see the extractor's output
if __name__ == "__main__":
    demo_ticket = (
        "I ordered a laptop last week but still haven’t received it. "
        "This delay is frustrating and I need help."
    )
    print("Extracted Entities:", extract_entities(demo_ticket))
src/features.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from textblob import TextBlob
3
+ import numpy as np
4
+ from scipy.sparse import hstack
5
+
6
def create_features(df, tfidf=None):
    """
    Build the feature matrix from preprocessed ticket text.

    The result stacks, column-wise, the TF-IDF representation of
    ``df['clean_text']``, the ticket length (number of whitespace-separated
    tokens) and the TextBlob sentiment polarity of each ticket.

    Note:
        ``df`` is modified in place: the 'ticket_length' and 'sentiment'
        columns are added (or overwritten).

    Args:
        df: DataFrame with a 'clean_text' column of strings.
        tfidf: Optional, already fitted TfidfVectorizer. When given, it is
            only used to transform the text, so new data can be projected
            into the same feature space as the training data. When omitted
            (the default), a new vectorizer limited to 1000 features is
            fitted on ``df['clean_text']``.

    Returns:
        tuple: ``(X_features, tfidf)`` - the stacked sparse feature matrix
            and the vectorizer that produced its TF-IDF columns.
    """
    if tfidf is None:
        tfidf = TfidfVectorizer(max_features=1000)
        X_tfidf = tfidf.fit_transform(df['clean_text'])
    else:
        # Reuse the caller's vocabulary/IDF weights instead of refitting.
        X_tfidf = tfidf.transform(df['clean_text'])

    df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))
    df['sentiment'] = df['clean_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # The two scalar features are reshaped to single columns so they can be
    # appended to the right of the TF-IDF matrix.
    X_features = hstack([
        X_tfidf,
        np.array(df['ticket_length']).reshape(-1, 1),
        np.array(df['sentiment']).reshape(-1, 1)
    ])
    return X_features, tfidf
src/model.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.linear_model import LogisticRegression
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.metrics import classification_report
4
+ import joblib
5
+
6
def train_and_evaluate(X, y, test_size=0.2, random_state=42):
    """
    Fit a logistic-regression classifier and print its hold-out report.

    Args:
        X: Feature matrix.
        y: Target labels, one per row of ``X``.
        test_size: Share of the data held out for evaluation.
        random_state: Seed passed to the train/test split.

    Returns:
        The fitted LogisticRegression model.
    """
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    classifier = LogisticRegression(max_iter=500)
    classifier.fit(X_train, y_train)

    # Report precision/recall/F1 on the held-out portion.
    print(classification_report(y_test, classifier.predict(X_test)))

    return classifier
16
+
17
def save_model(model, filename):
    """Serialize ``model`` to ``filename`` using joblib."""
    joblib.dump(model, filename)
19
+
20
def load_model(filename):
    """
    Load and return an object previously written with joblib.

    NOTE(review): joblib.load is pickle-based, so only load files from a
    trusted source.
    """
    restored = joblib.load(filename)
    return restored
src/preprocessing.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.stem import WordNetLemmatizer
6
+
7
# Fetch the NLTK resources this module relies on. Newer NLTK releases load
# 'punkt_tab' in word_tokenize, while older ones use 'punkt', so both are
# requested to keep tokenisation working across versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Built once at import time and shared by every clean_text call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
13
+
14
def clean_text(text):
    """
    Normalise one ticket into a space-joined string of lemmatised tokens.

    Steps: lower-case the text, replace every character other than a-z, 0-9
    and whitespace with a space, tokenise with NLTK, drop English stop words,
    and lemmatise what remains.

    Args:
        text: Raw ticket text. Missing values (None/NaN) yield an empty
            string; other non-string scalars (for example a numeric
            spreadsheet cell) are converted with ``str()`` first.

    Returns:
        str: The cleaned text, or "" for missing input.
    """
    if pd.isna(text):
        return ""
    # Coerce first: a cell holding only digits arrives as a number and has
    # no .lower() method.
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # remove special chars
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)
23
+
24
def load_and_preprocess_data(filepath):
    """
    Read the ticket spreadsheet and prepare it for feature extraction.

    Rows missing the ticket text or either label are discarded, a
    'clean_text' column is derived from 'ticket_text', and missing 'product'
    values become empty strings.

    Args:
        filepath: Location of the Excel file to read.

    Returns:
        The preprocessed DataFrame.
    """
    required_columns = ['ticket_text', 'issue_type', 'urgency_level']

    # Rows without text or labels cannot be used, so drop them up front.
    tickets = pd.read_excel(filepath).dropna(subset=required_columns)

    tickets['clean_text'] = tickets['ticket_text'].apply(clean_text)
    # Product is optional; an empty string stands in for "unknown".
    tickets['product'] = tickets['product'].fillna('')
    return tickets