hansche committed on
Commit
cb4d679
·
verified ·
1 Parent(s): bcbbed8

Create preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +199 -0
preprocessor.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import spacy
4
+ from langdetect import detect_langs
5
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
+ from sklearn.decomposition import LatentDirichletAllocation
7
+ from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
8
+ from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
9
+ from sklearn.cluster import KMeans
10
+ from sklearn.manifold import TSNE
11
+ import numpy as np
12
+ import torch
13
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
14
+ import streamlit as st
15
+
16
# Multilingual sentiment model — lighter than the full XLM-R variants.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
18
+
19
# Cache model loading with fallback for quantization
@st.cache_resource
def load_model():
    """Load the sentiment tokenizer/model once per Streamlit session.

    Returns:
        tuple: (tokenizer, model, config, device) where device is
        "cuda" when a GPU is available, otherwise "cpu".
    """
    import platform  # local import: only used to pick the quantization engine

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

    # Dynamic quantization is a CPU-only optimization — skip it entirely
    # when running on CUDA (the original attempted it regardless).
    if device == "cpu":
        try:
            # The engine is a CPU-architecture choice, not a CUDA one:
            # qnnpack targets ARM, fbgemm targets x86.
            arch = platform.machine().lower()
            torch.backends.quantized.engine = (
                "qnnpack" if ("arm" in arch or "aarch64" in arch) else "fbgemm"
            )
            model = torch.quantization.quantize_dynamic(
                model, {torch.nn.Linear}, dtype=torch.qint8
            )
            print("Model quantized successfully.")
        except RuntimeError as e:
            # Quantization support varies by build; fall back gracefully.
            print(f"Quantization failed: {e}. Using non-quantized model.")

    config = AutoConfig.from_pretrained(MODEL)
    return tokenizer, model, config, device
38
+
39
# Shared NLP resources, initialized once at import time.
tokenizer, model, config, device = load_model()

# spaCy pipelines used by lemmatize_text for French and English.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# Combined English + French stop words for the vectorizers below.
custom_stop_words = list(ENGLISH_STOP_WORDS | FRENCH_STOP_WORDS)
44
+
45
def preprocess(text):
    """Normalize a tweet-style string for the sentiment model.

    Mentions (``@name``) become ``@user`` and URLs become ``http`` so the
    model sees the same placeholder tokens it was trained on.

    NOTE(review): this function is shadowed by the later
    ``def preprocess(data)`` WhatsApp parser in this module, so callers
    using the module-level name get the wrong function. TODO: rename one.

    Args:
        text: input value; non-strings are coerced with ``str()``.

    Returns:
        str: normalized text, or "" for None / un-stringifiable input.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:  # narrowed from a bare except
            return ""

    def _mask(token):
        # A lone "@" is left untouched; only real mentions are masked.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(t) for t in text.split(" "))
59
+
60
def clean_message(text):
    """Lower-case a chat message and strip boilerplate, URLs and symbols."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Remove WhatsApp-export boilerplate tokens.
    for boilerplate in ("<media omitted>", "this message was deleted", "null"):
        text = text.replace(boilerplate, "")
    # Drop links first, then anything outside letters/digits/whitespace.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)
    return text.strip()
68
+
69
def lemmatize_text(text, lang):
    """Lemmatize *text*, using the French pipeline when ``lang == 'fr'``
    and the English one otherwise; punctuation tokens are dropped."""
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    doc = pipeline(text)
    return " ".join(tok.lemma_ for tok in doc if not tok.is_punct)
75
+
76
def preprocess(data):
    """Parse a raw WhatsApp chat export and run LDA topic modeling.

    NOTE(review): this shadows the earlier tweet-normalizing
    ``preprocess(text)`` defined above. TODO: rename one of the two.

    Args:
        data: full export text, lines like "MM/DD/YY, HH:MM AM - Sender: msg".

    Returns:
        tuple: (df, topics) — df has one row per user message with date
        features, cleaned/lemmatized text and an LDA ``topic`` id;
        topics is a list of the top-10 words per LDA component.
    """
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    filtered_messages, valid_dates = [], []

    for line in data.strip().split("\n"):
        match = re.match(pattern, line)
        if match:
            entry = match.groupdict()
            sender = entry.get("Sender")
            # Drop system notifications; keep only real user messages.
            if sender and sender.strip().lower() != "system":
                filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
                # WhatsApp inserts narrow/no-break spaces before AM/PM;
                # normalize them so strptime can parse the timestamp.
                time_str = entry['Time'].replace('\u202f', ' ').replace('\xa0', ' ')
                valid_dates.append(f"{entry['Date']}, {time_str}")

    df = pd.DataFrame({'user_message': filtered_messages, 'message_date': valid_dates})
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    # Split "Sender: message" into separate user/message columns.
    users, messages = [], []
    msg_pattern = r"^(.*?):\s(.*)$"
    for message in df["user_message"]:
        match = re.match(msg_pattern, message)
        if match:
            users.append(match.group(1))
            messages.append(match.group(2))
        else:
            users.append("group_notification")
            messages.append(message)

    df["user"] = users
    df["message"] = messages
    df = df[df["user"] != "group_notification"].reset_index(drop=True)
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Extract time-based features
    df['year'] = pd.to_numeric(df['date'].dt.year, downcast='integer')
    df['month'] = df['date'].dt.month_name()
    df['day'] = pd.to_numeric(df['date'].dt.day, downcast='integer')
    df['hour'] = pd.to_numeric(df['date'].dt.hour, downcast='integer')
    df['day_of_week'] = df['date'].dt.day_name()

    # Lemmatize messages for topic modeling
    lemmatized_messages = []
    for message in df["message"]:
        try:
            # detect_langs returns a ranked list of Language objects;
            # take the top candidate's language *code*. (The original
            # compared the whole list to 'fr', which never matched, so
            # French text was always lemmatized with the English model.)
            lang = detect_langs(message)[0].lang
            lemmatized_messages.append(lemmatize_text(message, lang))
        except Exception:
            # Detection fails on empty/ambiguous text — keep row, no lemmas.
            lemmatized_messages.append("")
    df["lemmatized_message"] = lemmatized_messages

    df = df[df["message"].notnull() & (df["message"] != "")].copy()
    df.drop(columns=["user_message"], inplace=True)

    # Perform topic modeling
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    dtm = vectorizer.fit_transform(df['lemmatized_message'])

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    # Assign topics to messages
    topic_results = lda.transform(dtm)
    df = df.iloc[:topic_results.shape[0]].copy()
    df['topic'] = topic_results.argmax(axis=1)

    # Store the top-10 words of each LDA component for visualization.
    feature_names = vectorizer.get_feature_names_out()
    topics = [[feature_names[i] for i in component.argsort()[-10:]]
              for component in lda.components_]
    print("Top words for each topic-----------------------------------------------------:")
    print(topics)

    return df, topics
151
+
152
def preprocess_for_clustering(df, n_clusters=5):
    """Cluster lemmatized messages with KMeans over TF-IDF features.

    Args:
        df: DataFrame with a ``lemmatized_message`` column.
        n_clusters: requested cluster count (clamped to the number of
            available messages so KMeans cannot raise on small chats).

    Returns:
        tuple: (df with a ``cluster`` column, 2-D t-SNE embedding,
        KMeans cluster centers).

    Raises:
        ValueError: if fewer than two non-empty messages remain.
    """
    df = df[df["lemmatized_message"].notnull() & (df["lemmatized_message"].str.strip() != "")]
    df = df.reset_index(drop=True)

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    n_samples = tfidf_matrix.shape[0]
    if n_samples < 2:
        raise ValueError("Not enough messages for clustering.")

    df = df.iloc[:n_samples].copy()

    # KMeans requires n_clusters <= n_samples; clamp for small chats.
    kmeans = KMeans(n_clusters=min(n_clusters, n_samples), random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)

    df['cluster'] = clusters
    # t-SNE raises when perplexity >= n_samples: keep the default 30 for
    # large data, shrink it for small chats (original crashed below 31 rows).
    tsne = TSNE(n_components=2, random_state=42,
                perplexity=min(30, n_samples - 1))
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())

    return df, reduced_features, kmeans.cluster_centers_
172
+
173
+
174
def predict_sentiment_batch(texts: list, batch_size: int = 32) -> list:
    """Predict sentiment labels for a list of texts.

    Args:
        texts: raw input strings (non-strings are coerced).
        batch_size: number of texts per model forward pass.

    Returns:
        list[str]: one label from ``config.id2label`` per input text.

    Raises:
        TypeError: if ``texts`` is not a list.
    """
    if not isinstance(texts, list):
        raise TypeError(f"Expected list of texts, got {type(texts)}")

    def _normalize(text):
        # Tweet-style normalization (mentions -> @user, URLs -> http).
        # Inlined because the module-level name ``preprocess`` is
        # shadowed by the WhatsApp-export parser defined later in this
        # file, so calling ``preprocess(text)`` here would invoke the
        # wrong function and fail at runtime.
        if text is None:
            return ""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception:
                return ""
        tokens = []
        for t in text.split(" "):
            if t.startswith('@') and len(t) > 1:
                t = '@user'
            elif t.startswith('http'):
                t = 'http'
            tokens.append(t)
        return " ".join(tokens)

    processed_texts = [_normalize(text) for text in texts]

    predictions = []
    for i in range(0, len(processed_texts), batch_size):
        batch = processed_texts[i:i + batch_size]

        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        ).to(device)

        # Inference only — no gradient buffers needed.
        with torch.no_grad():
            outputs = model(**inputs)

        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        predictions.extend(config.id2label[p] for p in batch_preds)

    return predictions