PixelPoppie committed on
Commit
9fbbb5c
·
1 Parent(s): 62ee460

updated model, updated app, only used tf-idf

Browse files
MfrID_encoder.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f68f8424f6f855b23273b2e4cdd7c73d1110d951c51bb44909acbaacebf7dafd
3
- size 2862436
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a687c15c69eeafc5c1145fd198ed9cb06d71c5732b06239c942744df1b15e79
3
+ size 8730949
SPSC_encoder.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d3702df24056d5827adbb6c96b276893af28aa604678945ef0a2edf1bea583d
3
- size 68
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b27e44a468816577f36b6ee4ee835fb12bdb57bcab2792fd0bae737c1a5f3a
3
+ size 5
app.py CHANGED
@@ -10,333 +10,170 @@ import nltk
10
  import os
11
  import re
12
  import logging
13
- import gensim
14
- import os
15
  from sklearn.feature_extraction.text import TfidfVectorizer
16
  from sklearn.decomposition import TruncatedSVD
17
  from nltk.tokenize import word_tokenize
18
  from huggingface_hub import hf_hub_download
19
 
20
  # Download the 'punkt' tokenizer
21
- nltk.download('punkt')
22
 
23
  # Set up logging
24
  logging.basicConfig(level=logging.DEBUG)
25
  logger = logging.getLogger(__name__)
26
 
27
- # Download NLTK data
28
- nltk.download('punkt')
29
-
30
  # Get Hugging Face access token from environment variable
31
  access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
32
 
33
  # Set the repository IDs
34
- model_repo_id = 'apoppie/camcor_rf_20240929_174751'
35
 
36
  # Load the trained Random Forest model from Hugging Face Hub
37
  model_file = hf_hub_download(
38
  repo_id=model_repo_id,
39
- filename='random_forest_model_20240929_174751.pkl',
40
  use_auth_token=access_token,
41
  resume_download=True
42
  )
43
- with open(model_file, 'rb') as f:
44
- rf_model = pickle.load(f)
45
 
46
- # Assuming encoder and vectorizer .pkl files are in the same directory as app.py
47
  current_dir = os.path.dirname(os.path.abspath(__file__))
48
 
49
- # Load MfrID encoder from local directory
50
- mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
51
  with open(mfrid_encoder_path, 'rb') as f:
52
  mfrid_encoder = pickle.load(f)
53
 
54
- # Load SPSC encoder from local directory
55
- spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
56
  with open(spsc_encoder_path, 'rb') as f:
57
  spsc_encoder = pickle.load(f)
58
 
59
- # Load SVD model
60
- with open(os.path.join(current_dir, 'svd_model.pkl'), 'rb') as f:
61
- svd_model = pickle.load(f)
62
-
63
- # Load TF-IDF vectorizer
64
- with open(os.path.join(current_dir, 'tfidf_vectorizer.pkl'), 'rb') as f:
65
- tfidf_vectorizer = pickle.load(f)
66
-
67
- # Load Word2Vec model
68
- w2v_model = gensim.models.Word2Vec.load(os.path.join(current_dir, 'w2v_model.model'))
69
-
70
- def binarize_spsc(spsc_codes, spsc_encoder):
71
- """Convert list of SPSC codes to binary values using the provided encoder"""
72
- out = []
73
- format_string = spsc_encoder['format_string']
74
- bits = spsc_encoder['bits']
75
- for v in spsc_codes:
76
- try:
77
- v = int(v) if not pd.isna(v) else 0 # Handle NaN
78
- enc = format_string.format(v)
79
- out.append([int(c) for c in enc])
80
- except Exception:
81
- out.append([0] * bits) # Use zero vector for invalid entries
82
- return out
83
-
84
- def encode_mfrid(mfrid_list, mfrid_encoder):
85
- """Encode MfrID using the provided mapping and convert to binary"""
86
- mfrid_to_int = mfrid_encoder # Assuming mfrid_encoder is a dict
87
- bits = max(mfrid_to_int.values()).bit_length()
88
- mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
89
- encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
90
- binary_encoded = [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
91
- return binary_encoded
92
 
93
- def get_doc_vector(doc_tokens):
94
- word_vectors = [w2v_model.wv[word] for word in doc_tokens if word in w2v_model.wv]
95
- return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
96
 
97
  def preprocess_json_content(content):
98
- logger.debug("Original content: %s", content[:100]) # Log first 100 characters
99
-
100
- # Remove any leading/trailing whitespace
101
  content = content.strip()
102
- logger.debug("After stripping: %s", content[:100])
103
-
104
- # Handle case where content is wrapped in extra quotes
105
  if content.startswith('"') and content.endswith('"'):
106
  content = content[1:-1]
107
- logger.debug("After removing extra quotes: %s", content[:100])
108
-
109
- # Ensure content is wrapped in square brackets
110
  if not content.startswith('['):
111
  content = '[' + content
112
  if not content.endswith(']'):
113
  content = content + ']'
114
- logger.debug("After adding brackets: %s", content[:100])
115
-
116
- # Replace any trailing commas before closing brackets
117
  content = re.sub(r',\s*}', '}', content)
118
  content = re.sub(r',\s*]', ']', content)
119
- logger.debug("After removing trailing commas: %s", content[:100])
120
-
121
  return content
122
 
 
 
 
 
 
 
 
 
123
  def parse_json_line_by_line(content):
124
  data = []
125
- for i, line in enumerate(content.split('\n')):
126
  line = line.strip()
127
  if line:
128
  try:
129
- # Handle lines that start with comma
130
  if line.startswith(','):
131
  line = line[1:]
132
- # Handle lines that are wrapped in square brackets
133
  if line.startswith('[') and line.endswith(']'):
134
  line = line[1:-1]
135
  obj = json.loads(line)
136
  data.append(obj)
137
  except json.JSONDecodeError:
138
- logger.warning(f"Failed to parse line {i+1}: {line}")
139
  return data
140
 
141
  def extract_features(df):
142
- """Extract required features from the DataFrame"""
143
  required_features = ['Description', 'SPSC', 'MfrID']
144
  extracted_features = {}
145
  missing_features = []
146
 
147
  for feature in required_features:
148
- # Try to find a column that contains the feature name (case-insensitive)
149
  matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
150
  if matching_columns:
151
  extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
152
  else:
153
  missing_features.append(feature)
154
- extracted_features[feature] = [''] * len(df) # Add empty placeholder
155
 
156
  return extracted_features, missing_features
157
 
158
- import json
159
- import numpy as np
160
- import pandas as pd
161
- import traceback
162
- import logging
163
- from nltk.tokenize import word_tokenize
 
 
164
 
165
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  def predict_from_json(json_file, confidence_threshold=0.7):
168
  try:
169
- # Read and parse JSON content
170
- content = read_json_file(json_file)
171
  data = parse_json_content(content)
172
 
173
  if not data:
174
- logger.error("No valid JSON objects found in the input")
175
  return "No valid JSON objects found in the input. Please check your JSON format."
176
 
177
- # Convert to DataFrame
178
  df = pd.DataFrame(data)
179
  logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
180
 
181
- # Extract features
182
  extracted_features, missing_features = extract_features(df)
183
 
184
- # Warn about missing features
185
  if missing_features:
186
  logger.warning(f"Missing features: {', '.join(missing_features)}")
187
 
188
- # Preprocess features
189
- try:
190
- description_features = preprocess_descriptions(extracted_features['Description'])
191
- except Exception as e:
192
- logger.error(f"Error in preprocessing descriptions: {str(e)}")
193
- description_features = np.zeros((len(extracted_features['Description']), svd_model.n_components + w2v_model.vector_size))
194
 
195
- spsc_array = preprocess_spsc(extracted_features['SPSC'])
196
- mfrid_array = preprocess_mfrid(extracted_features['MfrID'])
197
-
198
- # Combine all features
199
  X = np.hstack([description_features, spsc_array, mfrid_array])
200
 
201
- # Make predictions with probabilities
202
  predictions = rf_model.predict(X)
203
  probabilities = rf_model.predict_proba(X)
204
  max_probabilities = np.max(probabilities, axis=1)
205
 
206
- # Add predictions and confidence to the DataFrame
207
  df['Prediction'] = predictions
208
  df['Confidence'] = max_probabilities
209
  df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
210
 
211
- # Select columns for output, including only available columns
212
  output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
213
  available_columns = [col for col in output_columns if col in df.columns]
214
  output_df = df[available_columns]
215
 
216
- # Return the DataFrame as a CSV string
217
- csv_result = output_df.to_csv(index=False)
218
- return csv_result
219
 
220
  except Exception as e:
221
  logger.error("An error occurred", exc_info=True)
222
  error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
223
  return f"An error occurred:\n{error_message}"
224
 
225
- def read_json_file(json_file):
226
- if isinstance(json_file, str):
227
- with open(json_file, 'r') as file:
228
- return file.read()
229
- elif hasattr(json_file, 'read'):
230
- content = json_file.read()
231
- return content.decode('utf-8') if isinstance(content, bytes) else content
232
- else:
233
- raise ValueError("Invalid input type. Expected file path or file-like object.")
234
-
235
- def parse_json_content(content):
236
- content = preprocess_json_content(content)
237
- try:
238
- data = json.loads(content)
239
- return [data] if not isinstance(data, list) else data
240
- except json.JSONDecodeError:
241
- return parse_json_line_by_line(content)
242
-
243
- def preprocess_json_content(content):
244
- content = content.strip()
245
- if content.startswith('"') and content.endswith('"'):
246
- content = content[1:-1]
247
- if not content.startswith('['):
248
- content = '[' + content
249
- if not content.endswith(']'):
250
- content = content + ']'
251
- content = content.replace(',]', ']').replace(',}', '}')
252
- return content
253
-
254
- def parse_json_line_by_line(content):
255
- data = []
256
- for line in content.split('\n'):
257
- line = line.strip()
258
- if line:
259
- try:
260
- if line.startswith(','):
261
- line = line[1:]
262
- if line.startswith('[') and line.endswith(']'):
263
- line = line[1:-1]
264
- obj = json.loads(line)
265
- data.append(obj)
266
- except json.JSONDecodeError:
267
- logger.warning(f"Failed to parse line: {line}")
268
- return data
269
-
270
- def extract_features(df):
271
- """Extract required features from the DataFrame"""
272
- required_features = ['Description', 'SPSC', 'MfrID']
273
- extracted_features = {}
274
- missing_features = []
275
-
276
- for feature in required_features:
277
- # Try to find a column that contains the feature name (case-insensitive)
278
- matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
279
- if matching_columns:
280
- extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
281
- else:
282
- missing_features.append(feature)
283
- extracted_features[feature] = [''] * len(df) # Add empty placeholder
284
-
285
- return extracted_features, missing_features
286
-
287
- def preprocess_descriptions(descriptions):
288
- try:
289
- # TF-IDF transformation
290
- tfidf_matrix = tfidf_vectorizer.transform(descriptions)
291
-
292
- # SVD transformation
293
- tfidf_svd = svd_model.transform(tfidf_matrix)
294
-
295
- # Word2Vec processing
296
- tokenized_descriptions = [word_tokenize(str(desc).lower()) for desc in descriptions]
297
- doc_vectors = np.array([get_doc_vector(tokens) for tokens in tokenized_descriptions])
298
-
299
- # Combine TF-IDF-SVD and Word2Vec features
300
- return np.hstack([tfidf_svd, doc_vectors])
301
- except Exception as e:
302
- logger.error(f"Error in preprocess_descriptions: {str(e)}")
303
- # Return a zero vector of appropriate size as a fallback
304
- return np.zeros((len(descriptions), svd_model.n_components + w2v_model.vector_size))
305
-
306
- def preprocess_spsc(spsc_codes):
307
- return np.array(binarize_spsc(spsc_codes, spsc_encoder))
308
-
309
- def preprocess_mfrid(mfrid_list):
310
- return np.array(encode_mfrid(mfrid_list, mfrid_encoder))
311
-
312
- def get_doc_vector(doc_tokens):
313
- try:
314
- word_vectors = [w2v_model.wv[word] for word in doc_tokens if word in w2v_model.wv]
315
- return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
316
- except Exception as e:
317
- logger.error(f"Error in get_doc_vector: {str(e)}")
318
- return np.zeros(w2v_model.vector_size)
319
-
320
- def binarize_spsc(spsc_codes, spsc_encoder):
321
- out = []
322
- format_string = spsc_encoder['format_string']
323
- bits = spsc_encoder['bits']
324
- for v in spsc_codes:
325
- try:
326
- v = int(v) if not pd.isna(v) else 0
327
- enc = format_string.format(v)
328
- out.append([int(c) for c in enc])
329
- except Exception:
330
- out.append([0] * bits)
331
- return out
332
-
333
- def encode_mfrid(mfrid_list, mfrid_encoder):
334
- mfrid_to_int = mfrid_encoder
335
- bits = max(mfrid_to_int.values()).bit_length()
336
- mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
337
- encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
338
- return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
339
-
340
  # Create the Gradio Interface
341
  inputs = [
342
  gr.File(label="Upload JSON File", file_types=['.json']),
@@ -344,12 +181,9 @@ inputs = [
344
  ]
345
  outputs = gr.Textbox(label="Predictions (CSV Format)")
346
 
347
- # Define title and description for the Gradio interface
348
  title = "Camcor Item Prediction Model"
349
- description = "A machine learning model that predicts item categories based on product descriptions."
350
 
351
- # Ensure that the Gradio Interface has title and description
352
  demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
353
 
354
-
355
  demo.launch()
 
10
  import os
11
  import re
12
  import logging
13
+ import scipy.sparse
 
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
  from sklearn.decomposition import TruncatedSVD
16
  from nltk.tokenize import word_tokenize
17
  from huggingface_hub import hf_hub_download
18
 
19
  # Download the 'punkt' tokenizer
20
+ nltk.download('punkt', quiet=True)
21
 
22
  # Set up logging
23
  logging.basicConfig(level=logging.DEBUG)
24
  logger = logging.getLogger(__name__)
25
 
 
 
 
26
  # Get Hugging Face access token from environment variable
27
  access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
28
 
29
  # Set the repository IDs
30
+ model_repo_id = 'apoppie/random_forest_model_20241005_202610'
31
 
32
  # Load the trained Random Forest model from Hugging Face Hub
33
  model_file = hf_hub_download(
34
  repo_id=model_repo_id,
35
+ filename='random_forest_model_20241005_202610.joblib',
36
  use_auth_token=access_token,
37
  resume_download=True
38
  )
39
+ rf_model = joblib.load(model_file)
 
40
 
41
+ # Load other necessary files
42
  current_dir = os.path.dirname(os.path.abspath(__file__))
43
 
44
+ mfrid_encoder_path = os.path.join(current_dir, 'mfrid_encoder.pkl')
 
45
  with open(mfrid_encoder_path, 'rb') as f:
46
  mfrid_encoder = pickle.load(f)
47
 
48
+ spsc_encoder_path = os.path.join(current_dir, 'spsc_encoder.pkl')
 
49
  with open(spsc_encoder_path, 'rb') as f:
50
  spsc_encoder = pickle.load(f)
51
 
52
+ tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer.joblib')
53
+ tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
56
+ svd_model = joblib.load(svd_model_path)
 
57
 
58
  def preprocess_json_content(content):
 
 
 
59
  content = content.strip()
 
 
 
60
  if content.startswith('"') and content.endswith('"'):
61
  content = content[1:-1]
 
 
 
62
  if not content.startswith('['):
63
  content = '[' + content
64
  if not content.endswith(']'):
65
  content = content + ']'
 
 
 
66
  content = re.sub(r',\s*}', '}', content)
67
  content = re.sub(r',\s*]', ']', content)
 
 
68
  return content
69
 
70
+ def parse_json_content(content):
71
+ content = preprocess_json_content(content)
72
+ try:
73
+ data = json.loads(content)
74
+ return [data] if not isinstance(data, list) else data
75
+ except json.JSONDecodeError:
76
+ return parse_json_line_by_line(content)
77
+
78
  def parse_json_line_by_line(content):
79
  data = []
80
+ for line in content.split('\n'):
81
  line = line.strip()
82
  if line:
83
  try:
 
84
  if line.startswith(','):
85
  line = line[1:]
 
86
  if line.startswith('[') and line.endswith(']'):
87
  line = line[1:-1]
88
  obj = json.loads(line)
89
  data.append(obj)
90
  except json.JSONDecodeError:
91
+ logger.warning(f"Failed to parse line: {line}")
92
  return data
93
 
94
  def extract_features(df):
 
95
  required_features = ['Description', 'SPSC', 'MfrID']
96
  extracted_features = {}
97
  missing_features = []
98
 
99
  for feature in required_features:
 
100
  matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
101
  if matching_columns:
102
  extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
103
  else:
104
  missing_features.append(feature)
105
+ extracted_features[feature] = [''] * len(df)
106
 
107
  return extracted_features, missing_features
108
 
109
+ def preprocess_descriptions(descriptions):
110
+ try:
111
+ tfidf_matrix = tfidf_vectorizer.transform(descriptions)
112
+ tfidf_svd = svd_model.transform(tfidf_matrix)
113
+ return tfidf_svd
114
+ except Exception as e:
115
+ logger.error(f"Error in preprocess_descriptions: {str(e)}")
116
+ return np.zeros((len(descriptions), svd_model.n_components_))
117
 
118
+ def binarize_spsc(spsc_codes):
119
+ out = []
120
+ bits = spsc_encoder
121
+ for v in spsc_codes:
122
+ try:
123
+ v = int(v) if not pd.isna(v) else 0
124
+ enc = format(v, f'0{bits}b')
125
+ out.append([int(c) for c in enc])
126
+ except Exception:
127
+ out.append([0] * bits)
128
+ return out
129
+
130
+ def encode_mfrid(mfrid_list):
131
+ bits = max(mfrid_encoder.values()).bit_length()
132
+ mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
133
+ encoded_mfrid = [mfrid_encoder.get(mfrid, mfrid_encoder.get('NaN', 0)) for mfrid in mfrid_list]
134
+ return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
135
 
136
  def predict_from_json(json_file, confidence_threshold=0.7):
137
  try:
138
+ content = json_file.read().decode('utf-8') if hasattr(json_file, 'read') else json_file
 
139
  data = parse_json_content(content)
140
 
141
  if not data:
 
142
  return "No valid JSON objects found in the input. Please check your JSON format."
143
 
 
144
  df = pd.DataFrame(data)
145
  logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
146
 
 
147
  extracted_features, missing_features = extract_features(df)
148
 
 
149
  if missing_features:
150
  logger.warning(f"Missing features: {', '.join(missing_features)}")
151
 
152
+ description_features = preprocess_descriptions(extracted_features['Description'])
153
+ spsc_array = np.array(binarize_spsc(extracted_features['SPSC']))
154
+ mfrid_array = np.array(encode_mfrid(extracted_features['MfrID']))
 
 
 
155
 
 
 
 
 
156
  X = np.hstack([description_features, spsc_array, mfrid_array])
157
 
 
158
  predictions = rf_model.predict(X)
159
  probabilities = rf_model.predict_proba(X)
160
  max_probabilities = np.max(probabilities, axis=1)
161
 
 
162
  df['Prediction'] = predictions
163
  df['Confidence'] = max_probabilities
164
  df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
165
 
 
166
  output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
167
  available_columns = [col for col in output_columns if col in df.columns]
168
  output_df = df[available_columns]
169
 
170
+ return output_df.to_csv(index=False)
 
 
171
 
172
  except Exception as e:
173
  logger.error("An error occurred", exc_info=True)
174
  error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
175
  return f"An error occurred:\n{error_message}"
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  # Create the Gradio Interface
178
  inputs = [
179
  gr.File(label="Upload JSON File", file_types=['.json']),
 
181
  ]
182
  outputs = gr.Textbox(label="Predictions (CSV Format)")
183
 
 
184
  title = "Camcor Item Prediction Model"
185
+ description = "A machine learning model that predicts item categories based on product descriptions, SPSC codes, and Manufacturer IDs."
186
 
 
187
  demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
188
 
 
189
  demo.launch()
description_engineering_models.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8c02f0679219d9a502b311fe531c903b1ee92a850aa078f3c0fe803a8319653
3
- size 192414565
 
 
 
 
requirements.txt CHANGED
@@ -3,8 +3,6 @@ pandas
3
  numpy
4
  scikit-learn==1.3.1
5
  nltk
6
- gensim
7
- matplotlib
8
- seaborn
9
- tqdm
10
  huggingface_hub
 
 
3
  numpy
4
  scikit-learn==1.3.1
5
  nltk
6
+ scipy
 
 
 
7
  huggingface_hub
8
+ joblib
setup.sh CHANGED
@@ -1,4 +1,2 @@
1
- # setup.sh
2
- python -c "import nltk; nltk.download('punkt')"
3
-
4
  python -m nltk.downloader punkt
 
1
+ #!/bin/bash
 
 
2
  python -m nltk.downloader punkt
w2v_model.model.syn1neg.npy → svd_model.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b74d6285b82c2b7fb6c501a3bb5179dfc2d37de0befaf1734bf5d9c7624dbe17
3
- size 86028128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a0652f61e8da1fbc7cf5a62f3a91cf931524a7a350d576bdd6b6ee28350d260
3
+ size 16010343
svd_model.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef6a3f0eaef3bb823e25f85cf234abe32d95917f754f06e8d1d7c7652f696d6a
3
- size 12007799
 
 
 
 
tfidf_vectorizer.pkl → tfidf_vectorizer.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a95c5adc47bd65250c271f4d3e6b12822c26206fe1997ef1f7e82e30f2f13f27
3
- size 191654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df4279bc9d00aaae93a11f3036008a4c90a4cab827d9d80460dde91bf734d41
3
+ size 191789
w2v_model.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:80420b2c39799879e6dae5ebbb71fa0b2740b9a8c9ae0699ac52263770ffe090
3
- size 7298780
 
 
 
 
w2v_model.model.wv.vectors.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9db84fed90d499bf7baaf423a26ffa04dd3c22c2cb1f62280f09e8709a50e80a
3
- size 86028128