Moncey10 committed on
Commit
4112bd3
·
verified ·
1 Parent(s): 3690686

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/creditcard.csv filter=lfs diff=lfs merge=lfs -text
37
+ data/training.csv filter=lfs diff=lfs merge=lfs -text
data/creditcard.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76274b691b16a6c49d3f159c883398e03ccd6d1ee12d9d8ee38f4b4b98551a89
3
+ size 150828752
data/training.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08cba6317a49528fcd074f9043aafcd5ad6c6be45ede159c4e36cec33af24afe
3
+ size 238803811
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (153 Bytes). View file
 
src/data/__pycache__/preprocess.cpython-313.pyc ADDED
Binary file (1.03 kB). View file
 
src/data/preprocess.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from nltk.corpus import stopwords
3
+ from nltk.stem.porter import PorterStemmer
4
+
5
# Module-level singletons: built once at import time and reused by every
# clean_text call. Loading the stop-word list requires the NLTK "stopwords"
# corpus to be available locally.
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Matches every character that is not an ASCII letter. Compiled once because
# clean_text is invoked once per document.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_text(text: str) -> str:
    """Normalize raw text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Coerce *text* to ``str`` and replace every non ASCII-letter
         character with a space.
      2. Lower-case and split on whitespace.
      3. Drop tokens found in ``stop_words``.
      4. Stem the remaining tokens with ``port_stem``.

    Args:
        text: Input document. Non-string values are converted with ``str()``.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty
        string when nothing survives.
    """
    lowered = _NON_LETTER_RE.sub(' ', str(text)).lower()
    return ' '.join(
        port_stem.stem(word)
        for word in lowered.split()
        if word not in stop_words
    )
19
+
src/model/infernce.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import pandas as pd
3
+
4
class SentimentModel:
    """Load a pickled classifier and expose string-labelled predictions.

    The unpickled object is stored on ``self.model`` and must provide a
    ``predict`` method that accepts a list of texts and returns an indexable
    sequence of labels.
    """

    def __init__(self, model_path='artifacts/model.pkl'):
        """Unpickle the model stored at *model_path*.

        Args:
            model_path: Path to the pickle file to load.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point this at artifacts produced by a trusted source.
        with open(model_path, 'rb') as f:
            self.model = pickle.load(f)

    def predict(self, text: str) -> str:
        """Classify a single text.

        Args:
            text: The document to classify; it is passed to the model as a
                one-element list.

        Returns:
            ``"Positive"`` when the model's label equals 1, otherwise
            ``"Negative"``.
        """
        pred = self.model.predict([text])[0]
        return "Positive" if pred == 1 else "Negative"
src/model/train.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+
6
+ from src.pipeline.build_pipeline import build_pipeline
7
+
8
+
9
def train_model(data_path='data/training.csv',
                model_path='artifacts/model.pkl',
                sample_size=1000):
    """Train the text-classification pipeline and pickle it to disk.

    Reads a header-less CSV, samples rows from it, fits the pipeline returned
    by ``build_pipeline()`` on an 80/20 stratified split, prints train and
    test accuracy, and writes the fitted pipeline to *model_path*.

    Args:
        data_path: CSV file read as ISO-8859-1 with the column order
            target, id, date, flag, user, text.
        model_path: Destination of the pickled pipeline. Missing parent
            directories are created.
        sample_size: Maximum number of rows to train/evaluate on; clamped to
            the number of rows actually present.
    """
    import os  # local import so this block does not rely on file-level imports

    columns = ['target', 'id', 'date', 'flag', 'user', 'text']

    # Only the label and the text are used below, so skip the other columns
    # to keep memory down on the full dataset.
    data = pd.read_csv(
        data_path,
        names=columns,
        usecols=['target', 'text'],
        encoding='ISO-8859-1',
    )

    # Labels equal to 4 are rewritten to 1 (presumably leaving a binary 0/1
    # target -- TODO confirm no other label values occur).
    data['target'] = data['target'].replace(4, 1)

    # Clamp the sample size: DataFrame.sample raises when asked for more rows
    # than exist.
    data = data.sample(min(sample_size, len(data)), random_state=42)

    X = data['text']
    y = data['target']

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=2
    )

    pipeline = build_pipeline()
    pipeline.fit(x_train, y_train)

    print("Train Accuracy:", accuracy_score(y_train, pipeline.predict(x_train)))
    print("Test Accuracy:", accuracy_score(y_test, pipeline.predict(x_test)))

    # The output directory is not guaranteed to exist; create it instead of
    # failing with FileNotFoundError after the training work is done.
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(model_path, 'wb') as f:
        pickle.dump(pipeline, f)


if __name__ == "__main__":
    train_model()
43
+
44
+
45
+
46
+
src/models/__pycache__/inference.cpython-313.pyc ADDED
Binary file (1.13 kB). View file
 
src/models/__pycache__/train.cpython-313.pyc ADDED
Binary file (1.7 kB). View file
 
src/models/inference.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import pandas as pd
3
+
4
class SentimentModel:
    """Load a pickled classifier and expose string-labelled predictions.

    The unpickled object is stored on ``self.model`` and must provide a
    ``predict`` method that accepts a list of texts and returns an indexable
    sequence of labels.
    """

    def __init__(self, model_path='artifacts/model.pkl'):
        """Unpickle the model stored at *model_path*.

        Args:
            model_path: Path to the pickle file to load.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point this at artifacts produced by a trusted source.
        with open(model_path, 'rb') as f:
            self.model = pickle.load(f)

    def predict(self, text: str) -> str:
        """Classify a single text.

        Args:
            text: The document to classify; it is passed to the model as a
                one-element list.

        Returns:
            ``"Positive"`` when the model's label equals 1, otherwise
            ``"Negative"``.
        """
        pred = self.model.predict([text])[0]
        return "Positive" if pred == 1 else "Negative"
src/models/train.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+
6
+ from src.pipeline.build_pipeline import build_pipeline
7
+
8
+
9
def train_model(data_path='data/training.csv',
                model_path='artifacts/model.pkl',
                sample_size=1000):
    """Train the text-classification pipeline and pickle it to disk.

    Reads a header-less CSV, samples rows from it, fits the pipeline returned
    by ``build_pipeline()`` on an 80/20 stratified split, prints train and
    test accuracy, and writes the fitted pipeline to *model_path*.

    Args:
        data_path: CSV file read as ISO-8859-1 with the column order
            target, id, date, flag, user, text.
        model_path: Destination of the pickled pipeline. Missing parent
            directories are created.
        sample_size: Maximum number of rows to train/evaluate on; clamped to
            the number of rows actually present.
    """
    import os  # local import so this block does not rely on file-level imports

    columns = ['target', 'id', 'date', 'flag', 'user', 'text']

    # Only the label and the text are used below, so skip the other columns
    # to keep memory down on the full dataset.
    data = pd.read_csv(
        data_path,
        names=columns,
        usecols=['target', 'text'],
        encoding='ISO-8859-1',
    )

    # Labels equal to 4 are rewritten to 1 (presumably leaving a binary 0/1
    # target -- TODO confirm no other label values occur).
    data['target'] = data['target'].replace(4, 1)

    # Clamp the sample size: DataFrame.sample raises when asked for more rows
    # than exist.
    data = data.sample(min(sample_size, len(data)), random_state=42)

    X = data['text']
    y = data['target']

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=2
    )

    pipeline = build_pipeline()
    pipeline.fit(x_train, y_train)

    print("Train Accuracy:", accuracy_score(y_train, pipeline.predict(x_train)))
    print("Test Accuracy:", accuracy_score(y_test, pipeline.predict(x_test)))

    # The output directory is not guaranteed to exist; create it instead of
    # failing with FileNotFoundError after the training work is done.
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(model_path, 'wb') as f:
        pickle.dump(pipeline, f)


if __name__ == "__main__":
    train_model()
45
+
src/pipeline/__pycache__/build_pipeline.cpython-313.pyc ADDED
Binary file (818 Bytes). View file
 
src/pipeline/buid_pipelne.py ADDED
File without changes
src/pipeline/build_pipeline.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.pipeline import Pipeline
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+
5
+ from src.data.preprocess import clean_text
6
+
7
+
8
+
9
def build_pipeline():
    """Assemble the unfitted two-step text-classification pipeline.

    Returns:
        A ``Pipeline`` whose ``'tfidf'`` step is a ``TfidfVectorizer`` using
        ``clean_text`` as its preprocessor and whose ``'model'`` step is a
        ``LogisticRegression`` limited to 1000 iterations.
    """
    vectorizer = TfidfVectorizer(preprocessor=clean_text)
    classifier = LogisticRegression(max_iter=1000)

    steps = [
        ('tfidf', vectorizer),
        ('model', classifier),
    ]
    return Pipeline(steps)