JonnyBP committed on
Commit
f46289d
·
1 Parent(s): 3438ebf

feat: add preprocessing. #3

Browse files
.gitignore ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ *.egg
11
+
12
+ # Entornos virtuales
13
+ .env
14
+ .venv
15
+ env/
16
+ venv/
17
+ ENV/
18
+
19
+ # Jupyter
20
+ .ipynb_checkpoints/
21
+ *.ipynb_checkpoints
22
+
23
+ # Datos (no subir datos al repo)
24
+ data/raw/*
25
+ data/processed/*
26
+ !data/raw/.gitkeep
27
+ !data/processed/.gitkeep
28
+
29
+ # Modelos entrenados
30
+ models/*
31
+ !models/.gitkeep
32
+
33
+ # Logs
34
+ logs/*
35
+ !logs/.gitkeep
36
+
37
+ # Variables de entorno
38
+ .env
39
+ *.env
40
+ secrets.yaml
41
+
42
+ # IDEs
43
+ .vscode/settings.json
44
+ .idea/
45
+ *.swp
46
+ *.swo
47
+
48
+ # OS
49
+ .DS_Store
50
+ Thumbs.db
51
+
52
+ # MLflow
53
+ mlruns/
54
+ mlartifacts/
55
+
56
+ #jony
57
+ 02_preprocessing_v2.ipynb
configs/features.yaml CHANGED
@@ -12,11 +12,11 @@ preprocessing:
12
  vectorization:
13
  method: tfidf # tfidf | bow | both
14
  tfidf:
15
- max_features: 10000
16
  ngram_range: [1, 2]
17
  sublinear_tf: true
18
- min_df: 2
19
  bow:
20
- max_features: 10000
21
  ngram_range: [1, 1]
22
- min_df: 2
 
12
  vectorization:
13
  method: tfidf # tfidf | bow | both
14
  tfidf:
15
+ max_features: 5000
16
  ngram_range: [1, 2]
17
  sublinear_tf: true
18
+ min_df: 3
19
  bow:
20
+ max_features: 5000
21
  ngram_range: [1, 1]
22
+ min_df: 3
configs/pipeline.yaml CHANGED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pipeline:
2
+ random_state: 42
3
+ test_size: 0.2
4
+ cv_folds: 5
5
+
6
+ data:
7
+ raw_path: data/raw/youtoxic_english_1000.csv
8
+ processed_path: data/processed/v1/comments_with_stats.csv
9
+ target_binary: IsToxic
10
+ target_multilabel:
11
+ - IsAbusive
12
+ - IsProvocative
13
+ - IsHatespeech
14
+ - IsRacist
15
+ - IsObscene
16
+ text_column: Text
17
+ id_column: CommentId
18
+
19
+ mode: binary # binary | multilabel
notebooks/01_eda_v2.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
reports/v2/07_tokens_comparativa.png ADDED

Git LFS Details

  • SHA256: ec0a6ea1ef17dca1a15d8ccd4146d6db93992e18116d74251e89f146c47d6b5d
  • Pointer size: 130 Bytes
  • Size of remote file: 66.1 kB