LorenzoBioinfo committed on
Commit
4e05a46
·
1 Parent(s): 66028cc

Add loading dataset

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. src/app.py +8 -12
  3. src/data_preparation.py +30 -13
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  env
2
  data/raw/tweet_eval_sentiment
3
- data/raw/youtube-comment-sentiment
 
 
1
  env
2
  data/raw/tweet_eval_sentiment
3
+ data/raw/youtube-comment-sentiment
4
+ data/processed
src/app.py CHANGED
@@ -7,6 +7,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
  from datasets import load_dataset, load_from_disk
8
  import torch
9
  import random
 
10
 
11
  # Caricamento del modello e dei dati se già scaricati
12
  MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
@@ -20,23 +21,18 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
20
  labels = ["negative", "neutral", "positive"]
21
 
22
 
23
- if not os.path.exists(TWEET_PROCESSED_PATH):
24
- tweet_eval = load_dataset("tweet_eval", "sentiment")
25
- raise FileNotFoundError(
26
- f"Dati non trovati in {TWEET_PROCESSED_PATH}. "
27
- "Esegui src/data_preparation.py per crearlo."
28
- )
29
 
 
 
 
 
30
  tweet_eval = load_from_disk(TWEET_PROCESSED_PATH)
31
 
32
 
 
33
  if not os.path.exists(YT_PROCESSED_PATH):
34
- youtube_ds = load_dataset("AmaanP314/youtube-comment-sentiment")
35
- raise FileNotFoundError(
36
- f"Dati non trovati in {YT_PROCESSED_PATH}. "
37
- "Esegui src/data_preparation.py per crearlo."
38
- )
39
-
40
  youtube_ds = load_from_disk(YT_PROCESSED_PATH)
41
 
42
  app = FastAPI(
 
7
  from datasets import load_dataset, load_from_disk
8
  import torch
9
  import random
10
+ import subprocess
11
 
12
  # Caricamento del modello e dei dati se già scaricati
13
  MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
 
21
  labels = ["negative", "neutral", "positive"]
22
 
23
 
 
 
 
 
 
 
24
 
25
+ # TWEET EVAL
26
+ if not os.path.exists(TWEET_PROCESSED_PATH):
27
+ print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
28
+ subprocess.run(["python", "src/data_preparation.py", "tweet_eval"], check=True)
29
  tweet_eval = load_from_disk(TWEET_PROCESSED_PATH)
30
 
31
 
32
+ # YOUTUBE COMMENTS
33
  if not os.path.exists(YT_PROCESSED_PATH):
34
+ print(f" Dataset YouTube non trovato in {YT_PROCESSED_PATH}. Lo genero...")
35
+ subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
 
 
 
 
36
  youtube_ds = load_from_disk(YT_PROCESSED_PATH)
37
 
38
  app = FastAPI(
src/data_preparation.py CHANGED
@@ -1,5 +1,6 @@
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer
 
3
  import re
4
  import os
5
 
@@ -33,19 +34,6 @@ def map_label(label):
33
  return label
34
 
35
 
36
- # Download tweet_eval
37
- tweet_eval = load_dataset("tweet_eval", "sentiment")
38
- # Download youtub comment dataset
39
- youtube = load_dataset("AmaanP314/youtube-comment-sentiment")
40
-
41
-
42
- tweet_eval = tweet_eval.map(lambda x: {"text": clean_text(x["text"])})
43
- youtube = youtube.map(lambda x: {"text": clean_text(x["CommentText"])})
44
-
45
-
46
- youtube = youtube.map(lambda x: {"label": map_label(x["Sentiment"])})
47
-
48
-
49
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
50
 
51
  def tokenize_function(examples):
@@ -61,3 +49,32 @@ youtube_tokenized = youtube.map(tokenize_function, batched=True)
61
 
62
  tweet_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
63
  youtube_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "youtube_tokenized"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer
3
+ import argparse
4
  import re
5
  import os
6
 
 
34
  return label
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
38
 
39
  def tokenize_function(examples):
 
49
 
50
  tweet_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
51
  youtube_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "youtube_tokenized"))
52
+
53
+ def prepare_tweet_eval(tokenizer, output_path):
54
+ print("Scarico e preparo il dataset Tweet Eval...")
55
+ ds = load_dataset("tweet_eval", "sentiment")
56
+ ds=ds.map(lambda x: {"text": clean_text(x["text"])})
57
+ ds=ds.map(tokenize_function, batched=True)
58
+ ds.save_to_disk(output_path)
59
+ print(f"Dataset Tweet Eval salvato in {output_path}")
60
+
61
+ def prepare_youtube(tokenizer, output_path):
62
+ print("Scarico e preparo il dataset YouTube Comments...")
63
+ ds = load_dataset("AmaanP314/youtube-comment-sentiment")
64
+ ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
65
+ ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
66
+ ds.save_to_disk(output_path)
67
+ print(f"Dataset YouTube salvato in {output_path}")
68
+
69
+
70
+ if __name__ == "__main__":
71
+ parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
72
+ parser.add_argument("dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare.")
73
+ args = parser.parse_args()
74
+
75
+ tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
76
+
77
+ if args.dataset == "tweet_eval":
78
+ prepare_tweet_eval(tokenizer, "data/processed/tweet_eval_tokenized")
79
+ elif args.dataset == "youtube":
80
+ prepare_youtube(tokenizer, "data/processed/youtube_comments")