Upload create_dataset.py
script/create_dataset.py  (CHANGED, +18 -13)
@@ -31,20 +31,25 @@ from collections import Counter
 print("Initialisations ...")
 load_dotenv()
 HF_TOKEN = os.getenv('API_HF_TOKEN')
+
+# Constantes
+MIN_COUNT = 5                          # nombre minimum d'occurrences pour qu'un tag soit conservé
 DATA_DIR = Path("../../Data")          # dossier parent du script
-
+REPO_ID_DB = "Loren/articles_db"       # dataset HF
+REPO_ID = "Loren/articles_database"    # dataset HF
+DB_NAME = 'articles.db'
+SQLITE_FILE = DATA_DIR / DB_NAME
+LIST_TABLES = ["articles", "tags", "tag_article"]
+PARQUET_DIR = DATA_DIR / "parquet_tables"
 
-
+# Chargement des données
+parquet_path = hf_hub_download(repo_id=REPO_ID_DB,
                                filename="medium_articles.parquet",
                                repo_type="dataset")
 
-
-SQLITE_FILE = DATA_DIR / DB_NAME
-# Créer le dossier data s'il n'existe pas
+# Créer les dossiers s'ils n'existent pas
 DATA_DIR.mkdir(exist_ok=True)
-
-parquet_dir = DATA_DIR / "parquet_tables"
-parquet_dir.mkdir(exist_ok=True)
+PARQUET_DIR.mkdir(exist_ok=True)
 
 # Chargement des données
 print("Chargement des données ...")
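The new constants block centralises the repo ids, file names and the tag threshold, and the source parquet is now fetched through hf_hub_download. A minimal sketch of that download step plus the pandas load that presumably follows (the read_parquet call is an assumption, not part of this diff):

import pandas as pd
from huggingface_hub import hf_hub_download

# Fetch the source file from the HF dataset; huggingface_hub caches it locally
parquet_path = hf_hub_download(repo_id="Loren/articles_db",
                               filename="medium_articles.parquet",
                               repo_type="dataset")

# Assumption: the script then loads it into a DataFrame before filling the SQLite tables
df = pd.read_parquet(parquet_path, engine="pyarrow")
print(df.shape)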
@@ -94,7 +99,7 @@ all_tags = list(itertools.chain.from_iterable(df['list_tags']))
 # Comptage du nombre d'occurrences de chaque tag
 tag_counts = Counter(all_tags)
 # On ne va conserver que les tags avec au moins 100 occurrences
-list_tags = [tag for tag, count in tag_counts.items() if count >=
+list_tags = [tag for tag, count in tag_counts.items() if count >= MIN_COUNT]
 
 # Insertion des tags dans la table
 print("Insertion des tags dans la table ...")
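The tag filter now uses the MIN_COUNT constant instead of a hardcoded threshold (note that the comment above it still says 100 occurrences while MIN_COUNT is 5). A self-contained sketch of the same Counter-based filtering on toy data:

import itertools
from collections import Counter

# Toy stand-in for df['list_tags']: one list of tags per article
list_tags_per_article = [["python", "data"], ["python", "ml"], ["data"]]
MIN_COUNT = 2  # illustrative value for the toy data

all_tags = list(itertools.chain.from_iterable(list_tags_per_article))
tag_counts = Counter(all_tags)
# Keep only tags seen at least MIN_COUNT times
list_tags = [tag for tag, count in tag_counts.items() if count >= MIN_COUNT]
print(list_tags)  # ['python', 'data']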
@@ -135,7 +140,7 @@ for _, row in df.iterrows():
         pass
 
 print("-> ", len(list_tags), " tags")
-cur.execute("SELECT COUNT(*) FROM
+cur.execute("SELECT COUNT(*) FROM tag_article")
 nb_lignes = cur.fetchone()[0]
 print("-> ", nb_lignes, " associations articles <-> tags")
 print("-> ", len(df), " articles")
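The restored COUNT query targets the tag_article association table. A tiny sqlite3 sketch of the same counting pattern on an in-memory database (the two-column schema below is assumed for illustration only):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
# Assumed minimal schema for the article <-> tag association table
cur.execute("CREATE TABLE tag_article (article_id INTEGER, tag_id INTEGER)")
cur.executemany("INSERT INTO tag_article VALUES (?, ?)", [(10, 1), (11, 1), (10, 2)])

cur.execute("SELECT COUNT(*) FROM tag_article")
nb_lignes = cur.fetchone()[0]
print(nb_lignes)  # 3
conn.close()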
@@ -149,7 +154,7 @@ print("Upload base Sqlite dans le dataset hugging face ...")
 upload_file(
     path_or_fileobj=SQLITE_FILE,
     path_in_repo=DB_NAME,
-    repo_id=
+    repo_id=REPO_ID_DB,
     repo_type="dataset",
     token=HF_TOKEN
 )
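The repo_id for the SQLite upload now comes from the REPO_ID_DB constant. On the consumer side, the uploaded articles.db can be pulled back with hf_hub_download and opened with sqlite3; a hedged sketch (the SELECT is illustrative, the table name is taken from LIST_TABLES):

import sqlite3
from huggingface_hub import hf_hub_download

# Fetch the SQLite file that the script uploads to the dataset repo
db_path = hf_hub_download(repo_id="Loren/articles_db",
                          filename="articles.db",
                          repo_type="dataset")

conn = sqlite3.connect(db_path)
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM articles")  # illustrative query
print(cur.fetchone()[0])
conn.close()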
@@ -157,9 +162,9 @@ upload_file(
 # Création des fichiers Parquet compressés
 print("Création des fichiers Parquet compressés ...")
 parquet_files = []
-for table in
+for table in LIST_TABLES:
     df = pd.read_sql_query(f"SELECT * FROM {table}", conn)
-    parquet_path =
+    parquet_path = PARQUET_DIR / f"{table}.parquet"
     df.to_parquet(parquet_path, engine="pyarrow", index=False, compression="snappy")
     parquet_files.append(parquet_path)
 
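The export loop now iterates over LIST_TABLES and writes one snappy-compressed Parquet file per table into PARQUET_DIR, collecting the paths in parquet_files. The upload of those files is not shown in this diff; a sketch of how they could be pushed with upload_file, assuming the same REPO_ID target and API_HF_TOKEN environment variable as the rest of the script:

import os
from pathlib import Path
from huggingface_hub import upload_file

PARQUET_DIR = Path("../../Data/parquet_tables")

# Assumption: push every generated Parquet file to the REPO_ID dataset
for parquet_path in sorted(PARQUET_DIR.glob("*.parquet")):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"parquet_tables/{parquet_path.name}",
        repo_id="Loren/articles_database",
        repo_type="dataset",
        token=os.getenv("API_HF_TOKEN"),
    )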