##############################################################################################
### Script to create compressed Parquet files from the SQLite database
### and upload them to a new HF dataset: Loren/articles_database
### 👉 They can then be used by a Hugging Face Space
##############################################################################################
import os
import sqlite3
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import upload_file

# Initialization
print("Initializing ...")
load_dotenv()
HF_TOKEN = os.environ["API_HF_TOKEN"]
DATA_DIR = Path("../../Data")  # local folder containing articles.db
SQLITE_FILE = DATA_DIR / "articles.db"
REPO_ID = "Loren/articles_database"  # new HF dataset
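
# Make sure the target dataset repo exists before uploading (no-op if it
# already does). A small safety guard, assuming API_HF_TOKEN has write access.
from huggingface_hub import create_repo

create_repo(REPO_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)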

# SQLite connection
conn = sqlite3.connect(SQLITE_FILE)

# Extract each table and convert it to a compressed Parquet file
print("Creating compressed Parquet files ...")
tables = ["articles", "tags", "tag_article"]
parquet_dir = DATA_DIR / "parquet_tables"
parquet_dir.mkdir(exist_ok=True)
parquet_files = []
for table in tables:
    df = pd.read_sql_query(f"SELECT * FROM {table}", conn)
    parquet_path = parquet_dir / f"{table}.parquet"
    df.to_parquet(parquet_path, engine="pyarrow", index=False, compression="snappy")
    parquet_files.append(parquet_path)
conn.close()
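
# Note on the compression choice: "snappy" favors read/write speed over file
# size; pyarrow also supports e.g. compression="zstd", which usually yields
# smaller files at a modest CPU cost if storage matters more than speed.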

# Upload the Parquet files to HF
print("Uploading Parquet files to HF ...")
for parquet_file in parquet_files:
    print(f"Uploading {parquet_file.name} ...")
    upload_file(
        path_or_fileobj=parquet_file,
        path_in_repo=parquet_file.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN
    )
| print("Upload terminé ✅") | |