Spaces:

Loren
/

api_search_articles

Sleeping

App Files Files Community

Loren committed on Oct 4

Commit

040fa4e

verified ·

1 Parent(s): ff865ff

Upload 5 files

Browse files

Files changed (3) hide show

app/database.py +57 -41
script/create_dataset.py +53 -0
script/create_sqlite_db.py +13 -20

app/database.py CHANGED Viewed

@@ -1,61 +1,77 @@
-import sqlite3
-from typing import List, Dict
 import os
 from huggingface_hub import hf_hub_download
-# Télécharger le fichier SQLite depuis le dataset
-# Créer un dossier temporaire pour le cache
-# Répertoire writable dans le Space
 cache_dir = "/tmp"
 os.makedirs(cache_dir, exist_ok=True)
-REPO_ID = "Loren/articles_db"  # dataset HF
-DB_NAME = 'articles.db'
-hf_token = os.environ["API_HF_TOKEN"]
-sqlite_path = hf_hub_download(
     repo_id=REPO_ID,
-    filename=DB_NAME,
     repo_type="dataset",
-    token=hf_token,
-    cache_dir=cache_dir
-)
-def get_connection(sqlite_path):
-    conn = sqlite3.connect(sqlite_path)
-    conn.row_factory = sqlite3.Row
-    return conn
 def fetch_tags() -> List[str]:
-    """Retourne tous les tags"""
-    conn = get_connection()
-    cur = conn.cursor()
-    cur.execute("SELECT tag_name FROM tags ORDER BY tag_name")
-    tags = [row["tag_name"] for row in cur.fetchall()]
-    conn.close()
-    return tags
 def fetch_articles_by_tags(tags: List[str]) -> List[Dict]:
     """
-    Retourne les articles correspondant aux tags.
     """
     if not tags:
         return []
-    conn = get_connection()
-    conn.row_factory = sqlite3.Row
-    cur = conn.cursor()
-    # Créer la liste de placeholders "?" dynamiquement
     placeholders = ",".join(["?"] * len(tags))
-    query = ("""SELECT a.article_id, a.article_title, a.article_url
-                  FROM tags t, articles a, tag_article ta
-                 WHERE ta.tag_id = t.tag_id
                    AND ta.article_id = a.article_id
-                   AND t.tag_name IN (""" + placeholders + """)"""
-    )
-    cur.execute(query, tags)
-    results = [dict(row) for row in cur.fetchall()]
-    conn.close()
-    return results

 import os
+from typing import List, Dict
+import duckdb
 from huggingface_hub import hf_hub_download
+# Initialisations
+REPO_ID = "Loren/articles_db"
 cache_dir = "/tmp"
 os.makedirs(cache_dir, exist_ok=True)
+# Téléchargement des fichiers Parquet depuis Hugging Face
+articles_parquet = hf_hub_download(
+    repo_id=REPO_ID,
+    filename="articles.parquet",
+    repo_type="dataset",
+    cache_dir=cache_dir)
+tags_parquet = hf_hub_download(
     repo_id=REPO_ID,
+    filename="tags.parquet",
     repo_type="dataset",
+    cache_dir=cache_dir)
+tag_article_parquet = hf_hub_download(
+    repo_id=REPO_ID,
+    filename="tag_article.parquet",
+    repo_type="dataset",
+    cache_dir=cache_dir)
+# Connexion DuckDB en mémoire
+con = duckdb.connect()
+# Créer des tables DuckDB directement à partir des fichiers Parquet
+con.execute(f"CREATE VIEW articles AS SELECT * FROM parquet_scan('{articles_parquet}')")
+con.execute(f"CREATE VIEW tags AS SELECT * FROM parquet_scan('{tags_parquet}')")
+con.execute(f"CREATE VIEW tag_article AS SELECT * FROM parquet_scan('{tag_article_parquet}')")
+# Fonctions d'accès aux données
 def fetch_tags() -> List[str]:
+    """
+    Récupère la liste de tous les tags disponibles dans la base de données.
+    Returns:
+        List[str]: Une liste de chaînes de caractères correspondant aux noms des tags, triés par ordre alphabétique.
+    """
+    query = "SELECT tag_name FROM tags ORDER BY tag_name"
+    result = con.execute(query).fetchall()
+    return [row[0] for row in result]
 def fetch_articles_by_tags(tags: List[str]) -> List[Dict]:
     """
+    Récupère les articles associés à un ou plusieurs tags.
+    Args:
+        tags (List[str]): Une liste de noms de tags pour filtrer les articles.
+    Returns:
+        List[Dict]: Une liste de dictionnaires, chacun représentant un article avec les clés:
+                    - 'article_id': ID de l'article
+                    - 'article_title': Titre de l'article
+                    - 'article_url': URL de l'article
+    Notes:
+        - Si la liste `tags` est vide, la fonction retourne une liste vide.
+        - Les résultats incluent uniquement les articles correspondant à au moins un des tags fournis.
     """
     if not tags:
         return []
     placeholders = ",".join(["?"] * len(tags))
+    query = f"""SELECT distinct a.article_id, a.article_title, a.article_url
+                  FROM tags t, tag_article ta, articles a
+                 WHERE t.tag_id = ta.tag_id
                    AND ta.article_id = a.article_id
+                   AND t.tag_name IN ({placeholders})
+             """
+    result = con.execute(query, tags).fetchdf()
+    return result.to_dict(orient="records")

script/create_dataset.py ADDED Viewed

	@@ -0,0 +1,53 @@

+##############################################################################################
+### Script de création des fichiers Parquet compressés à partir de la base de données SQLite
+### et upload dans un nouveau dataset HF : Loren/articles_database
+### 👉 Ces fichiers peuvent alors être utilisés par un Space Hugging Face
+##############################################################################################
+import sqlite3
+import pandas as pd
+from pathlib import Path
+from huggingface_hub import upload_file
+import os
+from dotenv import load_dotenv
+# Initialisations
+print("Initialisations ...")
+load_dotenv()
+HF_TOKEN = os.environ["API_HF_TOKEN"]
+DATA_DIR = Path("../../Data")        # dossier local contenant articles.db
+SQLITE_FILE = DATA_DIR / "articles.db"
+REPO_ID = "Loren/articles_database"  # nouveau dataset HF
+# Connexion SQLite
+conn = sqlite3.connect(SQLITE_FILE)
+# Extraction des tables et conversion en Parquet compressé
+print("Création des fichiers Parquet compressés ...")
+tables = ["articles", "tags", "tag_article"]
+parquet_dir = DATA_DIR / "parquet_tables"
+parquet_dir.mkdir(exist_ok=True)
+parquet_files = []
+for table in tables:
+    df = pd.read_sql_query(f"SELECT * FROM {table}", conn)
+    parquet_path = parquet_dir / f"{table}.parquet"
+    df.to_parquet(parquet_path, engine="pyarrow", index=False, compression="snappy")
+    parquet_files.append(parquet_path)
+conn.close()
+# Upload des fichiers Parquet vers HF
+print("Upload des fichiers Parquet vers HF ...")
+for parquet_file in parquet_files:
+    print(f"Uploading {parquet_file.name} ...")
+    upload_file(
+        path_or_fileobj=parquet_file,
+        path_in_repo=parquet_file.name,
+        repo_id=REPO_ID,
+        repo_type="dataset",
+        token=HF_TOKEN
+    )
+print("Upload terminé ✅")

script/create_sqlite_db.py CHANGED Viewed

@@ -6,7 +6,7 @@
 ### df.to_parquet("medium_articles.parquet", engine="pyarrow", compression="snappy")
 ###
 ### Le fichier a été uploadé dans un dataset HF : Loren/articles_db
-### Le script crée la base SQLite articles.db et l'upload dans le dataset
 ##############################################################################################
 import sqlite3
@@ -80,14 +80,10 @@ print("Extraction des tags en une liste unique ...")
 df['list_tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
 # Extraire tous les tags uniques
 all_tags = list(set(itertools.chain.from_iterable(df['list_tags'])))
-# Exclusion de certains tags pour une question de volume de la database (il faut <= 1 Go)
-list_exclude = ['Politics', 'Startup', 'Covid 19', 'JavaScript', 'Business', 'Blockchain',
-                'Cryptocurrency', 'Bitcoin']
-list_tags = [t for t in all_tags if t not in list_exclude]
 # Insertion des tags dans la table
 print("Insertion des tags dans la table ...")
-cur.executemany("INSERT INTO tags (tag_name) VALUES (?)", [(tag,) for tag in list_tags])
 # Récupération des correspondances tag_name -> tag_id
 print("Récupération des correspondances tag_name -> tag_id ...")
@@ -108,23 +104,20 @@ for _, row in df.iterrows():
         except Exception:
             date_value = None
     # Association aux tags
-    ind_ok = True
     for tag_name in row['list_tags']:
-        try:
-            tag_id = dict_tag_map[tag_name]
-            cur.execute("INSERT INTO tag_article (article_id, tag_id) VALUES (?, ?)",
-                        (article_id, tag_id))
-        except:
-            ind_ok = False
-    if ind_ok:
-    # Insertion dans la table Articles
-        cur.execute("""
-            INSERT INTO articles (article_id, article_title, article_text, article_url, article_authors, article_date)
-            VALUES (?, ?, ?, ?, ?, ?)""",
-                (article_id, row["title"], row["text"], row["url"], row["authors"], date_value))
 conn.commit()
 conn.close()

 ### df.to_parquet("medium_articles.parquet", engine="pyarrow", compression="snappy")
 ###
 ### Le fichier a été uploadé dans un dataset HF : Loren/articles_db
+### Le script crée la base SQLite articles.db et l'upload dans le dataset HF
 ##############################################################################################
 import sqlite3
 df['list_tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
 # Extraire tous les tags uniques
 all_tags = list(set(itertools.chain.from_iterable(df['list_tags'])))
 # Insertion des tags dans la table
 print("Insertion des tags dans la table ...")
+cur.executemany("INSERT INTO tags (tag_name) VALUES (?)", [(tag,) for tag in all_tags])
 # Récupération des correspondances tag_name -> tag_id
 print("Récupération des correspondances tag_name -> tag_id ...")
         except Exception:
             date_value = None
+    # Insertion dans la table Articles
+    cur.execute("""
+        INSERT INTO articles (article_id, article_title, article_text, article_url, article_authors, article_date)
+        VALUES (?, ?, ?, ?, ?, ?)""",
+            (article_id, row["title"], row["text"], row["url"], row["authors"], date_value))
     # Association aux tags
     for tag_name in row['list_tags']:
+        tag_id = dict_tag_map[tag_name]
+        cur.execute("INSERT INTO tag_article (article_id, tag_id) VALUES (?, ?)",
+                    (article_id, tag_id))
+# Commit et fermeture de la connexion
+print("Commit et fermeture de la connexion ...")
 conn.commit()
 conn.close()