Spaces:

Loren
/

api_search_articles

Sleeping

App Files Files Community

Loren committed on Oct 3

Commit

dd0f502

verified ·

1 Parent(s): ee19ac7

Upload 6 files

Browse files

Files changed (3) hide show

app/database.py +61 -61
requirements.txt +5 -5
script/create_sqlite_db.py +40 -15

app/database.py CHANGED Viewed

@@ -1,61 +1,61 @@
-import sqlite3
-from typing import List, Dict
-import os
-from huggingface_hub import hf_hub_download
-# Télécharger le fichier SQLite depuis le dataset
-# Créer un dossier temporaire pour le cache
-# Répertoire writable dans le Space
-cache_dir = "/tmp"
-os.makedirs(cache_dir, exist_ok=True)
-REPO_ID = "Loren/articles_db"  # dataset HF
-DB_NAME = 'articles.db'
-hf_token = os.environ["API_HF_TOKEN"]
-sqlite_path = hf_hub_download(
-    repo_id=REPO_ID,
-    filename=DB_NAME,
-    repo_type="dataset",
-    token=hf_token,
-    cache_dir=cache_dir
-)
-def get_connection(sqlite_path):
-    conn = sqlite3.connect(sqlite_path)
-    conn.row_factory = sqlite3.Row
-    return conn
-def fetch_tags() -> List[str]:
-    """Retourne tous les tags"""
-    conn = get_connection()
-    cur = conn.cursor()
-    cur.execute("SELECT tag_name FROM tags ORDER BY tag_name")
-    tags = [row["tag_name"] for row in cur.fetchall()]
-    conn.close()
-    return tags
-def fetch_articles_by_tags(tags: List[str]) -> List[Dict]:
-    """
-    Retourne les articles correspondant aux tags.
-    """
-    if not tags:
-        return []
-    conn = get_connection()
-    conn.row_factory = sqlite3.Row
-    cur = conn.cursor()
-    # Créer la liste de placeholders "?" dynamiquement
-    placeholders = ",".join(["?"] * len(tags))
-    query = ("""SELECT a.article_id, a.article_title, a.article_url
-                  FROM tags t, articles a, tag_article ta
-                 WHERE ta.tag_id = t.tag_id
-                   AND ta.article_id = a.article_id
-                   AND t.tag_name IN (""" + placeholders + """)"""
-    )
-    cur.execute(query, tags)
-    results = [dict(row) for row in cur.fetchall()]
-    conn.close()
-    return results

+import sqlite3
+from typing import List, Dict
+import os
+from huggingface_hub import hf_hub_download
+# Télécharger le fichier SQLite depuis le dataset
+# Créer un dossier temporaire pour le cache
+# Répertoire writable dans le Space
+cache_dir = "/tmp"
+os.makedirs(cache_dir, exist_ok=True)
+REPO_ID = "Loren/articles_db"  # dataset HF
+DB_NAME = 'articles.db'
+hf_token = os.environ["API_HF_TOKEN"]
+sqlite_path = hf_hub_download(
+    repo_id=REPO_ID,
+    filename=DB_NAME,
+    repo_type="dataset",
+    token=hf_token,
+    cache_dir=cache_dir
+)
+def get_connection(sqlite_path):
+    conn = sqlite3.connect(sqlite_path)
+    conn.row_factory = sqlite3.Row
+    return conn
+def fetch_tags() -> List[str]:
+    """Retourne tous les tags"""
+    conn = get_connection()
+    cur = conn.cursor()
+    cur.execute("SELECT tag_name FROM tags ORDER BY tag_name")
+    tags = [row["tag_name"] for row in cur.fetchall()]
+    conn.close()
+    return tags
+def fetch_articles_by_tags(tags: List[str]) -> List[Dict]:
+    """
+    Retourne les articles correspondant aux tags.
+    """
+    if not tags:
+        return []
+    conn = get_connection()
+    conn.row_factory = sqlite3.Row
+    cur = conn.cursor()
+    # Créer la liste de placeholders "?" dynamiquement
+    placeholders = ",".join(["?"] * len(tags))
+    query = ("""SELECT a.article_id, a.article_title, a.article_url
+                  FROM tags t, articles a, tag_article ta
+                 WHERE ta.tag_id = t.tag_id
+                   AND ta.article_id = a.article_id
+                   AND t.tag_name IN (""" + placeholders + """)"""
+    )
+    cur.execute(query, tags)
+    results = [dict(row) for row in cur.fetchall()]
+    conn.close()
+    return results

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-fastapi==0.109.2
-uvicorn[standard]==0.23.2
-pandas==2.1.1
-pyarrow==12.0.1
-huggingface_hub

+fastapi==0.109.2
+uvicorn[standard]==0.23.2
+pandas==2.1.1
+pyarrow==12.0.1
+huggingface_hub==0.18.1

script/create_sqlite_db.py CHANGED Viewed

@@ -1,6 +1,18 @@
 import sqlite3
 import pandas as pd
 import os
 import itertools
 import ast
 import uuid
@@ -9,11 +21,15 @@ from pathlib import Path
 # Initialisations
 print("Initialisations ...")
-hf_token = os.environ["API_HF_TOKEN"]
-DATA_DIR = Path("../data")   # dossier parent du script
-parquet_path = hf_hub_download(repo_id="Loren/articles_data",
-                               filename="mon_fichier.parquet")
 REPO_ID = "Loren/articles_db"  # dataset HF
 DB_NAME = 'articles.db'
 SQLITE_FILE = DATA_DIR / DB_NAME
 # Créer le dossier data s'il n'existe pas
@@ -64,10 +80,14 @@ print("Extraction des tags en une liste unique ...")
 df['list_tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
 # Extraire tous les tags uniques
 all_tags = list(set(itertools.chain.from_iterable(df['list_tags'])))
 # Insertion des tags dans la table
 print("Insertion des tags dans la table ...")
-cur.executemany("INSERT INTO tags (tag_name) VALUES (?)", [(tag,) for tag in all_tags])
 # Récupération des correspondances tag_name -> tag_id
 print("Récupération des correspondances tag_name -> tag_id ...")
@@ -88,17 +108,22 @@ for _, row in df.iterrows():
         except Exception:
             date_value = None
-    # Insertion dans la table Articles
-    cur.execute("""
-        INSERT INTO articles (article_id, article_title, article_text, article_url, article_authors, article_date)
-        VALUES (?, ?, ?, ?, ?, ?)""",
-               (article_id, row["title"], row["text"], row["url"], row["authors"], date_value))
     # Association aux tags
     for tag_name in row['list_tags']:
-        tag_id = dict_tag_map[tag_name]
-        cur.execute("INSERT INTO tag_article (article_id, tag_id) VALUES (?, ?)",
-                    (article_id, tag_id))
 conn.commit()
 conn.close()
@@ -110,7 +135,7 @@ upload_file(
     path_in_repo=DB_NAME,
     repo_id=REPO_ID,
     repo_type="dataset",
-    token=hf_token
 )
 print("Traitement terminé.")

+##############################################################################################
+### Script de création de la base de données articles à partir du fichier parquet,
+### correspondant au jeu d'essai : https://www.kaggle.com/code/fabiochiusano/medium-articles-simple-data-analysis
+### Téléchargement du csv puis conversion en Parquet avec compression snappy :
+### df = pd.read_csv("medium_articles.csv")
+### df.to_parquet("medium_articles.parquet", engine="pyarrow", compression="snappy")
+###
+### Le fichier a été uploadé dans un dataset HF : Loren/articles_db
+### Le script crée la base SQLite articles.db et l'upload dans le dataset
+##############################################################################################
 import sqlite3
 import pandas as pd
 import os
+from dotenv import load_dotenv
 import itertools
 import ast
 import uuid
 # Initialisations
 print("Initialisations ...")
+load_dotenv()
+HF_TOKEN = os.getenv('API_HF_TOKEN')
+DATA_DIR = Path("../../Data")   # dossier parent du script
 REPO_ID = "Loren/articles_db"  # dataset HF
+parquet_path = hf_hub_download(repo_id=REPO_ID,
+                               filename="medium_articles.parquet",
+                               repo_type="dataset")
 DB_NAME = 'articles.db'
 SQLITE_FILE = DATA_DIR / DB_NAME
 # Créer le dossier data s'il n'existe pas
 df['list_tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
 # Extraire tous les tags uniques
 all_tags = list(set(itertools.chain.from_iterable(df['list_tags'])))
+# Exclusion de certains tags pour une question de volume de la base de données (il faut <= 1 Go)
+list_exclude = ['Politics', 'Startup', 'Covid 19', 'JavaScript', 'Business', 'Blockchain',
+                'Cryptocurrency', 'Bitcoin']
+list_tags = [t for t in all_tags if t not in list_exclude]
 # Insertion des tags dans la table
 print("Insertion des tags dans la table ...")
+cur.executemany("INSERT INTO tags (tag_name) VALUES (?)", [(tag,) for tag in list_tags])
 # Récupération des correspondances tag_name -> tag_id
 print("Récupération des correspondances tag_name -> tag_id ...")
         except Exception:
             date_value = None
     # Association aux tags
+    ind_ok = True
     for tag_name in row['list_tags']:
+        try:
+            tag_id = dict_tag_map[tag_name]
+            cur.execute("INSERT INTO tag_article (article_id, tag_id) VALUES (?, ?)",
+                        (article_id, tag_id))
+        except:
+            ind_ok = False
+    if ind_ok:
+    # Insertion dans la table Articles
+        cur.execute("""
+            INSERT INTO articles (article_id, article_title, article_text, article_url, article_authors, article_date)
+            VALUES (?, ?, ?, ?, ?, ?)""",
+                (article_id, row["title"], row["text"], row["url"], row["authors"], date_value))
 conn.commit()
 conn.close()
     path_in_repo=DB_NAME,
     repo_id=REPO_ID,
     repo_type="dataset",
+    token=HF_TOKEN
 )
 print("Traitement terminé.")