Spaces:
Sleeping
Sleeping
| import json | |
| from pathlib import Path | |
| from functools import lru_cache | |
| from typing import Mapping, Union, List | |
| import re | |
| import pandas as pd | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| # --------------------------------------------------------------------- | |
| # Paths + device | |
| # --------------------------------------------------------------------- | |
# Directory three levels above this file (parents[2] of the resolved path).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# "config" directory under PROJECT_ROOT; the JSON model report is read from it.
MODEL_DIR = PROJECT_ROOT / "config"
# Former local directory of the fine-tuned execution model (kept for reference):
# EXEC_MODEL_DIR = MODEL_DIR / "transformer_execution_generator_v3"
# Hugging Face identifiers used to load the tokenizer and the model.
MODEL_REPO = "AIppyDev/transformer_execution_generator_v3"
MODEL_SUBFOLDER = "transformer_execution_generator_v3"  # name of the folder inside the repo
# JSON report consumed by get_dl_execution_model_report_components().
REPORT_PATH = MODEL_DIR / "execution_generator_model_report.json"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Maximum number of generated tokens (equivalent to MAX_LENGTH in the notebook).
EXEC_MAX_NEW_TOKENS = 180
| # --------------------------------------------------------------------- | |
| # Construction du prompt pour un programme | |
| # --------------------------------------------------------------------- | |
def build_execution_prompt(row: Union[pd.Series, Mapping, None]) -> str:
    """
    Build the execution prompt for one program row.

    Format:
        The {exercise_name} exercise for "{target_muscles}" with "{equipment}" on "{difficulty}" level:

    Args:
        row: A pandas Series or mapping exposing ``get``. The keys read are
            ``exercise_name``, ``target_muscles``, ``equipment`` and
            ``difficulty``.

    Returns:
        The formatted prompt, or an empty string when ``row`` is None.
        A field that is absent, NA/NaN, or blank after stripping is replaced
        by its default wording. List-like values (e.g. several target
        muscles) are joined with ", ".
    """
    if row is None:
        return ""

    getter = getattr(row, "get", None)

    def is_missing(value) -> bool:
        # pd.isna() returns an array for list-like input, which cannot be
        # used in a boolean context; only scalars are tested.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def get_val(key: str, default: str = "") -> str:
        val = getter(key, default) if callable(getter) else default
        if pd.api.types.is_list_like(val) and not isinstance(val, Mapping):
            items = [str(item).strip() for item in val if not is_missing(item)]
            text = ", ".join(item for item in items if item)
        elif is_missing(val):
            return default
        else:
            text = str(val).strip()
        # A blank value carries no information: use the default wording.
        return text or default

    name = get_val("exercise_name", "this exercise")
    muscle = get_val("target_muscles", "the target muscles")
    equipment = get_val("equipment", "bodyweight")
    difficulty = get_val("difficulty", "General")

    return (
        f'The {name} exercise for '
        f'"{muscle}" with '
        f'"{equipment}" on '
        f'"{difficulty}" level:'
    )
| # --------------------------------------------------------------------- | |
| # Utils de post-traitement | |
| # --------------------------------------------------------------------- | |
def strip_prompt_from_generations(prompts: List[str], generated_outputs: List[str]) -> List[str]:
    """
    Remove the prompt from the start of each generation when the model echoed it.

    Prompts and generations are paired positionally. Each generation is
    stripped of surrounding whitespace first; if it then begins with its
    prompt, the prompt is cut off and the remainder is stripped again.
    """

    def without_prompt(prompt: str, generation: str) -> str:
        body = generation.strip()
        if not body.startswith(prompt):
            return body
        return body[len(prompt):].strip()

    return [
        without_prompt(prompt, generation)
        for prompt, generation in zip(prompts, generated_outputs)
    ]
def trim_to_last_full_sentence(text: str) -> str:
    """
    Cut the text after its last complete sentence.

    Everything up to and including the last '.', '!' or '?' is kept.
    When none of them occurs, the stripped text is returned as is.
    """
    stripped = text.strip()
    # Rightmost position of any sentence terminator; -1 when there is none.
    cut = max(stripped.rfind(mark) for mark in ".!?")
    if cut == -1:
        return stripped
    return stripped[: cut + 1].strip()
def keep_first_n_sentences(text: str, n: int = 2) -> str:
    """
    Keep only the first ``n`` sentences of ``text``.

    Sentences are split on '.', '!' and '?'. A terminator sitting between
    two digits is treated as a decimal point ("2.5 seconds") and does not
    end a sentence. Trailing text without a terminator is dropped, and the
    kept sentences are joined with single spaces.

    Args:
        text: Text to shorten.
        n: Maximum number of sentences to keep. The first sentence found is
            always kept, even when ``n`` is smaller than 1.

    Returns:
        The kept sentences, or the stripped input when no terminated
        sentence is found.
    """
    # The capturing group makes re.split return [body, punct, body, punct, ..., tail].
    parts = re.split(r"((?<!\d)[.!?]|[.!?](?!\d))", text)
    chunks = []
    for body, punct in zip(parts[0::2], parts[1::2]):
        body = body.strip()
        if not body:
            # Empty body: repeated punctuation such as "..." or "?!".
            continue
        chunks.append(body + punct)
        if len(chunks) >= n:
            break
    return " ".join(chunks).strip() if chunks else text.strip()
def dedupe_muscle_list(text: str) -> str:
    """
    Remove repeated items from enumerations such as
    'glutes, hamstrings, and hamstrings'.

    Each run of letters, spaces, commas and 'and' is split into items on
    commas and the word 'and'. When an item occurs more than once
    (case-insensitively), the run is rebuilt from the unique items in their
    original order: 'a', 'a and b', or 'a, b, and c'.

    Runs without duplicates are returned unchanged, and the leading and
    trailing whitespace of a rewritten run is preserved, so spacing around
    digits and punctuation is not lost.

    Args:
        text: Sentence or paragraph to clean.

    Returns:
        The text with duplicated list items removed.
    """
    # Captures a comma / 'and' separated run,
    # e.g. "glutes, hamstrings, and hamstrings".
    pattern = r"([A-Za-z ]+(?:,\s*[A-Za-z ]+)*(?:\s+and\s+[A-Za-z ]+)?)"

    def process_match(match):
        segment = match.group(0)
        parts = [p.strip() for p in re.split(r",|\band\b", segment) if p.strip()]

        # Order-preserving, case-insensitive de-duplication.
        seen = set()
        unique = []
        for part in parts:
            key = part.lower()
            if key not in seen:
                seen.add(key)
                unique.append(part)

        # Nothing repeated (this also covers whitespace-only runs):
        # leave the original text alone.
        if len(unique) == len(parts):
            return segment

        if len(unique) == 1:
            rebuilt = unique[0]
        elif len(unique) == 2:
            rebuilt = f"{unique[0]} and {unique[1]}"
        else:
            rebuilt = ", ".join(unique[:-1]) + f", and {unique[-1]}"

        # Keep the run's outer whitespace so neighbouring tokens stay separated.
        lead = segment[: len(segment) - len(segment.lstrip())]
        trail = segment[len(segment.rstrip()):]
        return lead + rebuilt + trail

    return re.sub(pattern, process_match, text)
def clean_execution_text(text: str) -> str:
    """
    Final cosmetic clean-up of a generated execution text.

    Non-string input is returned untouched. For strings, the substitutions
    below are applied in order and the result is stripped.
    """
    if not isinstance(text, str):
        return text

    # (pattern, replacement) pairs, applied top to bottom.
    rules = (
        # 1) Insert a space after a period glued to an uppercase letter.
        (r'(\.)([A-Z])', r'\1 \2'),
        # 2) Insert a space after "such as" when glued to a digit.
        (r"(such as)(\d)", r"\1 \2"),
        # 3) Insert a space before "reps" when glued to a digit.
        (r"(\d)(reps)", r"\1 reps"),
        # 4) Collapse runs of two or more whitespace characters.
        (r"\s{2,}", " "),
    )

    result = text
    for pattern, replacement in rules:
        result = re.sub(pattern, replacement, result)
    return result.strip()
| # --------------------------------------------------------------------- | |
| # Chargement du modèle HF (tokenizer + modèle) | |
| # --------------------------------------------------------------------- | |
@lru_cache(maxsize=1)
def _load_exec_model():
    """
    Load the tokenizer and model of the execution generator once.

    The result is memoized with ``lru_cache``, so only the first call pays
    the loading cost; later calls return the same ``(tokenizer, model)``
    pair. Call ``_load_exec_model.cache_clear()`` to force a reload.

    Returns:
        Tuple ``(tokenizer, model)``; the model is moved to ``DEVICE`` and
        switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_REPO,
        subfolder=MODEL_SUBFOLDER,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        subfolder=MODEL_SUBFOLDER,
        torch_dtype=torch.float32,  # CPU friendly
    )
    model.to(DEVICE)
    model.eval()
    return tokenizer, model
| # --------------------------------------------------------------------- | |
| # Génération de texte pour l'exécution (1 prompt, pipeline complet) | |
| # --------------------------------------------------------------------- | |
def generate_execution_text(
    prompt: str,
    max_new_tokens: int = EXEC_MAX_NEW_TOKENS,
    temperature: float = 0.8,
    top_k: int = 250,
    top_p: float = 0.92,
) -> str:
    """
    Generate an execution description for a single prompt.

    Steps: tokenize the prompt, sample a continuation with the model,
    decode it, drop the echoed prompt, then post-process the text
    (trim to the last full sentence, keep the first two sentences,
    de-duplicate enumerations, final clean-up).

    Args:
        prompt: Prompt text; None or blank yields a fixed fallback message.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus sampling threshold.

    Returns:
        The post-processed generated text.
    """
    cleaned_prompt = (prompt or "").strip()
    if not cleaned_prompt:
        return "No prompt available to generate execution."

    tokenizer, model = _load_exec_model()

    batch = [cleaned_prompt]
    model_inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in model_inputs.items()}

    # 1. Generation settings
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,  # maximum number of generated tokens
        min_new_tokens=50,  # minimum length
        do_sample=True,  # sampling enabled (required for top-k / top-p)
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        no_repeat_ngram_size=3,  # forbid repeated 3-grams
        num_beams=5,
        num_return_sequences=1,  # one generation per prompt
        pad_token_id=tokenizer.eos_token_id,
        early_stopping=True,
    )
    token_ids = model.generate(**model_inputs, **generation_kwargs)
    decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    # 2. Drop the prompt echoed at the start of each generation
    continuations = strip_prompt_from_generations(
        prompts=batch,
        generated_outputs=decoded,
    )

    # 3. Per-item post-processing, in the same order for every item
    results = []
    for raw in continuations:
        text = trim_to_last_full_sentence(raw)
        text = keep_first_n_sentences(text, n=2)
        text = dedupe_muscle_list(text)
        results.append(clean_execution_text(text))

    return results[0] if results else ""
def get_dl_execution_model_report_components():
    """
    Build four Gradio-ready DataFrames from the JSON model report.

    Returns:
        Tuple ``(df_summary, df_model, df_training, df_metrics)``:
        - Summary: fixed list of top-level and ``dataset`` fields (Key / Value)
        - Model: entries of the ``model`` section (Key / Value)
        - Training: entries of the ``training`` section (Key / Value)
        - Metrics: entries of the ``metrics`` section (Metric / Value)

        Empty DataFrames are returned when the report file is missing,
        cannot be read or parsed, or does not contain a JSON object at its
        root. A section that is absent, null, or not a JSON object is
        treated as empty.
    """
    report_path = REPORT_PATH
    if not report_path or not report_path.exists():
        return _empty_dl_dfs()

    try:
        data = json.loads(report_path.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[DL REPORT] Error while reading {report_path}: {e}")
        return _empty_dl_dfs()

    # json.loads may return a list, string, number or None for valid JSON.
    if not isinstance(data, dict):
        print(f"[DL REPORT] Error while reading {report_path}: JSON root is not an object")
        return _empty_dl_dfs()

    def section(name: str) -> dict:
        # Absent, null or non-object sections are handled as empty ones.
        value = data.get(name)
        return value if isinstance(value, dict) else {}

    # ===== Summary =====
    dataset = section("dataset")
    summary_rows = [
        ("created_at", data.get("created_at", "")),
        ("task", data.get("task", "")),
        ("target", data.get("target", "")),
        ("framework", data.get("framework", "")),
        ("dataset.file", dataset.get("file", "")),
        ("dataset.size_bytes", dataset.get("size_bytes", "")),
        ("dataset.tokens", dataset.get("tokens", "")),
    ]
    df_summary = pd.DataFrame(summary_rows, columns=["Key", "Value"])

    # ===== Model config =====
    df_model = pd.DataFrame(
        list(section("model").items()),
        columns=["Key", "Value"],
    )

    # ===== Training =====
    df_training = pd.DataFrame(
        list(section("training").items()),
        columns=["Key", "Value"],
    )

    # ===== Metrics =====
    df_metrics = pd.DataFrame(
        list(section("metrics").items()),
        columns=["Metric", "Value"],
    )

    return df_summary, df_model, df_training, df_metrics
def _empty_dl_dfs():
    """
    Return the four empty report DataFrames (summary, model, training, metrics).

    The first three have columns Key / Value, the last one Metric / Value.
    """

    def blank(first_column: str) -> pd.DataFrame:
        # Same dict-of-empty-lists constructor for every frame.
        return pd.DataFrame({first_column: [], "Value": []})

    return blank("Key"), blank("Key"), blank("Key"), blank("Metric")