# trAIn.me_v3/src/gradio/generators/execution_generator.py
import json
import re
from functools import lru_cache
from pathlib import Path
from typing import List, Mapping, Union

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# ---------------------------------------------------------------------
# Paths + device
# ---------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).resolve().parents[2]
MODEL_DIR = PROJECT_ROOT / "config"
# Directory of the fine-tuned execution model (local alternative):
# EXEC_MODEL_DIR = MODEL_DIR / "transformer_execution_generator_v3"
# Hugging Face Hub repo hosting the fine-tuned model (tokenizer + model)
MODEL_REPO = "AIppyDev/transformer_execution_generator_v3"
MODEL_SUBFOLDER = "transformer_execution_generator_v3"  # folder name inside the repo
REPORT_PATH = MODEL_DIR / "execution_generator_model_report.json"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Maximum number of generated tokens (equivalent of MAX_LENGTH in the notebook)
EXEC_MAX_NEW_TOKENS = 180
# ---------------------------------------------------------------------
# Building the prompt for a single program
# ---------------------------------------------------------------------
def build_execution_prompt(row: Union[pd.Series, Mapping, None]) -> str:
"""
Construit le prompt d'exécution à partir d'une ligne de programme.
Format :
The {exercise_name} exercise for "{target_muscles}" with "{equipment}" on "{difficulty}" level:
"""
if row is None:
return ""
    def get_val(key: str, default: str = "") -> str:
        # Both pd.Series and plain mappings expose .get(); anything else
        # falls back to the default.
        val = row.get(key, default) if hasattr(row, "get") else default
        if pd.isna(val):
            return default
        return str(val).strip()
name = get_val("exercise_name", "this exercise")
muscle = get_val("target_muscles", "the target muscles")
equipment = get_val("equipment", "bodyweight")
difficulty = get_val("difficulty", "General")
prompt = (
f'The {name} exercise for '
f'"{muscle}" with '
f'"{equipment}" on '
f'"{difficulty}" level:'
)
return prompt
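# A quick illustration of the prompt format (the row values here are made up,
# not taken from the real dataset):
#
#     row = pd.Series({
#         "exercise_name": "Barbell Squat",
#         "target_muscles": "glutes, quadriceps",
#         "equipment": "barbell",
#         "difficulty": "Intermediate",
#     })
#     build_execution_prompt(row)
#     # -> 'The Barbell Squat exercise for "glutes, quadriceps" with "barbell" on "Intermediate" level:'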
# ---------------------------------------------------------------------
# Post-processing utilities
# ---------------------------------------------------------------------
def strip_prompt_from_generations(prompts: List[str], generated_outputs: List[str]) -> List[str]:
"""
Supprime le prompt au début de chaque génération si le modèle l'a recopié.
"""
cleaned = []
for prompt, gen in zip(prompts, generated_outputs):
gen_strip = gen.strip()
if gen_strip.startswith(prompt):
cleaned.append(gen_strip[len(prompt):].strip())
else:
cleaned.append(gen_strip)
return cleaned
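# Example (illustrative strings):
#
#     strip_prompt_from_generations(
#         prompts=["The Squat exercise:"],
#         generated_outputs=["The Squat exercise: Keep your back straight."],
#     )
#     # -> ["Keep your back straight."]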
def trim_to_last_full_sentence(text: str) -> str:
"""
Coupe le texte à la dernière phrase complète.
On garde tout jusqu'au dernier '.', '!' ou '?'.
Si aucun n'est trouvé, on renvoie le texte brut stripé.
"""
text = text.strip()
last_dot = text.rfind(".")
last_excl = text.rfind("!")
last_q = text.rfind("?")
last_punct = max(last_dot, last_excl, last_q)
if last_punct != -1:
return text[: last_punct + 1].strip()
return text
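# Example: a trailing fragment after the last terminal punctuation is dropped.
#
#     trim_to_last_full_sentence("Bend your knees. Push through your heels. Then sl")
#     # -> "Bend your knees. Push through your heels."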
def keep_first_n_sentences(text: str, n: int = 2) -> str:
"""
Garde seulement les n premières phrases (séparation grossière sur . ! ?).
"""
sentences = re.split(r'([.!?])', text)
chunks = []
count = 0
for i in range(0, len(sentences) - 1, 2):
sentence = sentences[i].strip()
punct = sentences[i + 1].strip()
if sentence:
chunks.append(sentence + punct)
count += 1
if count >= n:
break
return " ".join(chunks).strip() if chunks else text.strip()
def dedupe_muscle_list(text: str) -> str:
"""
Détecte les listes du type 'glutes, hamstrings, and hamstrings'
dans la phrase, les déduplique et les reformate proprement.
"""
# Regex qui capture une liste séparée par virgules ou 'and'
# Exemple capturé : "glutes, hamstrings, and hamstrings"
pattern = r"([A-Za-z ]+(?:,\s*[A-Za-z ]+)*(?:\s+and\s+[A-Za-z ]+)?)"
def process_match(match):
segment = match.group(0)
        # Split on commas and on the word 'and'
parts = re.split(r",|\band\b", segment)
parts = [p.strip() for p in parts if p.strip()]
        # Deduplicate while preserving order
seen = set()
unique = []
for p in parts:
if p.lower() not in seen:
seen.add(p.lower())
unique.append(p)
        # Natural-language reconstruction
if len(unique) == 1:
return unique[0]
if len(unique) == 2:
return f"{unique[0]} and {unique[1]}"
return ", ".join(unique[:-1]) + f", and {unique[-1]}"
    # Apply the rewrite to every match
cleaned = re.sub(pattern, process_match, text)
return cleaned
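# Example: duplicates are removed and the list is rebuilt naturally.
#
#     dedupe_muscle_list("glutes, hamstrings, and hamstrings")
#     # -> "glutes and hamstrings"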
def clean_execution_text(text: str) -> str:
"""
Nettoyage final : espaces, 'reps', etc.
"""
if not isinstance(text, str):
return text
cleaned = text
    # 1) Add a space after a period glued to a capital letter
cleaned = re.sub(r'(\.)([A-Z])', r'\1 \2', cleaned)
    # 2) Add a space after "such as" when glued to a digit
cleaned = re.sub(r"(such as)(\d)", r"\1 \2", cleaned)
    # 3) Add a space between a digit and "reps" when glued together
cleaned = re.sub(r"(\d)(reps)", r"\1 reps", cleaned)
    # 4) Collapse runs of whitespace into single spaces
cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
return cleaned
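# Example chaining all four fixes:
#
#     clean_execution_text("Do 3 sets such as8reps.Keep  your core tight.")
#     # -> "Do 3 sets such as 8 reps. Keep your core tight."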
# ---------------------------------------------------------------------
# Loading the HF model (tokenizer + model)
# ---------------------------------------------------------------------
@lru_cache()
def _load_exec_model():
"""
Charge une seule fois tokenizer + modèle pour l'execution generator.
Utilise un cache pour éviter les rechargements coûteux.
"""
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO,
subfolder=MODEL_SUBFOLDER,)
model = AutoModelForCausalLM.from_pretrained(
MODEL_REPO,
subfolder=MODEL_SUBFOLDER,
torch_dtype=torch.float32, # CPU friendly
)
model.to(DEVICE)
model.eval()
return tokenizer, model
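# The first call downloads the weights from the Hub (network required) and
# populates the local HF cache; later calls in the same process hit the
# lru_cache and return instantly:
#
#     tokenizer, model = _load_exec_model()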
# ---------------------------------------------------------------------
# Text generation for the execution description (one prompt, full pipeline)
# ---------------------------------------------------------------------
def generate_execution_text(
prompt: str,
max_new_tokens: int = EXEC_MAX_NEW_TOKENS,
temperature: float = 0.8,
top_k: int = 250,
top_p: float = 0.92,
) -> str:
"""
    Generates an execution description from a single prompt, with the same
    pipeline as the training notebook.
"""
prompt = (prompt or "").strip()
if not prompt:
return "No prompt available to generate execution."
tokenizer, model = _load_exec_model()
sample_prompts = [prompt]
encodings = tokenizer(
sample_prompts,
return_tensors="pt",
padding=True,
truncation=True,
)
encodings = {k: v.to(DEVICE) for k, v in encodings.items()}
    # 1. Generation
    outputs = model.generate(
        **encodings,
        max_new_tokens=max_new_tokens,  # maximum number of generated tokens
        min_new_tokens=50,              # minimum length
        do_sample=True,                 # sampling on (required for top-k / top-p)
        temperature=temperature,        # a little heat for more variety
        top_k=top_k,                    # wide top-k sampling
        top_p=top_p,                    # nucleus sampling
        no_repeat_ngram_size=3,         # avoids repeated 3-grams
        num_beams=5,                    # beam search to improve coherence
        num_return_sequences=1,         # a single generation per prompt
        pad_token_id=tokenizer.eos_token_id,
        early_stopping=True,            # stop early once EOS is reached in every beam
    )
generated_outputs = tokenizer.batch_decode(
outputs,
skip_special_tokens=True,
)
    # 2. Strip the prompt from the start of each generation
cleaned_generations = strip_prompt_from_generations(
prompts=sample_prompts,
generated_outputs=generated_outputs,
)
    # 3. Per-item post-processing
trimmed = [trim_to_last_full_sentence(txt) for txt in cleaned_generations]
limited = [keep_first_n_sentences(t, n=2) for t in trimmed]
deduped = [dedupe_muscle_list(t) for t in limited]
final = [clean_execution_text(t) for t in deduped]
return final[0] if final else ""
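# End-to-end usage sketch (sampling means the output varies run to run; `row`
# is an illustrative pd.Series as in the build_execution_prompt example):
#
#     prompt = build_execution_prompt(row)
#     text = generate_execution_text(prompt, temperature=0.8)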
def get_dl_execution_model_report_components():
"""
Retourne 4 DataFrames Gradio-ready :
- Summary
- Model
- Training
- Metrics
Si pas de rapport → retourne les DF vides.
"""
report_path = REPORT_PATH
if not report_path or not report_path.exists():
return _empty_dl_dfs()
try:
data = json.loads(report_path.read_text(encoding="utf-8"))
except Exception as e:
print(f"[DL REPORT] Error while reading {report_path}: {e}")
return _empty_dl_dfs()
# ===== Summary =====
dataset = data.get("dataset", {})
summary_rows = [
("created_at", data.get("created_at", "")),
("task", data.get("task", "")),
("target", data.get("target", "")),
("framework", data.get("framework", "")),
("dataset.file", dataset.get("file", "")),
("dataset.size_bytes", dataset.get("size_bytes", "")),
("dataset.tokens", dataset.get("tokens", "")),
]
df_summary = pd.DataFrame(summary_rows, columns=["Key", "Value"])
# ===== Model config =====
    model_cfg = data.get("model", {}) or {}
    df_model = pd.DataFrame(list(model_cfg.items()), columns=["Key", "Value"])
    # ===== Training =====
    training_cfg = data.get("training", {}) or {}
    df_training = pd.DataFrame(list(training_cfg.items()), columns=["Key", "Value"])
    # ===== Metrics =====
    metrics_cfg = data.get("metrics", {}) or {}
    df_metrics = pd.DataFrame(list(metrics_cfg.items()), columns=["Metric", "Value"])
return df_summary, df_model, df_training, df_metrics
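# Hedged wiring sketch for a Gradio tab (component labels are illustrative;
# gr.Dataframe accepts a pandas DataFrame as its value):
#
#     import gradio as gr
#     df_summary, df_model, df_training, df_metrics = (
#         get_dl_execution_model_report_components()
#     )
#     with gr.Blocks() as demo:
#         gr.Dataframe(value=df_summary, label="Summary")
#         gr.Dataframe(value=df_model, label="Model")
#         gr.Dataframe(value=df_training, label="Training")
#         gr.Dataframe(value=df_metrics, label="Metrics")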
def _empty_dl_dfs():
df_summary = pd.DataFrame({"Key": [], "Value": []})
df_model = pd.DataFrame({"Key": [], "Value": []})
df_training = pd.DataFrame({"Key": [], "Value": []})
df_metrics = pd.DataFrame({"Metric": [], "Value": []})
return df_summary, df_model, df_training, df_metrics
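
if __name__ == "__main__":
    # Hedged smoke test: build a prompt from an illustrative row and run the
    # full generation pipeline (downloads the model on first use).
    demo_row = pd.Series({
        "exercise_name": "Barbell Squat",
        "target_muscles": "glutes, quadriceps",
        "equipment": "barbell",
        "difficulty": "Intermediate",
    })
    print(generate_execution_text(build_execution_prompt(demo_row)))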