Spaces:

proz
/

api_cnam

Running

App Files Files Community

proz committed on Jan 25

Commit

b0b4663

verified ·

1 Parent(s): fc1a510

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -46

app.py CHANGED Viewed

@@ -6,26 +6,26 @@ import io
 import numpy as np
 from contextlib import asynccontextmanager
-# V1 configuration
 MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
 # Shared state: lifespan() stores the processor, model and vocabulary here.
 ai_context = {}
 # Startup hook passed to FastAPI(lifespan=...): loads the processor and the
 # model once and stores them, with the tokenizer vocabulary, in ai_context.
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    print(f"🚀 Chargement du modèle V1 {MODEL_ID}...")
     try:
-        # Load the processor and the model
         processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
         model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-        model.eval() # Read-only mode (faster)
         ai_context["processor"] = processor
         ai_context["model"] = model
-        # Keep the vocabulary for the mask (e.g. 'a': 12, 'b': 14)
         ai_context["vocab"] = processor.tokenizer.get_vocab()
-        print("✅ Modèle V1 prêt et vocabulaire indexé.")
     except Exception as e:
         # A load failure is only printed; control still reaches the yield below.
         print(f"❌ Erreur critique : {e}")
     yield
@@ -35,84 +35,88 @@ app = FastAPI(lifespan=lifespan)
 # Root endpoint: returns a fixed status payload.
 @app.get("/")
 def home():
-    return {"status": "Model Cnam V1 Masked is running"}
-# --- ADDED FOR DIAGNOSTICS ---
-@app.get("/vocab")
-def get_vocab():
-    """Return the complete V1 vocabulary dictionary for comparison."""
-    if "vocab" not in ai_context:
-        return {"error": "Modèle non chargé"}
-    # Sort the dictionary alphabetically by key to make it easier to read
-    sorted_vocab = dict(sorted(ai_context["vocab"].items()))
-    return {
-        "model": MODEL_ID,
-        "total_tokens": len(sorted_vocab),
-        "tokens": sorted_vocab
-    }
-# -----------------------------
 # POST endpoint: decodes an uploaded audio file into a phoneme string, with
 # the argmax restricted to the comma-separated tokens in allowed_phones.
 @app.post("/transcribe")
 async def transcribe(
     file: UploadFile = File(...),
-    allowed_phones: str = Form(...) # This field is REQUIRED
 ):
     if "model" not in ai_context:
         raise HTTPException(status_code=500, detail="Modèle non chargé")
-    # 1. Read the audio with librosa (forced to 16 kHz)
     try:
         content = await file.read()
-        # io.BytesIO lets us read from memory without a temporary file
         audio_array, _ = librosa.load(io.BytesIO(content), sr=16000)
     except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Erreur fichier audio: {str(e)}")
     # 2. Model preparation
     processor = ai_context["processor"]
     model = ai_context["model"]
     inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
-    # 3. Compute the logits (raw scores before the decision)
     with torch.no_grad():
         logits = model(inputs.input_values).logits
-    # --- 4. APPLY THE BINARY MASK ---
-    # Parse the requested list (e.g. "a,i,o")
     requested_phones = [p.strip() for p in allowed_phones.split(',') if p.strip()]
     if requested_phones:
-        vocab = ai_context["vocab"]
-        # Technical tokens that CTC needs in order to work (silence, padding...)
-        # The Cnam model uses '|' as the word separator / silence
         technical_tokens = ["<pad>", "<s>", "</s>", "<unk>", "|", "[PAD]", "[UNK]"]
-        # Build the set of allowed tokens
         full_allowed_set = set(requested_phones + technical_tokens)
-        # Look up their numeric positions (IDs) in the model vocabulary
         allowed_indices = [vocab[t] for t in full_allowed_set if t in vocab]
         if allowed_indices:
-            # Build the mask: everything is forbidden by default (-inf)
             mask = torch.full((logits.shape[-1],), float('-inf'))
-            # Open the gate only for the allowed indices (0.0)
             mask[allowed_indices] = 0.0
-            # Apply the mask to the logits
             logits = logits + mask
-    # 5. Final decoding (argmax)
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return {
         "ipa": transcription,
-        "allowed_used": allowed_phones
     }

 import numpy as np
 from contextlib import asynccontextmanager
+# --- V1 CONFIGURATION ---
 MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
+# Shared state: lifespan() stores the processor, model and vocabularies here.
 ai_context = {}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Load the phonemizer once at startup and publish it in ``ai_context``.

     On success ``ai_context`` receives four keys: ``processor``, ``model``,
     ``vocab`` (token -> id) and ``id2vocab`` (id -> token, used for debug
     output).  On failure the error is printed and the application still
     starts; ``/transcribe`` then answers 500 because ``"model"`` is absent.

     Args:
         app: The FastAPI application this lifespan is attached to (unused).

     Yields:
         None. Control returns to the caller once loading has been attempted.
     """
     print(f"🚀 Chargement V1 (Stable) : {MODEL_ID}...")
     try:
         processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
         model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
         model.eval()
         vocab = processor.tokenizer.get_vocab()
         # Reverse mapping (id -> token) for the debug output.
         id2vocab = {token_id: token for token, token_id in vocab.items()}
         # Publish everything in a single step, only after every piece has been
         # built: a failure above must not leave "model" registered without the
         # vocabularies, which would let /transcribe pass its readiness check
         # and then fail on a missing key.
         ai_context.update(
             processor=processor,
             model=model,
             vocab=vocab,
             id2vocab=id2vocab,
         )
         print("✅ Modèle V1 prêt.")
     except Exception as e:
         # Deliberate best effort: keep the app up so it can report the problem.
         print(f"❌ Erreur critique : {e}")
     yield
 @app.get("/")
 def home():
     """Root endpoint: report that the API is running and which model it serves."""
     status_payload = {"status": "API V1 Stable running", "model": MODEL_ID}
     return status_payload
 # Tokens that must stay decodable whatever the caller allows; only those that
 # actually exist in the vocabulary end up in the mask.
 _TECHNICAL_TOKENS = ("<pad>", "<s>", "</s>", "<unk>", "|", "[PAD]", "[UNK]")
 # Frames whose best token is one of these are left out of the debug trace.
 _TRACE_SKIPPED_TOKENS = frozenset({"<pad>", "<s>", "</s>", "|"})


 def _build_debug_trace(logits, id2vocab, top_k=3, min_prob=0.05):
     """List the top candidates of each informative frame of the first batch item.

     A frame is reported when its best token is not in ``_TRACE_SKIPPED_TOKENS``
     and its best probability is above ``min_prob``.
     """
     probs = torch.nn.functional.softmax(logits, dim=-1)
     # Clamp k so a vocabulary smaller than top_k cannot make topk() fail.
     k = min(top_k, probs.shape[-1])
     top_probs, top_ids = torch.topk(probs, k, dim=-1)
     # One bulk conversion instead of a .item() call per value.
     frame_ids = top_ids[0].tolist()
     frame_probs = top_probs[0].tolist()
     trace = []
     for t, (ids, scores) in enumerate(zip(frame_ids, frame_probs)):
         best_token = id2vocab.get(ids[0], "")
         if best_token in _TRACE_SKIPPED_TOKENS or scores[0] <= min_prob:
             continue
         candidates = [
             f"{id2vocab.get(token_id, 'UNK')} ({int(score * 100)}%)"
             for token_id, score in zip(ids, scores)
         ]
         trace.append(f"T{t}: " + " | ".join(candidates))
     return trace


 def _mask_logits(logits, vocab, requested_phones):
     """Return ``logits`` with every token outside the allowed set set to -inf.

     The allowed set is ``requested_phones`` plus ``_TECHNICAL_TOKENS``; names
     missing from ``vocab`` are ignored.  If nothing matches, ``logits`` is
     returned unchanged.
     """
     allowed_tokens = set(requested_phones).union(_TECHNICAL_TOKENS)
     allowed_indices = [vocab[token] for token in allowed_tokens if token in vocab]
     if not allowed_indices:
         return logits
     # new_full() keeps the mask on the same device and dtype as the logits.
     mask = logits.new_full((logits.shape[-1],), float("-inf"))
     mask[allowed_indices] = 0.0
     return logits + mask


 @app.post("/transcribe")
 async def transcribe(
     file: UploadFile = File(...),
     allowed_phones: str = Form(...)
 ):
     """Transcribe an uploaded audio file into phonemes, restricted to a token set.

     Args:
         file: Uploaded audio file; it is decoded with librosa at 16 kHz.
         allowed_phones: Comma-separated token names the decoder may output.
             Blank entries are dropped; if none remain, no mask is applied.

     Returns:
         A dict with ``ipa`` (masked transcription), ``raw_ipa_before_mask``
         (transcription without the mask), ``debug_trace`` (per-frame top
         candidates), ``allowed_used`` (the raw form value) and ``version``.

     Raises:
         HTTPException: 500 when the model is not loaded; 400 when the audio
             cannot be decoded or contains no samples.
     """
     if "model" not in ai_context:
         raise HTTPException(status_code=500, detail="Modèle non chargé")

     # 1. Read the audio (plain load, no gain boost, no looping).
     try:
         content = await file.read()
         audio_array, _ = librosa.load(io.BytesIO(content), sr=16000)
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Erreur audio: {str(e)}") from e
     if audio_array.size == 0:
         # Reject here with a client error instead of letting the model fail.
         raise HTTPException(status_code=400, detail="Erreur audio: fichier audio vide")

     # 2. Model preparation.
     processor = ai_context["processor"]
     model = ai_context["model"]
     vocab = ai_context["vocab"]
     id2vocab = ai_context["id2vocab"]
     inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)

     # 3. Compute the logits.
     # NOTE(review): inference runs inline inside this coroutine.
     with torch.no_grad():
         logits = model(inputs.input_values).logits

     # Debug information, computed on the unmasked logits.
     debug_trace = _build_debug_trace(logits, id2vocab)
     raw_ids = torch.argmax(logits, dim=-1)
     raw_ipa = processor.batch_decode(raw_ids, skip_special_tokens=True)[0]

     # 4. Apply the binary mask; requested names are matched as whole tokens.
     requested_phones = [p.strip() for p in allowed_phones.split(',') if p.strip()]
     if requested_phones:
         logits = _mask_logits(logits, vocab, requested_phones)

     # 5. Final decoding.
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return {
         "ipa": transcription,
         "raw_ipa_before_mask": raw_ipa,
         "debug_trace": debug_trace,
         "allowed_used": allowed_phones,
         "version": "V1_Classic"
     }