Spaces:

proz
/

api_cnam

Running

App Files Files Community

proz committed on Jan 24

Commit

3789ef8

verified ·

1 Parent(s): ba25791

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +26 -0
app.py +101 -0
download_model.py +8 -0
requirements.txt +9 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.10-slim
+# System-level audio dependencies. --no-install-recommends keeps optional
+# packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libsndfile1 \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Keep the Hugging Face cache under /app instead of the build user's home
+# directory, so the model downloaded at build time is found (and, after the
+# chmod below, writable) when the container runs as a different user.
+# NOTE(review): Hugging Face Spaces is documented to run containers as user
+# ID 1000 - confirm the build-time cache is actually used at startup.
+ENV HF_HOME=/app/.cache/huggingface
+# Python libraries
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Download the model DURING the build so startup does not need to fetch it
+COPY download_model.py .
+RUN python download_model.py
+# Application code
+COPY . .
+# Access rights for Hugging Face (also covers the model cache under /app)
+RUN chmod -R 777 /app
+# Start the API server
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import torch
+import librosa
+import io
+import numpy as np
+from contextlib import asynccontextmanager
+# Configuration
+# Model identifier passed to `from_pretrained` for both processor and model.
+MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
+# Module-level state shared between `lifespan` (which fills it with the keys
+# "processor", "model" and "vocab") and the request handlers (which read it).
+ai_context = {}
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    print("🚀 Chargement du modèle Cnam avec Masque...")
+    try:
+        # Chargement du processeur et du modèle
+        processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+        model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+        model.eval() # Mode lecture seule (plus rapide)
+        ai_context["processor"] = processor
+        ai_context["model"] = model
+        # On stocke le vocabulaire pour le masque (ex: 'a': 12, 'b': 14)
+        ai_context["vocab"] = processor.tokenizer.get_vocab()
+        print("✅ Modèle prêt et vocabulaire indexé.")
+    except Exception as e:
+        print(f"❌ Erreur critique : {e}")
+    yield
+    ai_context.clear()
+# Application object served by uvicorn; `lifespan` runs around its lifetime.
+app = FastAPI(lifespan=lifespan)
+@app.get("/")
+def home():
+    return {"status": "Model Cnam Masked is running"}
+@app.post("/transcribe")
+async def transcribe(
+    file: UploadFile = File(...),
+    allowed_phones: str = Form(...) # Ce champ est OBLIGATOIRE
+):
+    if "model" not in ai_context:
+        raise HTTPException(status_code=500, detail="Modèle non chargé")
+    # 1. Lecture Audio avec Librosa (force 16kHz)
+    try:
+        content = await file.read()
+        # On utilise io.BytesIO pour lire depuis la mémoire sans fichier temporaire
+        audio_array, _ = librosa.load(io.BytesIO(content), sr=16000)
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Erreur fichier audio: {str(e)}")
+    # 2. Préparation Modèle
+    processor = ai_context["processor"]
+    model = ai_context["model"]
+    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
+    # 3. Calcul des Logits (Probabilités brutes avant décision)
+    with torch.no_grad():
+        logits = model(inputs.input_values).logits
+    # --- 4. APPLICATION DU MASQUE BINAIRE ---
+    # On récupère la liste demandée (ex: "a,i,o")
+    requested_phones = [p.strip() for p in allowed_phones.split(',') if p.strip()]
+    if requested_phones:
+        vocab = ai_context["vocab"]
+        # Tokens techniques indispensables pour que le CTC fonctionne (silence, padding...)
+        # Le modèle Cnam utilise '|' comme séparateur de mot/silence
+        technical_tokens = ["<pad>", "<s>", "</s>", "<unk>", "|", "[PAD]", "[UNK]"]
+        # On construit l'ensemble des tokens autorisés
+        full_allowed_set = set(requested_phones + technical_tokens)
+        # On trouve leurs positions numériques (ID) dans le cerveau du modèle
+        allowed_indices = [vocab[t] for t in full_allowed_set if t in vocab]
+        if allowed_indices:
+            # Création du masque : Par défaut, tout est interdit (-Infini)
+            mask = torch.full((logits.shape[-1],), float('-inf'))
+            # On ouvre les portes seulement pour les indices autorisés (0.0)
+            mask[allowed_indices] = 0.0
+            # On applique le masque aux logits
+            logits = logits + mask
+    # 5. Décodage final (Argmax)
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return {
+        "ipa": transcription,
+        "allowed_used": allowed_phones
+    }

download_model.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
+print(f"⬇️ Téléchargement du modèle {MODEL_ID}...")
+processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+print("✅ Modèle téléchargé et mis en cache.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# Index options belong on their own line in a requirements file; appended to
+# a requirement line they are not applied to package resolution.
+# This adds the CPU-only PyTorch wheel index.
+--extra-index-url https://download.pytorch.org/whl/cpu
+fastapi
+uvicorn
+python-multipart
+torch
+transformers
+librosa
+soundfile
+numpy
+scipy