"""Strict pronunciation assessment demo.

Loads a Wav2Vec2 CTC model once at import time and exposes a Gradio app
that scores an uploaded recording for accuracy (model confidence),
fluency (characters per second) and completeness (target-word coverage).
"""

import gradio as gr
import librosa
import torch
import torch.nn.functional as F
from transformers import pipeline

# Load the engine once at module import so every request reuses the model.
print("Loading Strict Pronunciation Engine...")
asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")


def assess_pronunciation(audio_filepath, target_text):
    """Score a recording of *target_text* being read aloud.

    Args:
        audio_filepath: Path to the uploaded audio file (any format librosa reads).
        target_text: The sentence the speaker was asked to say.

    Returns:
        dict with ``accuracy_score``, ``fluency_score``, ``completeness_score``
        (all 0-100 ints) and ``transcription``, or ``{"error": ...}`` on
        missing input / failure.
    """
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}

    try:
        # 1. Load audio resampled to the 16 kHz rate the model was trained on.
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # 2. Strict scoring (confidence analysis).
        # BUG FIX: the raw waveform must go through the pipeline's feature
        # extractor first — wav2vec2-base-960h expects zero-mean/unit-variance
        # normalized input. Feeding the raw tensor directly (as before)
        # mis-scales the logits and distorts the confidence score.
        inputs = asr_pipe.feature_extractor(
            audio, sampling_rate=sr, return_tensors="pt"
        )
        with torch.no_grad():
            logits = asr_pipe.model(inputs.input_values).logits
        probs = F.softmax(logits, dim=-1)
        # Average, over all frames, of the probability of the best token:
        # a proxy for how cleanly each sound was articulated.
        confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # 3. Transcription for user-facing feedback.
        transcription_result = asr_pipe(audio_filepath)
        said = transcription_result["text"].lower()

        # Strict logic: squaring the 0-1 confidence gives a difficulty curve
        # that penalizes thick accents or mumbling before scaling to 0-100.
        accuracy = round((confidence ** 2) * 100)

        # Fluency: characters per second, capped at 100.
        # max(duration, 1) guards against division blow-up on tiny clips.
        duration = len(audio) / sr
        fluency = min(100, round((len(said) / max(duration, 1)) * 10))

        # BUG FIX: completeness previously ignored target_text entirely and
        # was derived from accuracy. Now it measures what fraction of the
        # target's words actually appear in the transcription.
        target_words = target_text.lower().split()
        if target_words:
            said_words = set(said.split())
            hit = sum(1 for w in target_words if w in said_words)
            completeness = round(100 * hit / len(target_words))
        else:
            completeness = 0

        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said,
        }
    except Exception as e:
        # Surface the failure to the UI rather than crashing the worker.
        return {"error": str(e)}


# Gradio 3 interface (``source=`` kwarg is the Gradio 3 API; Gradio 4
# renamed it to ``sources=[...]`` — update if the dependency is bumped).
interface = gr.Interface(
    fn=assess_pronunciation,
    inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],
    outputs=gr.JSON(),
)

if __name__ == "__main__":
    interface.launch()