File size: 7,843 Bytes
34ac5c4
98a891c
34ac5c4
 
 
98a891c
 
 
f3ada2c
29c2fd9
fa01798
29c2fd9
f3ada2c
29c2fd9
f3ada2c
34ac5c4
29c2fd9
fa01798
f3ada2c
29c2fd9
f3ada2c
 
29c2fd9
f3ada2c
 
 
29c2fd9
 
f3ada2c
29c2fd9
 
f3ada2c
29c2fd9
34ac5c4
fa01798
 
f3ada2c
 
98a891c
f3ada2c
98a891c
 
 
 
 
 
 
 
fa01798
98a891c
 
4d3b6d4
 
 
 
 
 
98a891c
 
 
 
4d3b6d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a891c
4d3b6d4
98a891c
4d3b6d4
 
 
 
 
fa01798
 
 
98a891c
4d3b6d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a891c
4d3b6d4
 
98a891c
 
fa01798
98a891c
 
 
 
 
 
 
 
 
 
 
 
 
fa01798
98a891c
5a94ac4
98a891c
 
 
 
 
 
 
5a94ac4
 
 
 
 
 
 
 
 
 
 
98a891c
5a94ac4
98a891c
fa01798
98a891c
 
 
 
 
 
 
 
 
34ac5c4
f3ada2c
fa01798
34ac5c4
8862332
 
fa01798
 
98a891c
fa01798
34ac5c4
 
98a891c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os
import json
import gradio as gr
import nemo.collections.asr as nemo_asr
import spaces
import librosa
import numpy as np
import torch
from huggingface_hub import hf_hub_download

# 1. SETTINGS - Matches your public repo perfectly
# Location of the fine-tuned Parakeet checkpoint on the Hugging Face Hub.
REPO_ID = "Ali428/parakeet-weights"
FILENAME = "parakeet_final.nemo"

print("πŸš€ Initializing FYP ASR Engine...")

try:
    # 2. DOWNLOAD THE FILE (No token needed for public repos)
    # hf_hub_download caches the checkpoint locally and returns its path,
    # so repeated Space restarts do not re-download.
    print(f"Downloading {FILENAME} from {REPO_ID}...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
    )

    # 3. LOAD THE NEMO MODEL
    print("Loading model into memory...")
    model = nemo_asr.models.ASRModel.restore_from(model_path)
    model.eval()  # inference mode (disables dropout, etc.)
    print("βœ… Model loaded successfully!")

except Exception as e:
    # Fail fast: the Space is useless without the model, so log the error
    # and abort startup.
    print(f"❌ ERROR DURING STARTUP: {e}")
    raise  # bare raise preserves the full original traceback (was `raise e`)

# 4. TRANSCRIPTION FUNCTION

def _group_into_segments(word_timestamps, words_per_segment):
    """Group per-word timestamp dicts into segments of up to
    *words_per_segment* words, each shaped {'text', 'start', 'end'}."""
    segments = []
    for i in range(0, len(word_timestamps), words_per_segment):
        seg = word_timestamps[i:i + words_per_segment]
        if seg:
            segments.append({
                'text':  ' '.join(w['word'] for w in seg),
                'start': seg[0]['start'],
                'end':   seg[-1]['end'],
            })
    return segments


def _segments_from_timesteps(hypothesis, full_text, words_per_segment=15):
    """Derive segment timestamps from the hypothesis' encoder timesteps.

    Words are distributed linearly over the timestep sequence (NeMo does not
    provide exact word boundaries here). Raises (ValueError,
    ZeroDivisionError, IndexError, ...) on missing or unusable timestep data
    so the caller can fall back to evenly spaced segments.
    """
    timestep = getattr(hypothesis, 'timestep', None)
    words = getattr(hypothesis, 'words', None) or full_text.split()

    if timestep is None:
        raise ValueError("timestep is None β€” will use fallback")

    # NeMo may return timestep as a Tensor, list, dict, or None.
    # A dict typically looks like {'timestep': [...], 'word': [...]}.
    if isinstance(timestep, dict):
        timestep = timestep.get('timestep') or timestep.get('word') or list(timestep.values())[0]

    # Normalise to a numpy array for uniform indexing below.
    if isinstance(timestep, torch.Tensor):
        timestep_array = timestep.cpu().numpy()
    else:
        timestep_array = np.array(timestep, dtype=np.float32)

    if len(timestep_array) == 0:
        raise ValueError("timestep array is empty β€” will use fallback")

    # Parakeet TDT uses 8x encoder subsampling (4x conv + 2x additional),
    # so each encoder frame = 8 * 10ms = 80ms of real audio time.
    frame_duration = 0.08

    # Evenly distribute the timestep frames across the words.
    # (ZeroDivisionError on an empty transcript -> fallback path.)
    frames_per_word = len(timestep_array) / len(words)
    word_timestamps = []

    for i, word in enumerate(words):
        start_frame = min(int(i * frames_per_word), len(timestep_array) - 1)
        end_frame = min(int((i + 1) * frames_per_word), len(timestep_array))

        actual_start_frame = int(timestep_array[start_frame])
        if end_frame > start_frame:
            actual_end_frame = int(timestep_array[min(end_frame - 1, len(timestep_array) - 1)])
        else:
            actual_end_frame = actual_start_frame

        word_timestamps.append({
            'word':  word,
            'start': round(actual_start_frame * frame_duration, 3),
            'end':   round(actual_end_frame * frame_duration, 3),
        })

    return _group_into_segments(word_timestamps, words_per_segment)


def _fallback_segments(full_text, duration, words_per_segment=15):
    """Spread the transcript evenly over *duration* seconds when no usable
    timestep information is available."""
    words = full_text.split()
    segment_duration = duration / max(1, len(words) / words_per_segment)

    segments = []
    for i in range(0, len(words), words_per_segment):
        segment_words = words[i:i + words_per_segment]
        segment_idx = i // words_per_segment
        segments.append({
            'text': ' '.join(segment_words),
            'start': round(segment_idx * segment_duration, 3),
            'end': round((segment_idx + 1) * segment_duration, 3)
        })
    return segments


@spaces.GPU(duration=120)  # Increased duration slightly for long videos
def transcribe(audio_path):
    """Transcribe *audio_path* with the global NeMo model.

    Returns a pretty-printed JSON string: on success a dict with 'success',
    'text', 'segment_timestamps', model/device info and debug fields; on
    failure {'success': False, 'error': ...}.
    """
    if audio_path is None:
        return json.dumps({"success": False, "error": "Please upload an audio file."})

    try:
        # 1. Get audio duration using librosa (resampled to the model's 16 kHz)
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        duration = len(audio) / sr

        # 2. Transcribe with return_hypotheses=True to get timestamps
        transcription = model.transcribe([audio_path], return_hypotheses=True)

        # NeMo may return a flat list of hypotheses or a nested list.
        hypothesis = transcription[0][0] if isinstance(transcription[0], list) else transcription[0]
        full_text = hypothesis.text

        # DEBUG: single consolidated log (previously duplicated in two
        # places) to diagnose CPU vs GPU timestamp differences.
        _ts = getattr(hypothesis, 'timestep', None)
        print(f"DEBUG timestep type={type(_ts)} | has_words={hasattr(hypothesis, 'words')} | cuda={torch.cuda.is_available()}")
        if _ts is not None:
            print(f"DEBUG timestep value (raw)={str(_ts)[:200]}")

        # 3. Process timestamps; any failure falls back to even spacing.
        try:
            segments = _segments_from_timesteps(hypothesis, full_text)
        except Exception as e:
            print(f"Warning: Timestamp extraction failed: {e}")
            segments = _fallback_segments(full_text, duration)

        # 4. Construct response identical to local backend
        word_count = len(full_text.split())

        result = {
            "success": True,
            "text": full_text,
            "segment_timestamps": segments,
            "model": "nvidia/parakeet-tdt-0.6b-v3",
            "device": "cuda" if torch.cuda.is_available() else "cpu",
            "audio_duration": round(duration, 2),
            "word_count": word_count,
            # DEBUG: remove once timestamps are confirmed working
            "debug_info": {
                "has_timestep": hasattr(hypothesis, 'timestep'),
                "timestep_type": str(type(getattr(hypothesis, 'timestep', None))),
                "timestep_len": len(getattr(hypothesis, 'timestep', [])) if hasattr(hypothesis, 'timestep') else 0,
                "timestep_sample": str(getattr(hypothesis, 'timestep', None))[:100] if hasattr(hypothesis, 'timestep') else None,
                "has_words": hasattr(hypothesis, 'words'),
                "cuda_available": torch.cuda.is_available(),
                # NOTE(review): crude heuristic β€” only flags fallback for
                # clips ending past ~190s; not reliable in general.
                "segments_from_fallback": len(segments) > 0 and segments[-1]['end'] > 190,
            }
        }

        # Return as pretty JSON string for the Gradio UI
        return json.dumps(result, indent=2)

    except Exception as e:
        import traceback
        traceback.print_exc()
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)

# 5. GRADIO INTERFACE
# For API usage, we can skip the UI, but we'll leave it for debugging
demo = gr.Interface(
    fn=transcribe,  # returns a JSON string, rendered by the Code component below
    inputs=gr.Audio(type="filepath", label="Upload Audio"),  # filepath: transcribe() needs a path, not raw samples
    outputs=gr.Code(language="json", label="Transcription Result JSON"), # Changed to Code/JSON for better formatting
    title="Ali's FYP: Parakeet ASR Engine",
    description="Running on NVIDIA A100 (ZeroGPU). API-ready endpoint.",
    api_name="transcribe" # Allows api requests to /api/transcribe
)

# Launch only when run directly (HF Spaces executes this module as a script).
if __name__ == "__main__":
    demo.launch()