# NOTE: removed "Spaces: Sleeping" banner — Hugging Face Space status UI text
# captured by the scraper; it is not part of the application source.
| import os | |
| import json | |
| import gradio as gr | |
| import nemo.collections.asr as nemo_asr | |
| import spaces | |
| import librosa | |
| import numpy as np | |
| import torch | |
| from huggingface_hub import hf_hub_download | |
# 1. SETTINGS - Matches your public repo perfectly
REPO_ID = "Ali428/parakeet-weights"
FILENAME = "parakeet_final.nemo"

print("π Initializing FYP ASR Engine...")

try:
    # 2. DOWNLOAD THE FILE (No token needed for public repos)
    print(f"Downloading {FILENAME} from {REPO_ID}...")
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
    )

    # 3. LOAD THE NEMO MODEL
    print("Loading model into memory...")
    model = nemo_asr.models.ASRModel.restore_from(model_path)
    model.eval()  # inference mode (disables dropout etc.)
    print("β Model loaded successfully!")
except Exception as e:
    # Log the failure for the Space logs, then re-raise. Bare `raise` is the
    # idiomatic re-raise (the original used `raise e`); a startup failure must
    # crash the Space so the error is visible rather than serving a dead app.
    print(f"β ERROR DURING STARTUP: {e}")
    raise
# 4. TRANSCRIPTION FUNCTION
# Increased duration slightly for long videos
# NOTE(review): `import spaces` is never used — a `@spaces.GPU(duration=...)`
# decorator on transcribe() appears to have been lost (the comment above hints
# at a tuned `duration`). Confirm and restore it for ZeroGPU allocation.

# Parakeet TDT uses 8x encoder subsampling (4x conv + 2x additional),
# so each encoder frame = 8 * 10ms = 80ms of real audio time.
_FRAME_DURATION = 0.08
# Number of words grouped into one caption-style segment.
_WORDS_PER_SEGMENT = 15


def _log_hypothesis_debug(hypothesis):
    """Log timestep info to the Space logs to diagnose CPU vs GPU timestamp differences."""
    _ts = getattr(hypothesis, 'timestep', None)
    _len = len(_ts) if _ts is not None and hasattr(_ts, '__len__') else 'N/A'
    _sample = _ts[:5] if _ts is not None and hasattr(_ts, '__len__') and len(_ts) > 0 else _ts
    print(f"DEBUG has_timestep={hasattr(hypothesis, 'timestep')} | type={type(_ts)} | len={_len} | sample={_sample}")
    print(f"DEBUG has_words={hasattr(hypothesis, 'words')} | cuda={torch.cuda.is_available()}")
    if _ts is not None:
        print(f"DEBUG timestep value (raw)={str(_ts)[:200]}")


def _segments_from_timesteps(hypothesis, full_text):
    """Build word-aligned segments from the hypothesis' `timestep` attribute.

    Raises ValueError (or any extraction error) when timestep data is
    missing/empty/unusable so the caller can switch to the even-split fallback.
    Returns a list of {'text', 'start', 'end'} dicts (times in seconds).
    """
    timestep = getattr(hypothesis, 'timestep', None)
    words = getattr(hypothesis, 'words', None) or full_text.split()

    # NeMo may return timestep as a Tensor, list, dict, or None.
    if timestep is None:
        raise ValueError("timestep is None β will use fallback")
    # If it's a dict (e.g. {'timestep': [...], 'word': [...]})
    if isinstance(timestep, dict):
        timestep = timestep.get('timestep') or timestep.get('word') or list(timestep.values())[0]
    # Convert to a numpy array regardless of the container NeMo used.
    if isinstance(timestep, torch.Tensor):
        timestep_array = timestep.cpu().numpy()
    else:
        timestep_array = np.array(timestep, dtype=np.float32)
    if len(timestep_array) == 0:
        raise ValueError("timestep array is empty β will use fallback")
    # Guard: an empty transcript would make frames_per_word divide by zero
    # (previously this surfaced as a ZeroDivisionError caught by the caller).
    if not words:
        raise ValueError("no words to align β will use fallback")

    # Distribute encoder frames evenly over the recognized words.
    frames_per_word = len(timestep_array) / len(words)
    word_timestamps = []
    for i, word in enumerate(words):
        start_frame = min(int(i * frames_per_word), len(timestep_array) - 1)
        end_frame = min(int((i + 1) * frames_per_word), len(timestep_array))
        actual_start_frame = int(timestep_array[start_frame])
        if end_frame > start_frame:
            actual_end_frame = int(timestep_array[min(end_frame - 1, len(timestep_array) - 1)])
        else:
            actual_end_frame = actual_start_frame
        word_timestamps.append({
            'word': word,
            'start': round(actual_start_frame * _FRAME_DURATION, 3),
            'end': round(actual_end_frame * _FRAME_DURATION, 3),
        })

    # Group word timestamps into fixed-size segments.
    segments = []
    for i in range(0, len(word_timestamps), _WORDS_PER_SEGMENT):
        seg = word_timestamps[i:i + _WORDS_PER_SEGMENT]
        if seg:
            segments.append({
                'text': ' '.join(w['word'] for w in seg),
                'start': seg[0]['start'],
                'end': seg[-1]['end'],
            })
    return segments


def _fallback_segments(full_text, duration):
    """Evenly spread the transcript over the audio duration (no model timestamps)."""
    words = full_text.split()
    segment_duration = duration / max(1, len(words) / _WORDS_PER_SEGMENT)
    segments = []
    for i in range(0, len(words), _WORDS_PER_SEGMENT):
        segment_idx = i // _WORDS_PER_SEGMENT
        segments.append({
            'text': ' '.join(words[i:i + _WORDS_PER_SEGMENT]),
            'start': round(segment_idx * segment_duration, 3),
            'end': round((segment_idx + 1) * segment_duration, 3),
        })
    return segments


def transcribe(audio_path):
    """Transcribe an uploaded audio file and return a pretty-printed JSON string.

    Args:
        audio_path: filesystem path to the audio file (Gradio `type="filepath"`),
            or None when nothing was uploaded.

    Returns:
        JSON string. On success: success, text, segment_timestamps, model,
        device, audio_duration, word_count, debug_info. On failure:
        {"success": False, "error": ...}. Never raises — all errors are
        serialized into the JSON payload.
    """
    if audio_path is None:
        return json.dumps({"success": False, "error": "Please upload an audio file."})
    try:
        # 1. Get audio duration using librosa (resampled to the model's 16 kHz mono).
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        duration = len(audio) / sr

        # 2. Transcribe with return_hypotheses=True to get timestamps.
        transcription = model.transcribe([audio_path], return_hypotheses=True)
        # Extract the hypothesis object (NeMo sometimes nests it one level deeper).
        hypothesis = transcription[0][0] if isinstance(transcription[0], list) else transcription[0]
        full_text = hypothesis.text

        # DEBUG: remove once timestamps are confirmed working
        _log_hypothesis_debug(hypothesis)

        # 3. Process timestamps; fall back to an even split on any failure.
        try:
            segments = _segments_from_timesteps(hypothesis, full_text)
        except Exception as e:
            print(f"Warning: Timestamp extraction failed: {e}")
            segments = _fallback_segments(full_text, duration)

        # 4. Construct response identical to local backend
        result = {
            "success": True,
            "text": full_text,
            "segment_timestamps": segments,
            "model": "nvidia/parakeet-tdt-0.6b-v3",
            "device": "cuda" if torch.cuda.is_available() else "cpu",
            "audio_duration": round(duration, 2),
            "word_count": len(full_text.split()),
            # DEBUG: remove once timestamps are confirmed working
            "debug_info": {
                "has_timestep": hasattr(hypothesis, 'timestep'),
                "timestep_type": str(type(getattr(hypothesis, 'timestep', None))),
                "timestep_len": len(getattr(hypothesis, 'timestep', [])) if hasattr(hypothesis, 'timestep') else 0,
                "timestep_sample": str(getattr(hypothesis, 'timestep', None))[:100] if hasattr(hypothesis, 'timestep') else None,
                "has_words": hasattr(hypothesis, 'words'),
                "cuda_available": torch.cuda.is_available(),
                # Heuristic: very large end time suggests the fallback path ran.
                "segments_from_fallback": len(segments) > 0 and segments[-1]['end'] > 190,
            }
        }
        # Return as pretty JSON string for the Gradio UI
        return json.dumps(result, indent=2)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2)
# 5. GRADIO INTERFACE
# For API usage, we can skip the UI, but we'll leave it for debugging
# `demo` must stay at module level: Hugging Face Spaces imports this module
# and serves the app object directly.
demo = gr.Interface(
    fn=transcribe,
    # Gradio passes the uploaded file's path (str) to transcribe().
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Code(language="json", label="Transcription Result JSON"),  # Changed to Code/JSON for better formatting
    title="Ali's FYP: Parakeet ASR Engine",
    description="Running on NVIDIA A100 (ZeroGPU). API-ready endpoint.",
    api_name="transcribe"  # Allows api requests to /api/transcribe
)

# Launch the local server only when run as a script (Spaces serves `demo` itself).
if __name__ == "__main__":
    demo.launch()