Spaces:

prachi1507
/

AudioTranscriberTranslator15

Runtime error

App Files Files Community

AudioTranscriberTranslator15 / app.py

prachi1507

create app.py

3909dfe verified 7 months ago

raw

history blame contribute delete

16.7 kB

	import streamlit as st
	import whisper
	import tempfile
	import os
	import torch
	from datetime import datetime
	import warnings
	import gc

	# Silence all Python warnings for the process so library notices stay out of the app output.
	warnings.filterwarnings("ignore")

	# Page-level Streamlit settings: browser tab title, icon and a centered layout.
	st.set_page_config(page_title="Audio Transcriber & Translator", page_icon="🎵", layout="centered")

	# Stylesheet for the custom HTML fragments rendered by the app
	# (header banner, result/download sections, language badge, notice box).
	_PAGE_CSS = """
	<style>
	.main-header {
	text-align: center;
	padding: 2rem 0;
	background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
	color: white;
	border-radius: 10px;
	margin-bottom: 2rem;
	}
	.result-section {
	background: #f8f9fa;
	padding: 1.5rem;
	border-radius: 10px;
	margin: 1rem 0;
	border-left: 4px solid #667eea;
	}
	.download-section {
	background: #e8f5e8;
	padding: 1.5rem;
	border-radius: 10px;
	margin-top: 1.5rem;
	text-align: center;
	}
	.language-badge {
	background: #667eea;
	color: white;
	padding: 0.5rem 1rem;
	border-radius: 20px;
	font-weight: bold;
	display: inline-block;
	margin-bottom: 1rem;
	}
	.warning-box {
	background: #fff3cd;
	border: 1px solid #ffeaa7;
	padding: 1rem;
	border-radius: 8px;
	margin: 1rem 0;
	}
	</style>
	"""

	# Inject the stylesheet; raw HTML must be explicitly allowed for the <style> tag to take effect.
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	class M2M100Translator:
	    """Translate text into English with the ``facebook/m2m100_418M`` model.

	    The tokenizer and model are loaded lazily on the first translation so that
	    constructing the object is cheap. Long inputs are split into chunks that
	    fit the per-call token budget and translated piece by piece, so the tail
	    of a long transcript is no longer dropped by tokenizer truncation.
	    """

	    # Maximum number of tokens handed to the tokenizer per call (kept small for CPU hosts).
	    MAX_INPUT_TOKENS = 200
	    # Tokens held back from the budget for the markers the tokenizer adds around the text
	    # and for small differences between per-sentence and joined token counts.
	    _SPECIAL_TOKEN_RESERVE = 8

	    def __init__(self):
	        """Record the model name and language table; no model is loaded yet."""
	        self.model_name = "facebook/m2m100_418M"
	        self.tokenizer = None
	        self.model = None

	        # M2M100 language codes
	        self.supported_languages = {
	            'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
	            'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
	            'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
	            'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
	            'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
	            'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
	            'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
	            'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
	            'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
	            'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
	            'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
	            'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
	            'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
	            'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
	            'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
	        }

	    def load_model(self):
	        """Load the tokenizer and model on first use.

	        Returns:
	            True when the model is available (already loaded or loaded now),
	            False when loading failed; the failure is reported in the UI.
	        """
	        if self.model is None:
	            try:
	                # Imported lazily so the app can still transcribe when transformers is unusable.
	                from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

	                with st.spinner("🔄 Loading M2M100 translation model..."):
	                    # Load tokenizer and model - simplified for HF Spaces
	                    self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
	                    self.model = M2M100ForConditionalGeneration.from_pretrained(
	                        self.model_name,
	                        torch_dtype=torch.float32  # Use float32 for CPU compatibility
	                    )

	                st.success("✅ Translation model loaded successfully!")

	            except Exception as e:
	                st.error(f"❌ Failed to load translation model: {str(e)}")
	                st.info("💡 Translation will be skipped. You can still get transcripts.")
	                return False
	        return True

	    def get_language_name(self, lang_code):
	        """Return the display name for ``lang_code``, or the upper-cased code if unknown."""
	        return self.supported_languages.get(lang_code, lang_code.upper())

	    @staticmethod
	    def _split_into_chunks(text, max_size, measure=len):
	        """Split ``text`` into ordered chunks whose measured size stays within ``max_size``.

	        Args:
	            text: The text to split.
	            max_size: Size budget per chunk, in whatever unit ``measure`` returns.
	            measure: Callable mapping a string to its size (characters by default).

	        Returns:
	            A list of chunks. Text that already fits is returned unchanged as a
	            single chunk; blank text yields an empty list. Otherwise sentences
	            are packed greedily; an oversized sentence falls back to words, and
	            an oversized word is sliced by characters.
	        """
	        import re

	        if not text or not text.strip():
	            return []
	        max_size = max(1, max_size)
	        if measure(text) <= max_size:
	            return [text]

	        # Break after sentence punctuation: Latin/Indic/Arabic marks need trailing
	        # whitespace (so "3.14" stays intact); CJK full-width marks need none.
	        sentences = [
	            part.strip()
	            for part in re.split(r"(?<=[.!?…।؟])\s+|(?<=[。！？])\s*", text.strip())
	            if part and part.strip()
	        ]

	        pieces = []
	        for sentence in sentences:
	            if measure(sentence) <= max_size:
	                pieces.append(sentence)
	                continue
	            for word in sentence.split():
	                word_size = measure(word)
	                if word_size <= max_size:
	                    pieces.append(word)
	                else:
	                    # Estimate how many characters fit the budget and slice the word.
	                    step = max(1, len(word) * max_size // word_size)
	                    pieces.extend(word[i:i + step] for i in range(0, len(word), step))

	        chunks, current, current_size = [], [], 0
	        for piece in pieces:
	            size = measure(piece)
	            # One extra unit accounts for the joining space.
	            extra = size + (1 if current else 0)
	            if current and current_size + extra > max_size:
	                chunks.append(" ".join(current))
	                current, current_size, extra = [], 0, size
	            current.append(piece)
	            current_size += extra
	        if current:
	            chunks.append(" ".join(current))
	        return chunks

	    def translate_text(self, text, source_language):
	        """Translate ``text`` from ``source_language`` into English.

	        Args:
	            text: Text to translate.
	            source_language: Language code of ``text`` (a key of ``supported_languages``).

	        Returns:
	            A dict with ``"success"``. On success it also carries
	            ``"original_text"``, ``"translated_text"`` and ``"source_language"``
	            (plus ``"note"`` for English input or ``"model_used"`` otherwise).
	            On failure it carries ``"error"`` and, except for empty input,
	            ``"original_text"`` and ``"source_language"``.
	        """
	        if not text or not text.strip():
	            return {"success": False, "error": "Empty text provided"}

	        # If already English, return as is
	        if source_language == 'en':
	            return {
	                "success": True,
	                "original_text": text,
	                "translated_text": text,
	                "source_language": source_language,
	                "note": "Source is already English"
	            }

	        # Check if source language is supported
	        if source_language not in self.supported_languages:
	            return {
	                "success": False,
	                "error": f"Language '{source_language}' not supported",
	                "original_text": text,
	                "source_language": source_language
	            }

	        if not self.load_model():
	            return {
	                "success": False,
	                "error": "Translation model not available",
	                "original_text": text,
	                "source_language": source_language
	            }

	        try:
	            # Set source language
	            self.tokenizer.src_lang = source_language
	            english_id = self.tokenizer.get_lang_id("en")

	            # Split by token count so no chunk is cut by the truncation below;
	            # truncation stays enabled only as a safety net.
	            chunks = self._split_into_chunks(
	                text,
	                self.MAX_INPUT_TOKENS - self._SPECIAL_TOKEN_RESERVE,
	                measure=lambda piece: len(self.tokenizer.tokenize(piece)),
	            )

	            translated_parts = []
	            for chunk in chunks:
	                inputs = self.tokenizer(
	                    chunk,
	                    return_tensors="pt",
	                    padding=True,
	                    truncation=True,
	                    max_length=self.MAX_INPUT_TOKENS
	                )

	                # Generate translation
	                with torch.no_grad():
	                    generated_tokens = self.model.generate(
	                        **inputs,
	                        forced_bos_token_id=english_id,
	                        max_length=250,
	                        num_beams=2,  # Reduced beams for speed
	                        early_stopping=True,
	                        do_sample=False
	                    )

	                # Decode translation
	                translated_parts.append(
	                    self.tokenizer.batch_decode(
	                        generated_tokens,
	                        skip_special_tokens=True
	                    )[0].strip()
	                )

	                # Drop per-chunk tensors before the next chunk is processed
	                del inputs, generated_tokens

	            gc.collect()

	            return {
	                "success": True,
	                "original_text": text,
	                "translated_text": " ".join(part for part in translated_parts if part),
	                "source_language": source_language,
	                "model_used": self.model_name
	            }

	        except Exception as e:
	            return {
	                "success": False,
	                "error": str(e),
	                "original_text": text,
	                "source_language": source_language
	            }

	@st.cache_resource
	def _load_whisper_model_cached():
	    """Load the Whisper ``tiny`` model; errors propagate so a failed load is never cached."""
	    # Use tiny model for faster loading and processing on HF Spaces
	    return whisper.load_model("tiny")


	def load_whisper_model():
	    """Return the cached Whisper ``tiny`` model, or ``None`` if it cannot be loaded.

	    Error handling lives outside the cached function on purpose: catching the
	    exception inside it would make the ``None`` result the cached value, and
	    every later call would keep failing until the cache was cleared manually.

	    Returns:
	        The loaded model, or ``None`` after reporting the failure in the UI.
	    """
	    try:
	        return _load_whisper_model_cached()
	    except Exception as e:
	        st.error(f"Failed to load Whisper model: {e}")
	        return None


	# Preserve the ``load_whisper_model.clear()`` handle the cache decorator used to provide.
	load_whisper_model.clear = _load_whisper_model_cached.clear

	@st.cache_resource
	def load_translator():
	    """Build the M2M100Translator instance; the result is cached via ``st.cache_resource``."""
	    translator = M2M100Translator()
	    return translator

	def transcribe_audio(audio_file):
	    """Transcribe an uploaded audio file with the cached Whisper model.

	    Args:
	        audio_file: File-like upload. ``getvalue()`` is used when available so
	            the stream position left by an earlier read does not matter;
	            otherwise ``read()`` is used. A ``name`` attribute, if present,
	            supplies the temporary file's extension.

	    Returns:
	        ``{"success": True, "transcript": str, "language": str}`` on success,
	        ``{"success": False, "error": str}`` otherwise.
	    """
	    tmp_file_path = None
	    try:
	        model = load_whisper_model()
	        if model is None:
	            return {"success": False, "error": "Whisper model not available"}

	        if hasattr(audio_file, "getvalue"):
	            audio_bytes = audio_file.getvalue()
	        else:
	            audio_bytes = audio_file.read()

	        # Keep the upload's own extension; fall back to .wav when there is none.
	        suffix = os.path.splitext(getattr(audio_file, "name", "") or "")[1] or ".wav"

	        # delete=False because the model is given a path, not the open handle;
	        # the file is removed in the finally block below.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
	            tmp_file_path = tmp_file.name
	            tmp_file.write(audio_bytes)

	        # Transcribe with optimized settings for HF Spaces
	        result = model.transcribe(
	            tmp_file_path,
	            fp16=False,  # Use fp32 for better compatibility
	            task="transcribe"
	        )

	        return {
	            "success": True,
	            "transcript": result["text"].strip(),
	            "language": result["language"]
	        }

	    except Exception as e:
	        return {"success": False, "error": str(e)}

	    finally:
	        # Runs on every exit path, so the temporary file is never left behind.
	        if tmp_file_path is not None:
	            try:
	                os.unlink(tmp_file_path)
	            except OSError:
	                # Best-effort cleanup: a missing/locked file must not mask the real result.
	                pass
	        gc.collect()

	def _render_header():
	    """Render the page banner and the Hugging Face Spaces limits notice."""
	    st.markdown("""
	<div class="main-header">
	<h1>🎵 Audio Transcriber & Translator</h1>
	<p>Upload audio files and get transcripts with English translation</p>
	<small>Optimized for Hugging Face Spaces</small>
	</div>
	""", unsafe_allow_html=True)

	    # HF Spaces notice
	    st.markdown("""
	<div class="warning-box">
	<strong>🚀 Hugging Face Spaces Version</strong><br>
	• Using Whisper-tiny for faster processing<br>
	• File limit: 10MB, Duration: 5 minutes<br>
	• Processing may take 1-2 minutes
	</div>
	""", unsafe_allow_html=True)


	def _render_sidebar():
	    """Render the static system/model information and usage tips in the sidebar."""
	    with st.sidebar:
	        st.header("🔧 System Info")
	        st.info("Running on Hugging Face Spaces")
	        st.info(f"PyTorch: {torch.__version__}")
	        st.warning("Using CPU (optimized for HF Spaces)")

	        st.header("🌍 Models")
	        st.info("• Whisper: tiny (fast)")
	        st.info("• Translation: M2M100-418M")

	        with st.expander("💡 Tips"):
	            st.caption("• Use shorter audio files (< 5 min)")
	            st.caption("• MP3/WAV work best")
	            st.caption("• Clear speech gives better results")
	            st.caption("• Processing takes 1-2 minutes")


	def _render_footer():
	    """Render the divider and the attribution footer."""
	    st.markdown("---")
	    st.markdown("""
	<div style="text-align: center; color: #666; padding: 1rem;">
	<p>🎵 Powered by OpenAI Whisper & Facebook M2M100</p>
	<p>Running on Hugging Face Spaces 🤗</p>
	</div>
	""", unsafe_allow_html=True)


	def _render_transcript(transcript, language_name, detected_language):
	    """Show the transcription heading, the detected-language badge and the transcript text."""
	    st.markdown("""
	<div class="result-section">
	<h3>📝 Transcription Results</h3>
	</div>
	""", unsafe_allow_html=True)

	    # Language badge
	    st.markdown(f"""
	<div class="language-badge">
	🌍 Detected: {language_name} ({detected_language})
	</div>
	""", unsafe_allow_html=True)

	    st.text_area(
	        "Original Transcript",
	        transcript,
	        height=150,
	        key="transcript"
	    )


	def _render_translation_success(uploaded_file, file_size_mb, start_time, transcript,
	                                translated_text, language_name, detected_language):
	    """Show the English translation and offer the combined transcript + translation download."""
	    st.markdown("""
	<div class="result-section">
	<h3>🌍 English Translation</h3>
	</div>
	""", unsafe_allow_html=True)

	    st.text_area(
	        "English Translation",
	        translated_text,
	        height=150,
	        key="translation"
	    )

	    # Download section
	    st.markdown("""
	<div class="download-section">
	<h4>📥 Download Results</h4>
	</div>
	""", unsafe_allow_html=True)

	    # Prepare download content
	    full_content = f"""Audio Transcription & Translation
	{'='*60}
	File: {uploaded_file.name}
	Size: {file_size_mb:.2f} MB
	Detected Language: {language_name} ({detected_language})
	Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
	Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
	{'='*60}

	ORIGINAL TRANSCRIPT ({language_name}):
	{transcript}

	ENGLISH TRANSLATION:
	{translated_text}

	{'='*60}
	Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces
	"""

	    st.download_button(
	        "📄 Download Complete Results",
	        full_content,
	        file_name=f"{os.path.splitext(uploaded_file.name)[0]}_results.txt",
	        mime="text/plain",
	        use_container_width=True
	    )


	def _render_translation_failure(uploaded_file, error, transcript,
	                                language_name, detected_language):
	    """Report a failed translation and still offer the transcript for download."""
	    st.error(f"❌ Translation failed: {error}")
	    # Still offer transcript download
	    transcript_content = f"""Audio Transcription
	{'='*50}
	File: {uploaded_file.name}
	Language: {language_name} ({detected_language})
	Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
	{'='*50}

	{transcript}
	"""
	    st.download_button(
	        "📄 Download Transcript",
	        transcript_content,
	        file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
	        mime="text/plain"
	    )


	def _render_transcript_download(uploaded_file, start_time, transcript,
	                                language_name, detected_language):
	    """Offer the transcript-only report (no translation requested or needed)."""
	    transcript_content = f"""Audio Transcription
	{'='*50}
	File: {uploaded_file.name}
	Language: {language_name} ({detected_language})
	Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
	Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
	{'='*50}

	{transcript}
	"""
	    st.download_button(
	        "📄 Download Transcript",
	        transcript_content,
	        file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
	        mime="text/plain",
	        use_container_width=True
	    )


	def _process_audio(uploaded_file, file_size_mb, transcribe_only):
	    """Transcribe the upload, optionally translate it to English, and render the results."""
	    start_time = datetime.now()

	    # Step 1: Transcription
	    with st.spinner("🎤 Transcribing audio... (this may take 1-2 minutes)"):
	        transcription_result = transcribe_audio(uploaded_file)

	    if not transcription_result["success"]:
	        st.error(f"❌ Transcription failed: {transcription_result['error']}")
	        st.info("💡 Try with a different audio file or format")
	        return

	    transcript = transcription_result["transcript"]
	    detected_language = transcription_result["language"]

	    # The translator also owns the language-code -> display-name table.
	    translator = load_translator()
	    language_name = translator.get_language_name(detected_language)

	    _render_transcript(transcript, language_name, detected_language)

	    # Step 2: Translation (skipped when not requested or when the audio is already English)
	    if transcribe_only or detected_language == 'en':
	        _render_transcript_download(
	            uploaded_file, start_time, transcript, language_name, detected_language
	        )
	    else:
	        with st.spinner("🌍 Translating to English..."):
	            translation_result = translator.translate_text(transcript, detected_language)

	        if translation_result["success"]:
	            _render_translation_success(
	                uploaded_file, file_size_mb, start_time, transcript,
	                translation_result["translated_text"], language_name, detected_language
	            )
	        else:
	            _render_translation_failure(
	                uploaded_file, translation_result['error'], transcript,
	                language_name, detected_language
	            )

	    # Show processing time
	    processing_time = (datetime.now() - start_time).total_seconds()
	    st.success(f"✅ Processing completed in {processing_time:.1f} seconds")


	def _handle_upload(uploaded_file):
	    """Validate the uploaded file's size, show the options row and run processing on request."""
	    file_size_mb = uploaded_file.size / (1024 * 1024)

	    if file_size_mb > 10:
	        st.error("❌ File too large! Please use files under 10MB for optimal performance on HF Spaces.")
	        return

	    st.success(f"📁 {uploaded_file.name} ({file_size_mb:.2f} MB)")

	    # Processing options
	    col1, col2 = st.columns(2)
	    with col1:
	        transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
	    with col2:
	        if st.button("🧹 Clear Cache", help="Clear models from memory"):
	            st.cache_resource.clear()
	            st.success("Cache cleared!")

	    # Process button
	    if st.button("🚀 Process Audio", type="primary", use_container_width=True):
	        _process_audio(uploaded_file, file_size_mb, transcribe_only)


	def main():
	    """Render the app: header, sidebar, upload/processing flow and footer.

	    The upload flow lives in a helper so that its early exit for oversized
	    files no longer skips the footer.
	    """
	    _render_header()
	    _render_sidebar()

	    # File uploader with restrictions for HF Spaces
	    uploaded_file = st.file_uploader(
	        "🎵 Choose an audio file",
	        type=['mp3', 'wav', 'mp4', 'm4a'],
	        # Plain "|": the previous "\|" was an invalid string escape sequence.
	        help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
	    )

	    if uploaded_file is not None:
	        _handle_upload(uploaded_file)

	    _render_footer()


	if __name__ == "__main__":
	    main()