# Hugging Face Spaces page header (not part of the program):
# "emmajeed's picture" / commit "Update app.py" / df8ba4d (verified)
"""
Transcriptinator - HuggingFace Spaces Gradio Interface
Audio transcription with Gemini + OpenRouter
"""
import gradio as gr
import os
from transcribe_core import process_audio_file, get_audio_duration
from ai_providers import GeminiProvider, OpenRouterProvider
# Anchor every path to this file's own location (rather than the process's
# working directory) for Hugging Face Spaces compatibility.
CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
# Folder that is created and passed to launch(allowed_paths=...) at startup.
OUTPUT_FOLDER = os.path.join(CURRENT_DIR, "outputs")
def transcribe_audio(audio_file, gemini_key, openrouter_key, model_name):
    """
    Transcribe an uploaded audio file for the Gradio interface.

    Args:
        audio_file: Path to the uploaded audio file; falsy when nothing was
            uploaded.
        gemini_key: Gemini API key (required). Surrounding whitespace is
            ignored.
        openrouter_key: OpenRouter API key (optional). When missing or too
            short, no OpenRouter provider is created and ``None`` is passed
            to ``process_audio_file`` in its place.
        model_name: Gemini model name handed to ``GeminiProvider``.

    Returns:
        A ``(status_markdown, output_path)`` tuple. ``output_path`` is the
        path returned by ``process_audio_file`` on success, and ``None`` when
        input validation fails or any exception is raised while processing.
    """
    # Cheap sanity threshold shared by both keys; real validation happens
    # when the provider talks to its API.
    min_key_length = 10

    if not audio_file:
        return "❌ Please upload an audio file.", None

    # Strip once, up front, so the value that passes validation is exactly
    # the value sent to the provider (pasted keys often carry a trailing
    # space or newline).
    gemini_key = (gemini_key or "").strip()
    openrouter_key = (openrouter_key or "").strip()

    if len(gemini_key) < min_key_length:
        return "❌ Please provide a valid Gemini API key.", None

    try:
        # Gemini handles the transcription itself
        gemini_provider = GeminiProvider(gemini_key, model_name)

        # OpenRouter is optional and only used for summary/ideas
        openrouter_provider = None
        if len(openrouter_key) >= min_key_length:
            openrouter_provider = OpenRouterProvider(openrouter_key)

        # Figures shown in the success message
        duration_min = get_audio_duration(audio_file) / 60
        file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)

        output_path, is_zip = process_audio_file(
            audio_file,
            gemini_provider,
            openrouter_provider,
            progress_callback=lambda msg, progress: None,  # progress is not surfaced
        )

        # NOTE(review): compared through str(), so both the bool True and the
        # string "True" count as a ZIP result. Confirm what
        # process_audio_file actually returns before simplifying this.
        if str(is_zip) == "True":
            file_type = "ZIP archive"
            file_desc = "Multiple transcript files (chunked audio)"
        else:
            file_type = "Markdown file"
            file_desc = "Single transcript file"

        text_provider = "OpenRouter (DeepSeek R1)" if openrouter_provider else "Gemini"

        # Emoji were previously stored mis-encoded (mojibake); restored here.
        success_msg = "\n".join([
            "✅ **Transcription Complete!**",
            f"📁 Original file: {os.path.basename(audio_file)}",
            f"⏱️ Duration: {duration_min:.1f} minutes",
            f"💾 Size: {file_size_mb:.1f} MB",
            f"🎙️ Transcription: Gemini ({model_name})",
            f"💡 Summary/Ideas: {text_provider}",
            f"📄 Output: {file_type}",
            f"{file_desc}",
            "Click below to download your transcript(s).",
        ])

        # The path is returned as-is; the download component serves it
        return success_msg, output_path
    except Exception as e:
        # UI boundary: report the failure in the status panel instead of
        # letting the exception escape the click handler.
        error_msg = "\n".join([
            "❌ **Error during transcription:**",
            f"{str(e)}",
            "**Common issues:**",
            "- Invalid API key",
            "- Audio file too large or corrupted",
            "- Network connection issues",
        ])
        return error_msg, None
# Build the Gradio interface. Components appear on the page in the order they
# are created, so statement order inside each container is significant.
# Emoji in the user-facing strings were previously stored mis-encoded
# (mojibake) and are restored here; all other text is unchanged.
with gr.Blocks(title="Transcriptinator", theme=gr.themes.Soft()) as app:
    # Page header
    gr.Markdown(
        "\n"
        "# 🎙️ Transcriptinator\n"
        "### AI-Powered Audio Transcription\n"
        "**Powered by:** Gemini (transcription) + OpenRouter DeepSeek R1 (summarization)\n"
    )

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=2):
            # Audio upload; type="filepath" hands the handler a path string
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload"],
            )
            gr.Markdown(
                "\n"
                "**Supported formats:** MP3, WAV, M4A, OGG, FLAC, WEBM\n"
                "**Large files (>30MB):** Automatically chunked and processed\n"
            )

            # Model selection, populated from the provider's model table
            model_dropdown = gr.Dropdown(
                choices=list(GeminiProvider.AVAILABLE_MODELS.keys()),
                value="Gemini 2.5 Flash",
                label="Gemini Model",
                info="Select which Gemini model to use for transcription",
            )

            # API keys (masked input)
            gemini_key_input = gr.Textbox(
                label="Gemini API Key (Required)",
                placeholder="Enter your Gemini API key...",
                type="password",
                info="Get one free at: https://aistudio.google.com/app/apikey",
            )
            openrouter_key_input = gr.Textbox(
                label="OpenRouter API Key (Optional)",
                placeholder="Enter your OpenRouter key for better summaries...",
                type="password",
                info="Leave empty to use Gemini for all tasks | Get free at: https://openrouter.ai",
            )

            # Submit button
            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")

        # Right column: results
        with gr.Column(scale=1):
            # Status message produced by the handler (Markdown)
            status_output = gr.Markdown(label="Status")
            # Download slot for the generated transcript; `interactive` is
            # deliberately left unset.
            download_output = gr.File(label="📥 Download Transcript")

    # Information section
    gr.Markdown(
        "\n"
        "---\n"
        "### 🎯 What you'll get:\n"
        "- 📝 **Full transcription** with timestamps and speaker detection\n"
        "- 📊 **Summary** in 2-3 sentences\n"
        "- 💡 **Key ideas** with descriptions\n"
        "- 📄 **Markdown file** ready to download\n"
    )

    # Wire the button to the handler; the input order must match the
    # parameter order of transcribe_audio, and the two outputs receive its
    # (status, file path) return tuple.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, gemini_key_input, openrouter_key_input, model_dropdown],
        outputs=[status_output, download_output],
    )
# Script entry point: enable request queuing and let Gradio serve files from
# the output folder.
if __name__ == "__main__":
    # Create the output directory up front; a no-op when it already exists.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    queued_app = app.queue()
    queued_app.launch(allowed_paths=[OUTPUT_FOLDER])