File size: 5,708 Bytes
7ee2bc7
 
 
 
 
 
 
 
 
 
df8ba4d
 
 
7ee2bc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df8ba4d
7ee2bc7
 
 
 
 
 
 
 
df8ba4d
7ee2bc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df8ba4d
7ee2bc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df8ba4d
 
7ee2bc7
df8ba4d
7ee2bc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df8ba4d
7ee2bc7
df8ba4d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Transcriptinator - HuggingFace Spaces Gradio Interface
Audio transcription with Gemini + OpenRouter
"""

import gradio as gr
import os
from transcribe_core import process_audio_file, get_audio_duration
from ai_providers import GeminiProvider, OpenRouterProvider

# Anchor every path to this file's own directory (not the process working
# directory) for Hugging Face Spaces compatibility.
CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
OUTPUT_FOLDER = os.path.join(CURRENT_DIR, "outputs")

# Shortest key (after trimming whitespace) that is treated as "provided".
_MIN_API_KEY_LENGTH = 10


def _clean_api_key(raw_key):
    """Return *raw_key* without surrounding whitespace, or "" if it is missing or too short."""
    key = (raw_key or "").strip()
    return key if len(key) >= _MIN_API_KEY_LENGTH else ""


def transcribe_audio(audio_file, gemini_key, openrouter_key, model_name):
    """
    Main transcription function for the Gradio interface.

    Validates the inputs, builds the AI providers, runs `process_audio_file`
    and formats a Markdown status message for the UI.

    Args:
        audio_file: Path of the uploaded audio file (falsy when nothing was uploaded).
        gemini_key: Gemini API key; required.
        openrouter_key: OpenRouter API key; optional. When absent or too short,
            no OpenRouter provider is created and `None` is passed instead.
        model_name: Model identifier forwarded to `GeminiProvider` and echoed
            in the status message.

    Returns:
        A `(status_markdown, output_path)` tuple. `output_path` is `None` when
        validation fails or any exception is raised during processing.
    """
    if not audio_file:
        return "❌ Please upload an audio file.", None

    # Keys are trimmed once and the trimmed value is what gets sent to the
    # providers, so whitespace picked up while pasting cannot break the request.
    gemini_api_key = _clean_api_key(gemini_key)
    if not gemini_api_key:
        return "❌ Please provide a valid Gemini API key.", None

    openrouter_api_key = _clean_api_key(openrouter_key)

    try:
        # Gemini handles the transcription itself.
        gemini_provider = GeminiProvider(gemini_api_key, model_name)

        # OpenRouter is optional and only used for summary/ideas.
        openrouter_provider = (
            OpenRouterProvider(openrouter_api_key) if openrouter_api_key else None
        )

        # Figures shown in the status message.
        # NOTE(review): assumes get_audio_duration returns seconds - confirm.
        duration = get_audio_duration(audio_file)
        duration_min = duration / 60
        file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)

        # Progress updates are not surfaced in the UI, hence the no-op callback.
        output_path, is_zip = process_audio_file(
            audio_file,
            gemini_provider,
            openrouter_provider,
            progress_callback=lambda msg, progress: None
        )

        # NOTE(review): the flag is compared through str() exactly as before;
        # simplify to `if is_zip:` once process_audio_file's return type is confirmed.
        if str(is_zip) == "True":
            file_type = "ZIP archive"
            file_desc = "Multiple transcript files (chunked audio)"
        else:
            file_type = "Markdown file"
            file_desc = "Single transcript file"

        text_provider = "OpenRouter (DeepSeek R1)" if openrouter_provider else "Gemini"

        success_msg = f"""✅ **Transcription Complete!**

📁 Original file: {os.path.basename(audio_file)}
⏱️ Duration: {duration_min:.1f} minutes
💾 Size: {file_size_mb:.1f} MB
🎙️ Transcription: Gemini ({model_name})
💡 Summary/Ideas: {text_provider}
📄 Output: {file_type}

{file_desc}

Click below to download your transcript(s)."""

        # The path is handed to the gr.File output, which offers it for download.
        return success_msg, output_path

    except Exception as e:
        # UI boundary: report the failure in the status panel instead of raising.
        error_msg = f"""❌ **Error during transcription:**

{str(e)}

**Common issues:**
- Invalid API key
- Audio file too large or corrupted
- Network connection issues"""
        return error_msg, None


# Build the Gradio interface: inputs on the left, status and download on the right.
with gr.Blocks(title="Transcriptinator", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🎙️ Transcriptinator
    ### AI-Powered Audio Transcription
    
    **Powered by:** Gemini (transcription) + OpenRouter DeepSeek R1 (summarization)
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            # Audio upload; the handler receives the uploaded file as a path.
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload"],
            )
            
            gr.Markdown("""
            **Supported formats:** MP3, WAV, M4A, OGG, FLAC, WEBM  
            **Large files (>30MB):** Automatically chunked and processed
            """)
            
            # Model selection, populated from the models GeminiProvider advertises.
            model_dropdown = gr.Dropdown(
                choices=list(GeminiProvider.AVAILABLE_MODELS.keys()),
                value="Gemini 2.5 Flash",
                label="Gemini Model",
                info="Select which Gemini model to use for transcription"
            )
            
            # API keys (masked input fields).
            gemini_key_input = gr.Textbox(
                label="Gemini API Key (Required)",
                placeholder="Enter your Gemini API key...",
                type="password",
                info="Get one free at: https://aistudio.google.com/app/apikey"
            )
            
            openrouter_key_input = gr.Textbox(
                label="OpenRouter API Key (Optional)",
                placeholder="Enter your OpenRouter key for better summaries...",
                type="password",
                info="Leave empty to use Gemini for all tasks | Get free at: https://openrouter.ai"
            )
            
            # Submit button
            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
        
        with gr.Column(scale=1):
            # Status output (Markdown message returned by transcribe_audio)
            status_output = gr.Markdown(label="Status")
            
            # Download component for the generated transcript file
            download_output = gr.File(label="📥 Download Transcript")
    
    # Information section describing the generated output
    gr.Markdown("""
    ---
    ### 🎯 What you'll get:
    - 📝 **Full transcription** with timestamps and speaker detection
    - 📊 **Summary** in 2-3 sentences
    - 💡 **Key ideas** with descriptions
    - 📄 **Markdown file** ready to download
    """)
    
    # Connect the transcription function: the input order must match the
    # parameter order of transcribe_audio, the outputs its returned tuple.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, gemini_key_input, openrouter_key_input, model_dropdown],
        outputs=[status_output, download_output]
    )

# Script entry point: create the output directory if needed, then start the
# app with queuing enabled and OUTPUT_FOLDER passed as an allowed path.
if __name__ == "__main__":
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    queued_app = app.queue()
    queued_app.launch(allowed_paths=[OUTPUT_FOLDER])