Spaces:

BinKhoaLe1812
/

WhisperAPI

Running on Zero

App Files Files Community

LiamKhoaLe committed on Oct 29, 2025

Commit

e75661e

1 Parent(s): 8c05d27

Rm FastAPI #2

Browse files

Files changed (1) hide show

app.py +1 -587

app.py CHANGED Viewed

@@ -154,8 +154,6 @@ yt_transcribe = gr.Interface(
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
-demo = demo.queue()
 # ---------------- Gemini setup (flash-lite only) -----------------
 GEMINI_API_KEYS = [
     os.getenv("GEMINI_API_1"),
@@ -234,588 +232,4 @@ def summarize_with_gemini(text: str) -> str:
             combined = getattr(r2, "text", "") or combined
     return combined
-# ------------------------- FastAPI wiring -----------------------
-app = FastAPI(title="Whisper API", description="API for Whisper + Gemini")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.post("/transcribe")
-async def api_transcribe(file: UploadFile = File(...)):
-    if file is None:
-        return {"error": "No file provided", "success": False}
-    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
-        content = await file.read()
-        tmp.write(content)
-        path = tmp.name
-    try:
-        text = pipe(path, batch_size=BATCH_SIZE, return_timestamps=True)["text"]
-        return {"text": text, "success": True}
-    finally:
-        if os.path.exists(path):
-            os.unlink(path)
-@app.post("/transcribe_and_summarize")
-async def api_transcribe_and_summarize(file: UploadFile = File(...)):
-    if file is None:
-        return {"error": "No file provided", "success": False}
-    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
-        content = await file.read()
-        tmp.write(content)
-        path = tmp.name
-    try:
-        text = pipe(path, batch_size=BATCH_SIZE, return_timestamps=True)["text"]
-        summary = summarize_with_gemini(text)
-        return {"text": text, "summary": summary, "success": True}
-    finally:
-        if os.path.exists(path):
-            os.unlink(path)
-@app.get("/health")
-async def health():
-    return {"status": "healthy"}
-app = gr.mount_gradio_app(app, demo, path="/")
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
-import torch
-import tempfile
-import os
-import random
-import google.generativeai as genai
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from fastapi import FastAPI, File, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
-import uvicorn
-from dotenv import load_dotenv
-# Load environment variables
-load_dotenv()
-# Initialize the model and processor globally
-model_id = "openai/whisper-large-v3-turbo"
-model = None
-processor = None
-pipe = None
-# Gemini API configuration
-GEMINI_API_KEYS = [
-    os.getenv("GEMINI_API_1"),
-    os.getenv("GEMINI_API_2"),
-    os.getenv("GEMINI_API_3"),
-    os.getenv("GEMINI_API_4"),
-    os.getenv("GEMINI_API_5")
-]
-# Filter out None values
-GEMINI_API_KEYS = [key for key in GEMINI_API_KEYS if key is not None]
-current_api_index = 0
-# Configure Gemini (use flash-lite only)
-if GEMINI_API_KEYS:
-    genai.configure(api_key=GEMINI_API_KEYS[0])
-    try:
-        gemini_model = genai.GenerativeModel('gemini-2.5-flash-lite')
-    except Exception:
-        # If the exact alias is unavailable in this SDK version, fall back to the smallest flash variant
-        gemini_model = genai.GenerativeModel('gemini-2.5-flash')
-else:
-    gemini_model = None
-def get_next_gemini_api():
-    """Round-robin rotation of Gemini API keys"""
-    global current_api_index
-    if not GEMINI_API_KEYS:
-        return None
-    api_key = GEMINI_API_KEYS[current_api_index]
-    current_api_index = (current_api_index + 1) % len(GEMINI_API_KEYS)
-    return api_key
-def _summarize_single(text: str):
-    """Summarize text using Gemini API with round-robin"""
-    if not text or not text.strip():
-        return {"error": "No text to summarize", "success": False}
-    if not GEMINI_API_KEYS or not gemini_model:
-        return {"error": "Gemini API not configured", "success": False}
-    try:
-        # Get next API key and configure
-        api_key = get_next_gemini_api()
-        genai.configure(api_key=api_key)
-        # System prompt for comprehensive summarization
-        system_prompt = """You are an expert content summarizer. Your task is to create a comprehensive summary that:
-1. PRESERVES all important details, key points, and main ideas
-2. REMOVES unnecessary small talk, filler words, and repetitive content
-3. MAINTAINS the original meaning and context
-4. ORGANIZES information logically
-5. KEEPS important conversations, decisions, and actionable items
-6. REMOVES only truly irrelevant details like "um", "uh", repeated phrases, or off-topic tangents
-Guidelines:
-- Keep all factual information, names, dates, numbers, and important statements
-- Preserve the structure and flow of important conversations
-- Remove only filler words, stutters, and truly irrelevant content
-- Maintain professional tone while being concise
-- If it's a meeting or conversation, preserve all decisions and action items
-- If it's educational content, preserve all key concepts and examples
-Create a well-structured summary that captures the essence while removing noise."""
-        # Create the prompt
-        prompt = f"{system_prompt}\n\nPlease summarize the following transcribed content:\n\n{text}"
-        # Generate summary
-        response = gemini_model.generate_content(prompt)
-        if response.text:
-            return {
-                "summary": response.text,
-                "success": True,
-                "original_length": len(text),
-                "summary_length": len(response.text)
-            }
-        else:
-            return {"error": "No summary generated", "success": False}
-    except Exception as e:
-        return {"error": f"Gemini API error: {str(e)}", "success": False}
-def summarize_with_gemini(text: str):
-    """Chunk long text and summarize in parallel using Gemini flash-lite only."""
-    if not text or not text.strip():
-        return {"error": "No text to summarize", "success": False}
-    # Determine token capacity conservatively; use SDK tokenizer if available
-    max_chunk_tokens = 6000  # conservative limit for flash-lite
-    def count_tokens(t: str) -> int:
-        try:
-            return genai.count_tokens(t).total_tokens  # type: ignore[attr-defined]
-        except Exception:
-            # Heuristic: ~4 chars per token
-            return max(1, len(t) // 4)
-    # If within limit, summarize directly
-    if count_tokens(text) <= max_chunk_tokens:
-        return _summarize_single(text)
-    # Otherwise chunk by paragraphs/sentences while respecting token budget
-    import re
-    segments = re.split(r"(\n\n+|\.\s+)", text)
-    chunks = []
-    current = []
-    current_tok = 0
-    for seg in segments:
-        seg_tok = count_tokens(seg)
-        if current_tok + seg_tok > max_chunk_tokens and current:
-            chunks.append("".join(current))
-            current = [seg]
-            current_tok = seg_tok
-        else:
-            current.append(seg)
-            current_tok += seg_tok
-    if current:
-        chunks.append("".join(current))
-    from concurrent.futures import ThreadPoolExecutor, as_completed
-    summaries = []
-    errors = []
-    with ThreadPoolExecutor(max_workers=min(5, len(chunks))) as ex:
-        futs = {ex.submit(_summarize_single, ch): i for i, ch in enumerate(chunks)}
-        for fut in as_completed(futs):
-            res = fut.result()
-            if res.get("success"):
-                summaries.append(res["summary"])
-            else:
-                errors.append(res.get("error"))
-    if not summaries:
-        return {"error": "; ".join(errors) if errors else "Summary failed", "success": False}
-    combined = "\n\n".join(summaries)
-    # Optional second pass to tighten the combined summary if long
-    if count_tokens(combined) > max_chunk_tokens:
-        second = _summarize_single(combined)
-        if second.get("success"):
-            combined = second["summary"]
-    return {
-        "summary": combined,
-        "success": True,
-        "original_length": len(text),
-        "summary_length": len(combined)
-    }
-@spaces.GPU
-def load_model():
-    """Load the Whisper model on GPU"""
-    global model, processor, pipe
-    device = "cuda:0" #if torch.cuda.is_available() else "cpu" # Enforce CUDA
-    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-    print(f"Loading model on device: {device}")
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id,
-        dtype=torch_dtype,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    )
-    model.to(device)
-    processor = AutoProcessor.from_pretrained(model_id)
-    pipe = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        tokenizer=processor.tokenizer,
-        feature_extractor=processor.feature_extractor,
-        dtype=torch_dtype,
-        device=device,
-    )
-    print("Model loaded successfully!")
-    return True
-@spaces.GPU
-def transcribe_audio(audio_file):
-    """Transcribe audio file using Whisper"""
-    global pipe
-    if pipe is None:
-        return {"error": "Model not loaded. Please wait and try again."}
-    try:
-        print(f"[transcribe_audio] input={type(audio_file)}")
-        # Handle different audio file formats
-        if isinstance(audio_file, str):
-            # File path
-            print(f"[transcribe_audio] filepath={audio_file}")
-            result = pipe(audio_file)
-        else:
-            # File object
-            print(f"[transcribe_audio] fileobj name={getattr(audio_file,'name',None)}")
-            result = pipe(audio_file.name)
-        print(f"[transcribe_audio] success, received keys={list(result.keys())}")
-        return {
-            "text": result["text"],
-            "success": True
-        }
-    except Exception as e:
-        print(f"[transcribe_audio] ERROR: {e}")
-        return {
-            "error": f"Transcription failed: {str(e)}",
-            "success": False
-        }
-# Lazy init helper to avoid ZeroGPU warning
-def initialize_model():
-    """Initialize Whisper pipeline on-demand within request/UI context."""
-    global pipe
-    if pipe is None:
-        load_model()
-@spaces.GPU
-def zero_gpu_probe():
-    """Minimal GPU-tagged function so ZeroGPU detects a GPU job at startup.
-    It does nothing and is bound to a Gradio load event.
-    """
-    return "ready"
-# GPU-bound handler for Gradio button; ensures ZeroGPU detects usage
-@spaces.GPU
-def handle_transcribe_gr(audio_file):
-    if audio_file is None:
-        return (
-            "❌ Please upload an audio file first.",
-            "",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False)
-        )
-    try:
-        # Ensure pipeline is initialized
-        initialize_model()
-        print(f"[gradio] received filepath={audio_file}")
-        res = transcribe_audio(audio_file)
-        if res.get("success"):
-            txt = res["text"]
-            return (
-                f"✅ Transcription completed! ({len(txt)} characters)",
-                txt,
-                gr.update(visible=True, value=txt),
-                gr.update(visible=True),   # show summarize button
-                gr.update(visible=False),  # hide summary output initially
-                gr.update(visible=True)    # show download button
-            )
-        return (
-            f"❌ Error: {res.get('error','Unknown error')}",
-            "",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False)
-        )
-    except Exception as e:
-        print(f"[gradio] handle_transcribe_gr ERROR: {e}")
-        return (
-            f"❌ Unexpected error: {str(e)}",
-            "",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False)
-        )
-# Create the enhanced Gradio interface
-def create_interface():
-    """Create an enhanced Gradio interface with proper layout"""
-    with gr.Blocks(
-        title="🎤 Whisper Large V3 Turbo",
-        theme=gr.themes.Soft(),
-        fill_width=True
-    ) as demo:
-        # Header
-        gr.Markdown("# 🎤 Whisper Large V3 Turbo")
-        gr.Markdown("*OpenAI's Fast Speech Recognition Model*")
-        # Main content area
-        with gr.Row():
-            # Left column - Upload area
-            with gr.Column(scale=2, min_width=400):
-                gr.Markdown("## 📁 Upload Audio")
-                audio_input = gr.Audio(
-                    sources=["upload", "microphone"],
-                    type="filepath",
-                    label="Audio File"
-                )
-                gr.Markdown("*Supports MP3, WAV, FLAC, M4A, MP4, AVI, MOV files*")
-                with gr.Row():
-                    transcribe_btn = gr.Button(
-                        "🚀 Transcribe Audio",
-                        variant="primary",
-                        scale=2
-                    )
-                    clear_btn = gr.Button(
-                        "🗑️ Clear",
-                        scale=1
-                    )
-                # Summary section
-                gr.Markdown("## 📝 AI Summary")
-                summarize_btn = gr.Button(
-                    "🤖 Summarize Content",
-                    variant="secondary",
-                    visible=False
-                )
-            # Right column - Instructions
-            with gr.Column(scale=1, min_width=300):
-                gr.Markdown("## 📋 Instructions")
-                gr.Markdown("""
-                1. **Upload** an audio/video file or **record** directly
-                2. Click **Transcribe Audio** to process
-                3. View results below
-                4. **Download** the transcription
-                ### ⚡ Features
-                - 🚀 **Fast**: 4x faster than standard Whisper
-                - 🌍 **Multilingual**: 99 languages supported
-                - 🎯 **Accurate**: State-of-the-art recognition
-                """)
-        # Status
-        status_text = gr.Textbox(
-            label="📊 Status",
-            value="Ready to transcribe! Upload an audio file and click 'Transcribe Audio'.",
-            interactive=False
-        )
-        # Results section
-        with gr.Row():
-            with gr.Column(scale=2):
-                transcription_output = gr.Textbox(
-                    label="📝 Transcription Result",
-                    lines=10,
-                    placeholder="Your transcription will appear here...",
-                    show_copy_button=True
-                )
-            with gr.Column(scale=2):
-                summary_output = gr.Textbox(
-                    label="🤖 AI Summary",
-                    lines=10,
-                    placeholder="AI summary will appear here...",
-                    show_copy_button=True,
-                    visible=False
-                )
-            with gr.Column(scale=1):
-                gr.Markdown("### 💾 Download")
-                download_btn = gr.DownloadButton(
-                    label="📥 Download TXT",
-                    visible=False
-                )
-                download_summary_btn = gr.DownloadButton(
-                    label="📥 Download Summary",
-                    visible=False
-                )
-                gr.Markdown("""
-                ### 📊 Model Info
-                - **Model**: Whisper Large V3 Turbo
-                - **Parameters**: 809M (optimized)
-                - **Speed**: ~4x faster
-                - **Languages**: 99 supported
-                - **GPU**: ZeroGPU powered
-                """)
-        # Event handlers (GPU-bound)
-        # (Handlers that run on GPU must be module-level functions to be detected by ZeroGPU)
-        def summarize_content(transcription_text):
-            """Summarize the transcription using Gemini"""
-            if not transcription_text or not transcription_text.strip():
-                return (
-                    "❌ No transcription to summarize.",
-                    gr.update(visible=False),
-                    gr.update(visible=False)
-                )
-            try:
-                result = summarize_with_gemini(transcription_text)
-                if result.get("success"):
-                    summary_text = result["summary"]
-                    original_len = result.get("original_length", 0)
-                    summary_len = result.get("summary_length", 0)
-                    return (
-                        f"✅ Summary completed! ({original_len} → {summary_len} characters)",
-                        summary_text,
-                        gr.update(visible=True),
-                        gr.update(visible=True)  # Show summary download button
-                    )
-                else:
-                    return (
-                        f"❌ Summary error: {result.get('error', 'Unknown error')}",
-                        "",
-                        gr.update(visible=False),
-                        gr.update(visible=False)
-                    )
-            except Exception as e:
-                return (
-                    f"❌ Unexpected error: {str(e)}",
-                    "",
-                    gr.update(visible=False),
-                    gr.update(visible=False)
-                )
-        def clear_all():
-            """Clear all inputs and outputs"""
-            return (
-                "Ready to transcribe! Upload an audio file and click 'Transcribe Audio'.",
-                None,
-                "",
-                gr.update(visible=False),
-                gr.update(visible=False),
-                gr.update(visible=False),
-                gr.update(visible=False)
-            )
-        def create_download_file(transcription_text):
-            """Create a downloadable text file"""
-            if not transcription_text:
-                return None
-            import tempfile
-            import os
-            # Create temporary file
-            temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
-            temp_file.write(transcription_text)
-            temp_file.close()
-            return temp_file.name
-        def create_summary_download_file(summary_text):
-            """Create a downloadable summary file"""
-            if not summary_text:
-                return None
-            import tempfile
-            import os
-            # Create temporary file
-            temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='_summary.txt', delete=False)
-            temp_file.write(summary_text)
-            temp_file.close()
-            return temp_file.name
-        # Connect events
-        transcribe_btn.click(
-            fn=handle_transcribe_gr,
-            inputs=[audio_input],
-            outputs=[status_text, transcription_output, summarize_btn, summary_output, download_btn],
-            show_progress=True
-        )
-        summarize_btn.click(
-            fn=summarize_content,
-            inputs=[transcription_output],
-            outputs=[status_text, summary_output, download_summary_btn],
-            show_progress=True
-        )
-        clear_btn.click(
-            fn=clear_all,
-            inputs=[],
-            outputs=[status_text, audio_input, transcription_output, summarize_btn, summary_output, download_btn, download_summary_btn]
-        )
-        download_btn.click(
-            fn=create_download_file,
-            inputs=[transcription_output],
-            outputs=[download_btn]
-        )
-        download_summary_btn.click(
-            fn=create_summary_download_file,
-            inputs=[summary_output],
-            outputs=[download_summary_btn]
-        )
-        # Bind a load-time call to a @spaces.GPU function so ZeroGPU detects it
-        # Use empty lists for inputs/outputs to satisfy older Gradio versions
-        demo.load(fn=zero_gpu_probe, inputs=[], outputs=[])
-    return demo
-# Create the enhanced interface and enable queuing (ZeroGPU-friendly)
-demo = create_interface()
-demo = demo.queue()
-if __name__ == "__main__":
-    demo.launch()

 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
 # ---------------- Gemini setup (flash-lite only) -----------------
 GEMINI_API_KEYS = [
     os.getenv("GEMINI_API_1"),
             combined = getattr(r2, "text", "") or combined
     return combined
+demo.queue().launch()