LiamKhoaLe committed on
Commit
e75661e
·
1 Parent(s): 8c05d27

Rm FastAPI #2

Browse files
Files changed (1) hide show
  1. app.py +1 -587
app.py CHANGED
@@ -154,8 +154,6 @@ yt_transcribe = gr.Interface(
154
  with demo:
155
  gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
156
 
157
- demo = demo.queue()
158
-
159
  # ---------------- Gemini setup (flash-lite only) -----------------
160
  GEMINI_API_KEYS = [
161
  os.getenv("GEMINI_API_1"),
@@ -234,588 +232,4 @@ def summarize_with_gemini(text: str) -> str:
234
  combined = getattr(r2, "text", "") or combined
235
  return combined
236
 
237
- # ------------------------- FastAPI wiring -----------------------
238
-
239
- app = FastAPI(title="Whisper API", description="API for Whisper + Gemini")
240
- app.add_middleware(
241
- CORSMiddleware,
242
- allow_origins=["*"],
243
- allow_credentials=True,
244
- allow_methods=["*"],
245
- allow_headers=["*"],
246
- )
247
-
248
- @app.post("/transcribe")
249
- async def api_transcribe(file: UploadFile = File(...)):
250
- if file is None:
251
- return {"error": "No file provided", "success": False}
252
- with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
253
- content = await file.read()
254
- tmp.write(content)
255
- path = tmp.name
256
- try:
257
- text = pipe(path, batch_size=BATCH_SIZE, return_timestamps=True)["text"]
258
- return {"text": text, "success": True}
259
- finally:
260
- if os.path.exists(path):
261
- os.unlink(path)
262
-
263
- @app.post("/transcribe_and_summarize")
264
- async def api_transcribe_and_summarize(file: UploadFile = File(...)):
265
- if file is None:
266
- return {"error": "No file provided", "success": False}
267
- with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
268
- content = await file.read()
269
- tmp.write(content)
270
- path = tmp.name
271
- try:
272
- text = pipe(path, batch_size=BATCH_SIZE, return_timestamps=True)["text"]
273
- summary = summarize_with_gemini(text)
274
- return {"text": text, "summary": summary, "success": True}
275
- finally:
276
- if os.path.exists(path):
277
- os.unlink(path)
278
-
279
- @app.get("/health")
280
- async def health():
281
- return {"status": "healthy"}
282
-
283
- app = gr.mount_gradio_app(app, demo, path="/")
284
-
285
- if __name__ == "__main__":
286
- uvicorn.run(app, host="0.0.0.0", port=7860)
287
- import torch
288
- import tempfile
289
- import os
290
- import random
291
- import google.generativeai as genai
292
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
293
- from fastapi import FastAPI, File, UploadFile
294
- from fastapi.middleware.cors import CORSMiddleware
295
- import uvicorn
296
- from dotenv import load_dotenv
297
-
298
- # Load environment variables
299
- load_dotenv()
300
-
301
- # Initialize the model and processor globally
302
- model_id = "openai/whisper-large-v3-turbo"
303
- model = None
304
- processor = None
305
- pipe = None
306
-
307
- # Gemini API configuration
308
- GEMINI_API_KEYS = [
309
- os.getenv("GEMINI_API_1"),
310
- os.getenv("GEMINI_API_2"),
311
- os.getenv("GEMINI_API_3"),
312
- os.getenv("GEMINI_API_4"),
313
- os.getenv("GEMINI_API_5")
314
- ]
315
-
316
- # Filter out None values
317
- GEMINI_API_KEYS = [key for key in GEMINI_API_KEYS if key is not None]
318
- current_api_index = 0
319
-
320
- # Configure Gemini (use flash-lite only)
321
- if GEMINI_API_KEYS:
322
- genai.configure(api_key=GEMINI_API_KEYS[0])
323
- try:
324
- gemini_model = genai.GenerativeModel('gemini-2.5-flash-lite')
325
- except Exception:
326
- # If the exact alias is unavailable in this SDK version, fall back to the smallest flash variant
327
- gemini_model = genai.GenerativeModel('gemini-2.5-flash')
328
- else:
329
- gemini_model = None
330
-
331
- def get_next_gemini_api():
332
- """Round-robin rotation of Gemini API keys"""
333
- global current_api_index
334
- if not GEMINI_API_KEYS:
335
- return None
336
-
337
- api_key = GEMINI_API_KEYS[current_api_index]
338
- current_api_index = (current_api_index + 1) % len(GEMINI_API_KEYS)
339
- return api_key
340
-
341
- def _summarize_single(text: str):
342
- """Summarize text using Gemini API with round-robin"""
343
- if not text or not text.strip():
344
- return {"error": "No text to summarize", "success": False}
345
-
346
- if not GEMINI_API_KEYS or not gemini_model:
347
- return {"error": "Gemini API not configured", "success": False}
348
-
349
- try:
350
- # Get next API key and configure
351
- api_key = get_next_gemini_api()
352
- genai.configure(api_key=api_key)
353
-
354
- # System prompt for comprehensive summarization
355
- system_prompt = """You are an expert content summarizer. Your task is to create a comprehensive summary that:
356
-
357
- 1. PRESERVES all important details, key points, and main ideas
358
- 2. REMOVES unnecessary small talk, filler words, and repetitive content
359
- 3. MAINTAINS the original meaning and context
360
- 4. ORGANIZES information logically
361
- 5. KEEPS important conversations, decisions, and actionable items
362
- 6. REMOVES only truly irrelevant details like "um", "uh", repeated phrases, or off-topic tangents
363
-
364
- Guidelines:
365
- - Keep all factual information, names, dates, numbers, and important statements
366
- - Preserve the structure and flow of important conversations
367
- - Remove only filler words, stutters, and truly irrelevant content
368
- - Maintain professional tone while being concise
369
- - If it's a meeting or conversation, preserve all decisions and action items
370
- - If it's educational content, preserve all key concepts and examples
371
-
372
- Create a well-structured summary that captures the essence while removing noise."""
373
-
374
- # Create the prompt
375
- prompt = f"{system_prompt}\n\nPlease summarize the following transcribed content:\n\n{text}"
376
-
377
- # Generate summary
378
- response = gemini_model.generate_content(prompt)
379
-
380
- if response.text:
381
- return {
382
- "summary": response.text,
383
- "success": True,
384
- "original_length": len(text),
385
- "summary_length": len(response.text)
386
- }
387
- else:
388
- return {"error": "No summary generated", "success": False}
389
-
390
- except Exception as e:
391
- return {"error": f"Gemini API error: {str(e)}", "success": False}
392
-
393
-
394
- def summarize_with_gemini(text: str):
395
- """Chunk long text and summarize in parallel using Gemini flash-lite only."""
396
- if not text or not text.strip():
397
- return {"error": "No text to summarize", "success": False}
398
-
399
- # Determine token capacity conservatively; use SDK tokenizer if available
400
- max_chunk_tokens = 6000 # conservative limit for flash-lite
401
-
402
- def count_tokens(t: str) -> int:
403
- try:
404
- return genai.count_tokens(t).total_tokens # type: ignore[attr-defined]
405
- except Exception:
406
- # Heuristic: ~4 chars per token
407
- return max(1, len(t) // 4)
408
-
409
- # If within limit, summarize directly
410
- if count_tokens(text) <= max_chunk_tokens:
411
- return _summarize_single(text)
412
-
413
- # Otherwise chunk by paragraphs/sentences while respecting token budget
414
- import re
415
- segments = re.split(r"(\n\n+|\.\s+)", text)
416
- chunks = []
417
- current = []
418
- current_tok = 0
419
- for seg in segments:
420
- seg_tok = count_tokens(seg)
421
- if current_tok + seg_tok > max_chunk_tokens and current:
422
- chunks.append("".join(current))
423
- current = [seg]
424
- current_tok = seg_tok
425
- else:
426
- current.append(seg)
427
- current_tok += seg_tok
428
- if current:
429
- chunks.append("".join(current))
430
-
431
- from concurrent.futures import ThreadPoolExecutor, as_completed
432
- summaries = []
433
- errors = []
434
- with ThreadPoolExecutor(max_workers=min(5, len(chunks))) as ex:
435
- futs = {ex.submit(_summarize_single, ch): i for i, ch in enumerate(chunks)}
436
- for fut in as_completed(futs):
437
- res = fut.result()
438
- if res.get("success"):
439
- summaries.append(res["summary"])
440
- else:
441
- errors.append(res.get("error"))
442
-
443
- if not summaries:
444
- return {"error": "; ".join(errors) if errors else "Summary failed", "success": False}
445
-
446
- combined = "\n\n".join(summaries)
447
- # Optional second pass to tighten the combined summary if long
448
- if count_tokens(combined) > max_chunk_tokens:
449
- second = _summarize_single(combined)
450
- if second.get("success"):
451
- combined = second["summary"]
452
-
453
- return {
454
- "summary": combined,
455
- "success": True,
456
- "original_length": len(text),
457
- "summary_length": len(combined)
458
- }
459
-
460
- @spaces.GPU
461
- def load_model():
462
- """Load the Whisper model on GPU"""
463
- global model, processor, pipe
464
-
465
- device = "cuda:0" #if torch.cuda.is_available() else "cpu" # Enforce CUDA
466
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
467
-
468
- print(f"Loading model on device: {device}")
469
-
470
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
471
- model_id,
472
- dtype=torch_dtype,
473
- low_cpu_mem_usage=True,
474
- use_safetensors=True
475
- )
476
- model.to(device)
477
-
478
- processor = AutoProcessor.from_pretrained(model_id)
479
-
480
- pipe = pipeline(
481
- "automatic-speech-recognition",
482
- model=model,
483
- tokenizer=processor.tokenizer,
484
- feature_extractor=processor.feature_extractor,
485
- dtype=torch_dtype,
486
- device=device,
487
- )
488
-
489
- print("Model loaded successfully!")
490
- return True
491
-
492
- @spaces.GPU
493
- def transcribe_audio(audio_file):
494
- """Transcribe audio file using Whisper"""
495
- global pipe
496
-
497
- if pipe is None:
498
- return {"error": "Model not loaded. Please wait and try again."}
499
-
500
- try:
501
- print(f"[transcribe_audio] input={type(audio_file)}")
502
- # Handle different audio file formats
503
- if isinstance(audio_file, str):
504
- # File path
505
- print(f"[transcribe_audio] filepath={audio_file}")
506
- result = pipe(audio_file)
507
- else:
508
- # File object
509
- print(f"[transcribe_audio] fileobj name={getattr(audio_file,'name',None)}")
510
- result = pipe(audio_file.name)
511
-
512
- print(f"[transcribe_audio] success, received keys={list(result.keys())}")
513
- return {
514
- "text": result["text"],
515
- "success": True
516
- }
517
- except Exception as e:
518
- print(f"[transcribe_audio] ERROR: {e}")
519
- return {
520
- "error": f"Transcription failed: {str(e)}",
521
- "success": False
522
- }
523
-
524
- # Lazy init helper to avoid ZeroGPU warning
525
- def initialize_model():
526
- """Initialize Whisper pipeline on-demand within request/UI context."""
527
- global pipe
528
- if pipe is None:
529
- load_model()
530
-
531
-
532
- @spaces.GPU
533
- def zero_gpu_probe():
534
- """Minimal GPU-tagged function so ZeroGPU detects a GPU job at startup.
535
- It does nothing and is bound to a Gradio load event.
536
- """
537
- return "ready"
538
-
539
- # GPU-bound handler for Gradio button; ensures ZeroGPU detects usage
540
- @spaces.GPU
541
- def handle_transcribe_gr(audio_file):
542
- if audio_file is None:
543
- return (
544
- "❌ Please upload an audio file first.",
545
- "",
546
- gr.update(visible=False),
547
- gr.update(visible=False),
548
- gr.update(visible=False),
549
- gr.update(visible=False)
550
- )
551
- try:
552
- # Ensure pipeline is initialized
553
- initialize_model()
554
- print(f"[gradio] received filepath={audio_file}")
555
- res = transcribe_audio(audio_file)
556
- if res.get("success"):
557
- txt = res["text"]
558
- return (
559
- f"✅ Transcription completed! ({len(txt)} characters)",
560
- txt,
561
- gr.update(visible=True, value=txt),
562
- gr.update(visible=True), # show summarize button
563
- gr.update(visible=False), # hide summary output initially
564
- gr.update(visible=True) # show download button
565
- )
566
- return (
567
- f"❌ Error: {res.get('error','Unknown error')}",
568
- "",
569
- gr.update(visible=False),
570
- gr.update(visible=False),
571
- gr.update(visible=False),
572
- gr.update(visible=False)
573
- )
574
- except Exception as e:
575
- print(f"[gradio] handle_transcribe_gr ERROR: {e}")
576
- return (
577
- f"❌ Unexpected error: {str(e)}",
578
- "",
579
- gr.update(visible=False),
580
- gr.update(visible=False),
581
- gr.update(visible=False),
582
- gr.update(visible=False)
583
- )
584
-
585
- # Create the enhanced Gradio interface
586
- def create_interface():
587
- """Create an enhanced Gradio interface with proper layout"""
588
-
589
- with gr.Blocks(
590
- title="🎤 Whisper Large V3 Turbo",
591
- theme=gr.themes.Soft(),
592
- fill_width=True
593
- ) as demo:
594
-
595
- # Header
596
- gr.Markdown("# 🎤 Whisper Large V3 Turbo")
597
- gr.Markdown("*OpenAI's Fast Speech Recognition Model*")
598
-
599
- # Main content area
600
- with gr.Row():
601
- # Left column - Upload area
602
- with gr.Column(scale=2, min_width=400):
603
- gr.Markdown("## 📁 Upload Audio")
604
-
605
- audio_input = gr.Audio(
606
- sources=["upload", "microphone"],
607
- type="filepath",
608
- label="Audio File"
609
- )
610
-
611
- gr.Markdown("*Supports MP3, WAV, FLAC, M4A, MP4, AVI, MOV files*")
612
-
613
- with gr.Row():
614
- transcribe_btn = gr.Button(
615
- "🚀 Transcribe Audio",
616
- variant="primary",
617
- scale=2
618
- )
619
- clear_btn = gr.Button(
620
- "🗑️ Clear",
621
- scale=1
622
- )
623
-
624
- # Summary section
625
- gr.Markdown("## 📝 AI Summary")
626
- summarize_btn = gr.Button(
627
- "🤖 Summarize Content",
628
- variant="secondary",
629
- visible=False
630
- )
631
-
632
- # Right column - Instructions
633
- with gr.Column(scale=1, min_width=300):
634
- gr.Markdown("## 📋 Instructions")
635
- gr.Markdown("""
636
- 1. **Upload** an audio/video file or **record** directly
637
- 2. Click **Transcribe Audio** to process
638
- 3. View results below
639
- 4. **Download** the transcription
640
-
641
- ### ⚡ Features
642
- - 🚀 **Fast**: 4x faster than standard Whisper
643
- - 🌍 **Multilingual**: 99 languages supported
644
- - 🎯 **Accurate**: State-of-the-art recognition
645
- """)
646
-
647
- # Status
648
- status_text = gr.Textbox(
649
- label="📊 Status",
650
- value="Ready to transcribe! Upload an audio file and click 'Transcribe Audio'.",
651
- interactive=False
652
- )
653
-
654
- # Results section
655
- with gr.Row():
656
- with gr.Column(scale=2):
657
- transcription_output = gr.Textbox(
658
- label="📝 Transcription Result",
659
- lines=10,
660
- placeholder="Your transcription will appear here...",
661
- show_copy_button=True
662
- )
663
-
664
- with gr.Column(scale=2):
665
- summary_output = gr.Textbox(
666
- label="🤖 AI Summary",
667
- lines=10,
668
- placeholder="AI summary will appear here...",
669
- show_copy_button=True,
670
- visible=False
671
- )
672
-
673
- with gr.Column(scale=1):
674
- gr.Markdown("### 💾 Download")
675
- download_btn = gr.DownloadButton(
676
- label="📥 Download TXT",
677
- visible=False
678
- )
679
- download_summary_btn = gr.DownloadButton(
680
- label="📥 Download Summary",
681
- visible=False
682
- )
683
-
684
- gr.Markdown("""
685
- ### 📊 Model Info
686
- - **Model**: Whisper Large V3 Turbo
687
- - **Parameters**: 809M (optimized)
688
- - **Speed**: ~4x faster
689
- - **Languages**: 99 supported
690
- - **GPU**: ZeroGPU powered
691
- """)
692
-
693
- # Event handlers (GPU-bound)
694
- # (Handlers that run on GPU must be module-level functions to be detected by ZeroGPU)
695
-
696
- def summarize_content(transcription_text):
697
- """Summarize the transcription using Gemini"""
698
- if not transcription_text or not transcription_text.strip():
699
- return (
700
- "❌ No transcription to summarize.",
701
- gr.update(visible=False),
702
- gr.update(visible=False)
703
- )
704
-
705
- try:
706
- result = summarize_with_gemini(transcription_text)
707
-
708
- if result.get("success"):
709
- summary_text = result["summary"]
710
- original_len = result.get("original_length", 0)
711
- summary_len = result.get("summary_length", 0)
712
-
713
- return (
714
- f"✅ Summary completed! ({original_len} → {summary_len} characters)",
715
- summary_text,
716
- gr.update(visible=True),
717
- gr.update(visible=True) # Show summary download button
718
- )
719
- else:
720
- return (
721
- f"❌ Summary error: {result.get('error', 'Unknown error')}",
722
- "",
723
- gr.update(visible=False),
724
- gr.update(visible=False)
725
- )
726
-
727
- except Exception as e:
728
- return (
729
- f"❌ Unexpected error: {str(e)}",
730
- "",
731
- gr.update(visible=False),
732
- gr.update(visible=False)
733
- )
734
-
735
- def clear_all():
736
- """Clear all inputs and outputs"""
737
- return (
738
- "Ready to transcribe! Upload an audio file and click 'Transcribe Audio'.",
739
- None,
740
- "",
741
- gr.update(visible=False),
742
- gr.update(visible=False),
743
- gr.update(visible=False),
744
- gr.update(visible=False)
745
- )
746
-
747
- def create_download_file(transcription_text):
748
- """Create a downloadable text file"""
749
- if not transcription_text:
750
- return None
751
-
752
- import tempfile
753
- import os
754
-
755
- # Create temporary file
756
- temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
757
- temp_file.write(transcription_text)
758
- temp_file.close()
759
-
760
- return temp_file.name
761
-
762
- def create_summary_download_file(summary_text):
763
- """Create a downloadable summary file"""
764
- if not summary_text:
765
- return None
766
-
767
- import tempfile
768
- import os
769
-
770
- # Create temporary file
771
- temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='_summary.txt', delete=False)
772
- temp_file.write(summary_text)
773
- temp_file.close()
774
-
775
- return temp_file.name
776
-
777
- # Connect events
778
- transcribe_btn.click(
779
- fn=handle_transcribe_gr,
780
- inputs=[audio_input],
781
- outputs=[status_text, transcription_output, summarize_btn, summary_output, download_btn],
782
- show_progress=True
783
- )
784
-
785
- summarize_btn.click(
786
- fn=summarize_content,
787
- inputs=[transcription_output],
788
- outputs=[status_text, summary_output, download_summary_btn],
789
- show_progress=True
790
- )
791
-
792
- clear_btn.click(
793
- fn=clear_all,
794
- inputs=[],
795
- outputs=[status_text, audio_input, transcription_output, summarize_btn, summary_output, download_btn, download_summary_btn]
796
- )
797
-
798
- download_btn.click(
799
- fn=create_download_file,
800
- inputs=[transcription_output],
801
- outputs=[download_btn]
802
- )
803
-
804
- download_summary_btn.click(
805
- fn=create_summary_download_file,
806
- inputs=[summary_output],
807
- outputs=[download_summary_btn]
808
- )
809
-
810
- # Bind a load-time call to a @spaces.GPU function so ZeroGPU detects it
811
- # Use empty lists for inputs/outputs to satisfy older Gradio versions
812
- demo.load(fn=zero_gpu_probe, inputs=[], outputs=[])
813
-
814
- return demo
815
-
816
- # Create the enhanced interface and enable queuing (ZeroGPU-friendly)
817
- demo = create_interface()
818
- demo = demo.queue()
819
-
820
- if __name__ == "__main__":
821
- demo.launch()
 
154
  with demo:
155
  gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
156
 
 
 
157
  # ---------------- Gemini setup (flash-lite only) -----------------
158
  GEMINI_API_KEYS = [
159
  os.getenv("GEMINI_API_1"),
 
232
  combined = getattr(r2, "text", "") or combined
233
  return combined
234
 
235
+ demo.queue().launch()