Spaces:
Sleeping
Sleeping
| """ | |
Transcribe one or more audio files using a fine-tuned (or base) Whisper model.

Usage:
    # Use the fine-tuned model (default checkpoint)
    python scripts/transcribe.py audio.mp3

    # Transcribe multiple files
    python scripts/transcribe.py file1.mp3 file2.wav

    # Use a different model (HF model ID or local path)
    python scripts/transcribe.py --model openai/whisper-large-v3 audio.mp3
    python scripts/transcribe.py --model outputs/checkpoints/best_model audio.mp3

    # Save output to a file
    python scripts/transcribe.py audio.mp3 --output result.txt

    # Diarize conversation turns (needs --hf-token or HF_TOKEN in .env)
    python scripts/transcribe.py audio.mp3 --diarize

    # Post-process the transcript with Gemini (needs --api-key or GEMINI_API_KEY)
    python scripts/transcribe.py audio.mp3 --analyze
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| import os | |
| from dotenv import load_dotenv | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.inference.transcribe import WhisperTranscriber | |
# Console logging for CLI runs: time-of-day stamp, padded level name, message.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Location of the fine-tuned checkpoint; main() resolves it against the
# directory two levels above this script.
DEFAULT_MODEL = "outputs/checkpoints/best_model"
# Base model substituted when the default fine-tuned checkpoint is absent.
_FALLBACK_MODEL = "openai/whisper-large-v3"


def _build_parser(default_model_path: str) -> argparse.ArgumentParser:
    """Create the command-line parser; ``default_model_path`` is the --model default."""
    parser = argparse.ArgumentParser(description="Transcribe Arabic audio with Whisper")
    parser.add_argument("audio", nargs="+", help="Audio file(s) to transcribe")
    parser.add_argument(
        "--model",
        default=default_model_path,
        help=f"Model path or HF model ID (default: {DEFAULT_MODEL})",
    )
    parser.add_argument("--output", default=None, help="Write transcription to this file")
    parser.add_argument("--device", default=None, help="cuda or cpu (auto-detect if omitted)")
    parser.add_argument("--diarize", action="store_true", help="Use Pyannote to diarize conversation turns")
    # The default is read when the parser is built, so any .env file must
    # already have been loaded by the caller.
    parser.add_argument("--hf-token", default=os.environ.get("HF_TOKEN"), help="HuggingFace token for Pyannote")
    parser.add_argument("--analyze", action="store_true", help="Post-process transcript with Gemini 2.5 Flash")
    parser.add_argument("--api-key", default=None, help="Gemini API Key (or set GEMINI_API_KEY env var)")
    return parser


def _resolve_model_path(requested: str, default_model_path: str) -> str:
    """Return the model to load, using the base model if the default checkpoint is missing.

    Only the default checkpoint path triggers the fallback. An explicitly
    requested value is returned unchanged even when it is not an existing
    local path, so Hub model IDs from any organisation reach the transcriber
    instead of being replaced.
    """
    if requested == default_model_path and not Path(requested).exists():
        logger.warning(
            "Fine-tuned model not found at %s — falling back to openai/whisper-large-v3",
            requested,
        )
        return _FALLBACK_MODEL
    return requested


def _write_results(out_path: Path, results: list[tuple[str, object, str | None]]) -> None:
    """Write every (audio path, transcript, analysis) triple to ``out_path`` as UTF-8 text."""
    # Create missing parent directories so finished transcripts are not lost
    # to a FileNotFoundError at the very last step.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as fh:
        for path, text, analysis in results:
            fh.write(f"=== {Path(path).name} ===\nRaw Transcript:\n{text}\n\n")
            if analysis:
                fh.write(f"Analysis:\n{analysis}\n\n")
    logger.info("Output written to %s", out_path)


def main(argv: list[str] | None = None) -> None:
    """Transcribe the audio files named on the command line.

    Each transcript (and, with --analyze, its JSON analysis) is printed to
    stdout; with --output, all results are also written to that file once
    every input has been processed.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: With status 1 when --diarize is given without a
            HuggingFace token, or when the call analyzer cannot be
            initialised. argparse also exits on invalid arguments.
    """
    root = Path(__file__).parent.parent
    # Must run before the parser is built: the --hf-token default reads
    # HF_TOKEN from the environment.
    load_dotenv(root / ".env")

    default_model_path = str(root / DEFAULT_MODEL)
    args = _build_parser(default_model_path).parse_args(argv)

    # Pure argument validation, done before anything else is constructed so
    # the error is reported immediately rather than after the transcriber
    # (which presumably loads model weights) has been built.
    if args.diarize and not args.hf_token:
        logger.error("--hf-token or HF_TOKEN in .env is required for diarization")
        sys.exit(1)

    model_path = _resolve_model_path(args.model, default_model_path)
    transcriber = WhisperTranscriber(model_path=model_path, device=args.device)

    analyzer = None
    if args.analyze:
        try:
            # Imported lazily so the analyzer module is only needed when
            # --analyze is requested.
            from src.inference.analyze_call import CallAnalyzer

            analyzer = CallAnalyzer(api_key=args.api_key)
            logger.info("CallAnalyzer initialized with Gemini 2.5 Flash.")
        except Exception as e:
            logger.error("Failed to initialize CallAnalyzer: %s", e)
            sys.exit(1)

    results = []
    for audio_path in args.audio:
        logger.info("Transcribing %s ...", audio_path)
        if args.diarize:
            text = transcriber.transcribe_with_diarization(audio_path, args.hf_token)
        else:
            text = transcriber.transcribe(audio_path)

        analysis_result = None
        if analyzer is not None:
            logger.info("Analyzing transcript for %s ...", audio_path)
            try:
                analysis = analyzer.analyze(text)
                analysis_result = analysis.model_dump_json(indent=2)
            except Exception as e:
                # Best effort: a failed analysis must not discard the transcript.
                logger.error("Failed to analyze transcript: %s", e)

        results.append((audio_path, text, analysis_result))
        print(f"\n=== {Path(audio_path).name} ===")
        print(f"Raw Transcript:\n{text}")
        if analysis_result:
            print(f"\nAnalysis:\n{analysis_result}")

    if args.output:
        _write_results(Path(args.output), results)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()