Speach-To-Text / scripts /transcribe.py
MIP-Tech's picture
Deploy to HF Spaces
0db822c
"""
Transcribe one or more audio files using a fine-tuned (or base) Whisper model.
Usage:
# Use the fine-tuned model
python scripts/transcribe.py audio.mp3
# Transcribe multiple files
python scripts/transcribe.py file1.mp3 file2.wav
# Use a different model (HF model ID or local path)
python scripts/transcribe.py --model openai/whisper-large-v3 audio.mp3
python scripts/transcribe.py --model outputs/checkpoints/best_model audio.mp3
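# Save output to a file
python scripts/transcribe.py audio.mp3 --output result.txt
# Diarize conversation turns with Pyannote (needs --hf-token or HF_TOKEN in .env)
python scripts/transcribe.py audio.mp3 --diarize
# Post-process the transcript with Gemini (needs --api-key or GEMINI_API_KEY)
python scripts/transcribe.py audio.mp3 --analyze
"""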
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
import os
from dotenv import load_dotenv
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.inference.transcribe import WhisperTranscriber
# Root logger setup: INFO and above, one line per record with a wall-clock
# time (HH:MM:SS) and a level name left-aligned to eight characters.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by main() for progress and error messages.
logger = logging.getLogger(__name__)

# Location of the fine-tuned checkpoint; main() joins it onto the project
# root (two directories above this script) to build the --model default.
DEFAULT_MODEL = "outputs/checkpoints/best_model"
def main() -> None:
    """Command-line entry point: transcribe audio files, optionally analyze them.

    Loads ``.env`` from the project root (two directories above this script),
    parses the command line, builds a ``WhisperTranscriber`` and transcribes
    each audio file in the order given. With ``--diarize`` the transcriber's
    ``transcribe_with_diarization`` is used instead of ``transcribe``. With
    ``--analyze`` every transcript is additionally passed to ``CallAnalyzer``.
    Each result is printed to stdout and, with ``--output``, all results are
    also written to a single UTF-8 text file.

    Model selection: when ``--model`` is left at its default and that
    checkpoint does not exist on disk, ``openai/whisper-large-v3`` is used
    instead. An explicitly supplied ``--model`` is always passed to the
    transcriber unchanged, so any HF model ID works and a mistyped local
    path surfaces as a loading error rather than a silent model switch.

    Exits with status 1 when ``--diarize`` is requested without a HuggingFace
    token, or when ``--analyze`` is requested and ``CallAnalyzer`` cannot be
    initialized. A failure while analyzing a single transcript is logged and
    that file is reported without an analysis section.
    """
    root = Path(__file__).parent.parent
    # Load .env before building the parser: the --hf-token default below
    # reads HF_TOKEN from the environment at parser-construction time.
    load_dotenv(root / ".env")
    default_model_path = str(root / DEFAULT_MODEL)

    parser = argparse.ArgumentParser(description="Transcribe Arabic audio with Whisper")
    parser.add_argument("audio", nargs="+", help="Audio file(s) to transcribe")
    parser.add_argument(
        "--model",
        default=default_model_path,
        help=f"Model path or HF model ID (default: {DEFAULT_MODEL})",
    )
    parser.add_argument("--output", default=None, help="Write transcription to this file")
    parser.add_argument("--device", default=None, help="cuda or cpu (auto-detect if omitted)")
    parser.add_argument("--diarize", action="store_true", help="Use Pyannote to diarize conversation turns")
    parser.add_argument("--hf-token", default=os.environ.get("HF_TOKEN"), help="HuggingFace token for Pyannote")
    parser.add_argument("--analyze", action="store_true", help="Post-process transcript with Gemini 2.5 Flash")
    parser.add_argument("--api-key", default=None, help="Gemini API Key (or set GEMINI_API_KEY env var)")
    args = parser.parse_args()

    # Check the diarization precondition up front so a missing token fails
    # immediately instead of after the model has been loaded.
    if args.diarize and not args.hf_token:
        logger.error("--hf-token or HF_TOKEN in .env is required for diarization")
        sys.exit(1)

    model_path = args.model
    # Fall back to the base model only when the *default* fine-tuned
    # checkpoint is missing. An explicit --model that is not a local path is
    # treated as an HF model ID and passed through unchanged.
    if model_path == default_model_path and not Path(model_path).exists():
        logger.warning(
            "Fine-tuned model not found at %s — falling back to openai/whisper-large-v3",
            model_path,
        )
        model_path = "openai/whisper-large-v3"

    transcriber = WhisperTranscriber(model_path=model_path, device=args.device)

    analyzer = None
    if args.analyze:
        try:
            # Imported here so the analysis module is only loaded when
            # --analyze is actually requested.
            from src.inference.analyze_call import CallAnalyzer

            analyzer = CallAnalyzer(api_key=args.api_key)
            logger.info("CallAnalyzer initialized with Gemini 2.5 Flash.")
        except Exception as e:
            logger.error("Failed to initialize CallAnalyzer: %s", e)
            sys.exit(1)

    # One (audio path, transcript, analysis JSON or None) tuple per input.
    results = []
    for audio_path in args.audio:
        logger.info("Transcribing %s ...", audio_path)
        if args.diarize:
            text = transcriber.transcribe_with_diarization(audio_path, args.hf_token)
        else:
            text = transcriber.transcribe(audio_path)

        analysis_result = None
        if analyzer:
            logger.info("Analyzing transcript for %s ...", audio_path)
            try:
                analysis = analyzer.analyze(text)
                analysis_result = analysis.model_dump_json(indent=2)
            except Exception as e:
                # Best effort: keep the transcript even if analysis fails.
                logger.error("Failed to analyze transcript: %s", e)

        results.append((audio_path, text, analysis_result))
        print(f"\n=== {Path(audio_path).name} ===")
        print(f"Raw Transcript:\n{text}")
        if analysis_result:
            print(f"\nAnalysis:\n{analysis_result}")

    if args.output:
        out_path = Path(args.output)
        # Create missing parent directories so finished transcriptions are
        # not lost to a FileNotFoundError at the very last step.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as fh:
            for src_path, transcript, analysis_json in results:
                fh.write(f"=== {Path(src_path).name} ===\nRaw Transcript:\n{transcript}\n\n")
                if analysis_json:
                    fh.write(f"Analysis:\n{analysis_json}\n\n")
        logger.info("Output written to %s", out_path)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()