Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python | |
| """ | |
| Annotate one audio file with Qwen2-Audio and save a sidecar JSON. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| from pathlib import Path | |
| from qwen_audio_captioning import ( | |
| DEFAULT_ANALYSIS_PROMPT, | |
| DEFAULT_LONG_ANALYSIS_PROMPT, | |
| DEFAULT_MODEL_ID, | |
| build_captioner, | |
| generate_track_annotation, | |
| read_prompt_file, | |
| ) | |
| def read_dotenv_value(path: str, key: str) -> str: | |
| p = Path(path) | |
| if not p.exists(): | |
| return "" | |
| for raw in p.read_text(encoding="utf-8").splitlines(): | |
| line = raw.strip() | |
| if not line or line.startswith("#") or "=" not in line: | |
| continue | |
| k, v = line.split("=", 1) | |
| if k.strip() == key: | |
| return v.strip().strip('"').strip("'") | |
| return "" | |
| def main() -> int: | |
| parser = argparse.ArgumentParser(description="Annotate a single audio file with Qwen2-Audio") | |
| parser.add_argument("--audio", required=True, help="Audio file path") | |
| parser.add_argument("--backend", default="hf_endpoint", choices=["local", "hf_endpoint"]) | |
| parser.add_argument("--model-id", default=DEFAULT_MODEL_ID) | |
| parser.add_argument("--endpoint-url", default=os.getenv("HF_QWEN_ENDPOINT_URL", "")) | |
| parser.add_argument("--token", default="") | |
| parser.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu", "mps"]) | |
| parser.add_argument("--torch-dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"]) | |
| parser.add_argument("--prompt", default=DEFAULT_ANALYSIS_PROMPT) | |
| parser.add_argument("--prompt-file", default="") | |
| parser.add_argument("--include-long-analysis", action="store_true") | |
| parser.add_argument("--long-analysis-prompt", default=DEFAULT_LONG_ANALYSIS_PROMPT) | |
| parser.add_argument("--long-analysis-prompt-file", default="") | |
| parser.add_argument("--long-analysis-max-new-tokens", type=int, default=1200) | |
| parser.add_argument("--long-analysis-temperature", type=float, default=0.1) | |
| parser.add_argument("--segment-seconds", type=float, default=30.0) | |
| parser.add_argument("--overlap-seconds", type=float, default=2.0) | |
| parser.add_argument("--max-new-tokens", type=int, default=384) | |
| parser.add_argument("--temperature", type=float, default=0.1) | |
| parser.add_argument("--keep-raw-outputs", action="store_true") | |
| parser.add_argument("--output-json", default="", help="Output JSON path (default: audio sidecar)") | |
| args = parser.parse_args() | |
| audio_path = Path(args.audio) | |
| if not audio_path.is_file(): | |
| raise FileNotFoundError(f"Audio not found: {audio_path}") | |
| prompt = read_prompt_file(args.prompt_file) if args.prompt_file else args.prompt | |
| long_prompt = ( | |
| read_prompt_file(args.long_analysis_prompt_file) | |
| if args.long_analysis_prompt_file | |
| else args.long_analysis_prompt | |
| ) | |
| token = ( | |
| args.token | |
| or os.getenv("HF_TOKEN", "") | |
| or read_dotenv_value(".env", "HF_TOKEN") | |
| or read_dotenv_value(".env", "hf_token") | |
| ) | |
| captioner = build_captioner( | |
| backend=args.backend, | |
| model_id=args.model_id, | |
| endpoint_url=args.endpoint_url, | |
| token=token, | |
| device=args.device, | |
| torch_dtype=args.torch_dtype, | |
| ) | |
| sidecar = generate_track_annotation( | |
| audio_path=str(audio_path), | |
| captioner=captioner, | |
| prompt=prompt, | |
| segment_seconds=float(args.segment_seconds), | |
| overlap_seconds=float(args.overlap_seconds), | |
| max_new_tokens=int(args.max_new_tokens), | |
| temperature=float(args.temperature), | |
| keep_raw_outputs=bool(args.keep_raw_outputs), | |
| include_long_analysis=bool(args.include_long_analysis), | |
| long_analysis_prompt=long_prompt, | |
| long_analysis_max_new_tokens=int(args.long_analysis_max_new_tokens), | |
| long_analysis_temperature=float(args.long_analysis_temperature), | |
| ) | |
| out_path = Path(args.output_json) if args.output_json else audio_path.with_suffix(".json") | |
| out_path.write_text(json.dumps(sidecar, indent=2, ensure_ascii=False), encoding="utf-8") | |
| print( | |
| json.dumps( | |
| { | |
| "saved_to": str(out_path), | |
| "caption": sidecar.get("caption", ""), | |
| "bpm": sidecar.get("bpm"), | |
| "keyscale": sidecar.get("keyscale", ""), | |
| "duration": sidecar.get("duration"), | |
| "segment_count": sidecar.get("music_analysis", {}).get("segment_count"), | |
| }, | |
| indent=2, | |
| ensure_ascii=False, | |
| ) | |
| ) | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |