Spaces:

Thanh-Lam
/

Diarization_labeling

Build error

App Files Files Community

Thanh-Lam commited on Dec 11, 2025

Commit

b924e1d

1 Parent(s): 0741d8d

.

Browse files

Files changed (16) hide show

README.md +32 -1
__pycache__/app.cpython-312.pyc +0 -0
__pycache__/infer.cpython-312.pyc +0 -0
app.py +36 -0
eval.py +0 -0
finetune.py +0 -0
infer.py +65 -0
params/eval.yaml +0 -0
params/finetune.yaml +0 -0
params/infer.yaml +0 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-312.pyc +0 -0
src/__pycache__/models.cpython-312.pyc +0 -0
src/__pycache__/utils.cpython-312.pyc +0 -0
src/models.py +78 -0
src/utils.py +57 -0

README.md CHANGED Viewed

	@@ -1 +1,32 @@
1	- # Vietnamese_Diarization

+# Vietnamese_Diarization
+Kho mã mẫu diarization tiếng Việt dùng pyannote/speaker-diarization-community-1.
+## Yêu cầu
+- Python 3.10+
+- ffmpeg (bắt buộc cho torchcodec audio decoding)
+- Đã chấp nhận điều khoản model tại https://huggingface.co/pyannote/speaker-diarization-community-1
+- Hugging Face access token (dán vào hugging_face_key.txt hoặc đặt biến môi trường HUGGINGFACE_TOKEN/HUGGINGFACE_ACCESS_TOKEN)
+## Cài đặt nhanh
+- Cài thư viện: `pip install pyannote.audio` hoặc `uv add pyannote.audio`
+- Đảm bảo ffmpeg đã có trong PATH
+## Chạy mẫu
+- Diarization và in kết quả: `python infer.py path/to/audio.wav`
+- Lưu thêm RTTM: `python infer.py path/to/audio.wav --rttm outputs/audio.rttm`
+- Lưu JSON: `python infer.py path/to/audio.wav --json outputs/audio.json`
+- Chọn thiết bị: thêm `--device cpu` hoặc `--device cuda` (mặc định auto)
+## API Python
+```
+from app import diarize_file
+segments = diarize_file("audio.wav", device="auto")
+```
+## Cấu trúc
+- app.py: API Python đơn giản
+- infer.py: CLI chạy diarization
+- src/models.py: Bao gói pipeline pyannote
+- src/utils.py: Hỗ trợ đọc token, định dạng kết quả
+- hugging_face_key.txt: nơi dán Hugging Face access token (không commit token thật)

__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (1.82 kB). View file

__pycache__/infer.cpython-312.pyc ADDED Viewed

Binary file (3 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+from pathlib import Path
+from typing import List
+from src.models import DiarizationEngine, Segment
+def diarize_file(
+    audio_path: str | Path,
+    hf_token: str | None = None,
+    device: str = "auto",
+    show_progress: bool = True,
+) -> List[Segment]:
+    """API đơn giản để dùng trực tiếp trong Python."""
+    engine = DiarizationEngine(token=hf_token, device=device)
+    return engine.run(audio_path, show_progress=show_progress)
+if __name__ == "__main__":
+    # Ví dụ nhanh: python app.py audio.wav
+    import argparse
+    parser = argparse.ArgumentParser(description="Ví dụ chạy diarization qua hàm Python.")
+    parser.add_argument("audio", help="Đường dẫn tới file âm thanh")
+    parser.add_argument(
+        "--device",
+        choices=["auto", "cpu", "cuda"],
+        default="auto",
+        help="Thiết bị ưu tiên khi khởi tạo pipeline",
+    )
+    args = parser.parse_args()
+    segments = diarize_file(args.audio, device=args.device)
+    for idx, seg in enumerate(segments, start=1):
+        print(f"{idx:02d} | {seg.start:7.2f}s -> {seg.end:7.2f}s | speaker {seg.speaker}")

eval.py ADDED Viewed

File without changes

finetune.py ADDED Viewed

File without changes

infer.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from __future__ import annotations
+import argparse
+from pathlib import Path
+from src.models import DiarizationEngine
+from src.utils import export_segments_json, format_segments_table
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Chạy diarization bằng pyannote/speaker-diarization-community-1"
+    )
+    parser.add_argument("audio", help="Đường dẫn file âm thanh (wav, mp3, flac...)")
+    parser.add_argument(
+        "--hf-token",
+        dest="hf_token",
+        default=None,
+        help="Hugging Face access token, nếu bỏ trống sẽ đọc từ hugging_face_key.txt",
+    )
+    parser.add_argument(
+        "--device",
+        choices=["auto", "cpu", "cuda"],
+        default="auto",
+        help="Ưu tiên thiết bị chạy pipeline",
+    )
+    parser.add_argument(
+        "--no-progress",
+        action="store_true",
+        help="Tắt hiển thị tiến trình tải model/feature",
+    )
+    parser.add_argument(
+        "--rttm",
+        default=None,
+        help="Đường dẫn lưu file RTTM (tùy chọn)",
+    )
+    parser.add_argument(
+        "--json",
+        dest="json_out",
+        default=None,
+        help="Đường dẫn lưu kết quả dạng JSON (tùy chọn)",
+    )
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    engine = DiarizationEngine(token=args.hf_token, device=args.device)
+    diarization = engine.diarize(args.audio, show_progress=not args.no_progress)
+    segments = engine.to_segments(diarization)
+    print("Kết quả phân đoạn:")
+    print(format_segments_table([seg.__dict__ for seg in segments]))
+    if args.rttm:
+        rttm_path = engine.save_rttm(diarization, args.rttm)
+        print(f"Đã lưu RTTM tại: {rttm_path}")
+    if args.json_out:
+        json_path = export_segments_json([seg.__dict__ for seg in segments], args.json_out)
+        print(f"Đã lưu JSON tại: {json_path}")
+if __name__ == "__main__":
+    main()

params/eval.yaml ADDED Viewed

File without changes

params/finetune.yaml ADDED Viewed

File without changes

params/infer.yaml ADDED Viewed

File without changes

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (112 Bytes). View file

src/__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (4.52 kB). View file

src/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (3.29 kB). View file

src/models.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List
+import torch
+from pyannote.audio import Pipeline
+from pyannote.audio.pipelines.utils.hook import ProgressHook
+from .utils import ensure_audio_path, read_hf_token
+@dataclass
+class Segment:
+    start: float
+    end: float
+    speaker: str
+class DiarizationEngine:
+    """Bao gói pipeline diarization của pyannote."""
+    def __init__(
+        self,
+        model_id: str = "pyannote/speaker-diarization-community-1",
+        token: str | None = None,
+        key_path: str | Path = "hugging_face_key.txt",
+        device: str = "auto",
+    ) -> None:
+        self.device = self._resolve_device(device)
+        auth_token = read_hf_token(token, key_path)
+        self.pipeline = Pipeline.from_pretrained(model_id, token=auth_token)
+        self.pipeline.to(self.device)
+    @staticmethod
+    def _resolve_device(device: str) -> torch.device:
+        if device == "cpu":
+            return torch.device("cpu")
+        if device == "cuda":
+            if not torch.cuda.is_available():
+                raise RuntimeError("Yêu cầu CUDA nhưng không phát hiện GPU khả dụng.")
+            return torch.device("cuda")
+        if device == "auto":
+            return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        raise ValueError("Giá trị device hợp lệ: auto, cpu, cuda.")
+    def diarize(self, audio_path: str | Path, show_progress: bool = True):
+        audio_path = ensure_audio_path(audio_path)
+        if show_progress:
+            with ProgressHook() as hook:
+                return self.pipeline(str(audio_path), hook=hook)
+        return self.pipeline(str(audio_path))
+    @staticmethod
+    def to_segments(diarization) -> List[Segment]:
+        segments: List[Segment] = []
+        for segment, _, speaker in diarization.itertracks(yield_label=True):
+            segments.append(
+                Segment(
+                    start=float(segment.start),
+                    end=float(segment.end),
+                    speaker=str(speaker),
+                )
+            )
+        return segments
+    @staticmethod
+    def save_rttm(diarization, output_path: str | Path) -> Path:
+        path = Path(output_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        diarization.write_rttm(path)
+        return path
+    def run(self, audio_path: str | Path, show_progress: bool = True) -> List[Segment]:
+        """Chạy pipeline và trả về danh sách segment."""
+        diarization = self.diarize(audio_path, show_progress=show_progress)
+        return self.to_segments(diarization)

src/utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from __future__ import annotations
+import json
+import os
+from pathlib import Path
+from typing import Iterable, List
+def read_hf_token(token: str | None = None, key_path: str | Path = "hugging_face_key.txt") -> str:
+    """Ưu tiên token truyền vào, nếu không thì đọc từ biến môi trường hoặc file."""
+    candidates = [
+        token,
+        os.getenv("HUGGINGFACE_TOKEN"),
+        os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
+    ]
+    for value in candidates:
+        if value and value.strip():
+            return value.strip()
+    path = Path(key_path)
+    if not path.exists():
+        raise FileNotFoundError(
+            f"Không tìm thấy token. Truyền biến --hf-token hoặc đặt file {path}."
+        )
+    content = path.read_text(encoding="utf-8").strip()
+    if not content:
+        raise ValueError(f"File {path} trống, hãy dán Hugging Face access token vào.")
+    return content
+def ensure_audio_path(audio_path: str | Path) -> Path:
+    """Kiểm tra đường dẫn audio hợp lệ."""
+    path = Path(audio_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Không tìm thấy file âm thanh: {path}")
+    if not path.is_file():
+        raise ValueError(f"Đường dẫn không phải file: {path}")
+    return path
+def export_segments_json(segments: Iterable[dict], output_path: str | Path) -> Path:
+    """Lưu danh sách segment thành JSON."""
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    data: List[dict] = list(segments)
+    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+    return path
+def format_segments_table(segments: Iterable[dict]) -> str:
+    """Trả về chuỗi bảng đơn giản để in ra terminal."""
+    lines = []
+    for idx, seg in enumerate(segments, start=1):
+        start = seg.get("start", 0.0)
+        end = seg.get("end", 0.0)
+        speaker = seg.get("speaker", "unknown")
+        lines.append(f"{idx:02d} | {start:7.2f}s -> {end:7.2f}s | speaker {speaker}")
+    return "\n".join(lines)