Spaces:
Sleeping
Sleeping
Fagan Valiyev commited on
Commit ·
d7efa84
0
Parent(s):
initial
Browse files- .gitattributes +1 -0
- Dockerfile +12 -0
- README.md +90 -0
- admin_panel.py +244 -0
- app.py +330 -0
- audio_loader.py +53 -0
- auth.py +47 -0
- config.py +56 -0
- config.yaml +28 -0
- csv_persistence.py +88 -0
- data/audio/folder_A/10.wav +3 -0
- data/audio/folder_A/11.wav +3 -0
- data/audio/folder_B/12.wav +3 -0
- data/audio/folder_B/13.wav +3 -0
- data/audio/references/folder_A.json +4 -0
- data/audio/references/folder_B.json +4 -0
- docker-compose.yaml +16 -0
- models.py +25 -0
- reference.py +43 -0
- requirements.txt +3 -0
- resume.py +67 -0
- skip_persistence.py +84 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
EXPOSE 8501
|
| 11 |
+
|
| 12 |
+
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
|
README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Audio Labeling Tool
|
| 2 |
+
|
| 3 |
+
A Streamlit application for transcription correction and metadata annotation by data labelers.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
|
| 7 |
+
- Python 3.10+
|
| 8 |
+
- Docker & Docker Compose (for deployment)
|
| 9 |
+
|
| 10 |
+
## Filesystem Structure
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
/data/
|
| 14 |
+
├── audio/
|
| 15 |
+
│ ├── folder_A/ # Labeler A's source WAV files
|
| 16 |
+
│ └── folder_B/ # Labeler B's source WAV files
|
| 17 |
+
├── references/
|
| 18 |
+
│ ├── folder_A.json # {filename: transcription} for folder_A
|
| 19 |
+
│ └── folder_B.json # {filename: transcription} for folder_B
|
| 20 |
+
└── output/
|
| 21 |
+
├── labeler_A/
|
| 22 |
+
│ ├── audios_A/ # Clean audios (copied on accept)
|
| 23 |
+
│ └── labeler_a_metadata.csv
|
| 24 |
+
└── labeler_B/
|
| 25 |
+
├── audios_B/ # Clean audios (copied on accept)
|
| 26 |
+
└── labeler_b_metadata.csv
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## Configuration
|
| 30 |
+
|
| 31 |
+
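Edit `config.yaml` to set labeler credentials and paths. Besides `labelers`, the application also reads the top-level keys `admin` (admin credentials), `shared_output_dir` (the directory where `skipped_audios.csv` is written) and, optionally, `skip_reasons` (the list of reasons offered when skipping an audio):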
|
| 32 |
+
|
| 33 |
+
```yaml
|
| 34 |
+
labelers:
|
| 35 |
+
labeler_a:
|
| 36 |
+
password: "your_password"
|
| 37 |
+
audio_folder: "/data/audio/folder_A"
|
| 38 |
+
reference_json: "/data/references/folder_A.json"
|
| 39 |
+
output_dir: "/data/output/labeler_A"
|
| 40 |
+
clean_audios_dir: "/data/output/labeler_A/audios_A"
|
| 41 |
+
labeler_b:
|
| 42 |
+
password: "your_password"
|
| 43 |
+
audio_folder: "/data/audio/folder_B"
|
| 44 |
+
reference_json: "/data/references/folder_B.json"
|
| 45 |
+
output_dir: "/data/output/labeler_B"
|
| 46 |
+
clean_audios_dir: "/data/output/labeler_B/audios_B"
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## Environment Variables
|
| 50 |
+
|
| 51 |
+
| Variable | Purpose | Default |
|
| 52 |
+
|----------|---------|---------|
|
| 53 |
+
| `ALT_CONFIG_PATH` | Path to config.yaml | `/app/config.yaml` |
|
| 54 |
+
| `LOG_DIR` | Directory for log files | `/var/log/audio_labeling_tool` |
|
| 55 |
+
|
| 56 |
+
## Deployment with Docker
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# Build and run
|
| 60 |
+
docker-compose up -d --build
|
| 61 |
+
|
| 62 |
+
# View logs
|
| 63 |
+
docker-compose logs -f
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
The app will be available at `http://<server-ip>:8501`.
|
| 67 |
+
|
| 68 |
+
## Volume Mounts
|
| 69 |
+
|
| 70 |
+
| Host Path | Container Path | Mode |
|
| 71 |
+
|-----------|---------------|------|
|
| 72 |
+
| `/data/audio` | `/data/audio` | read-only |
|
| 73 |
+
| `/data/references` | `/data/references` | read-only |
|
| 74 |
+
| `/data/output` | `/data/output` | read-write |
|
| 75 |
+
| `./config.yaml` | `/app/config.yaml` | read-only |
|
| 76 |
+
|
| 77 |
+
## Filesystem Permissions
|
| 78 |
+
|
| 79 |
+
- Audio folders: read-only for the app process
|
| 80 |
+
- Output directory: read-write for the app process
|
| 81 |
+
- Config file: read-only
|
| 82 |
+
|
| 83 |
+
## Local Development
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
pip install -r requirements.txt
|
| 87 |
+
export ALT_CONFIG_PATH=./config.yaml
|
| 88 |
+
export LOG_DIR=./logs
|
| 89 |
+
streamlit run app.py
|
| 90 |
+
```
|
admin_panel.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Admin panel for tracking labeler progress and activity."""
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
|
| 10 |
+
from config import load_config
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def render_admin_panel():
    """Draw the admin dashboard.

    Loads the configuration, shows a title row with a logout button, and then
    renders one tab per report view, passing the loaded config to each view.
    """
    config = load_config()

    # Title on the left (wide column), logout control on the right.
    title_col, logout_col = st.columns([4, 1])
    title_col.title("Admin Panel")

    # Function-scope import, kept local as in the original implementation.
    from auth import logout

    if logout_col.button("Logout"):
        logout()
        st.rerun()

    # Tab label -> renderer, in display order.
    views = [
        ("Progress", _render_progress),
        ("Metadata CSVs", _render_metadata_viewer),
        ("Skipped Audios", _render_skipped_viewer),
        ("Activity Log", _render_activity_log),
        ("Downloads", _render_downloads),
    ]
    tabs = st.tabs([label for label, _ in views])
    for tab, (_, render_view) in zip(tabs, views):
        with tab:
            render_view(config)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _read_csv_rows(csv_path: str) -> list[dict]:
|
| 51 |
+
"""Read all rows from a CSV file."""
|
| 52 |
+
path = Path(csv_path)
|
| 53 |
+
if not path.exists():
|
| 54 |
+
return []
|
| 55 |
+
try:
|
| 56 |
+
with open(path, "r", encoding="utf-8", newline="") as f:
|
| 57 |
+
reader = csv.DictReader(f)
|
| 58 |
+
return [row for row in reader]
|
| 59 |
+
except Exception as e:
|
| 60 |
+
logger.warning(f"Could not read CSV {csv_path}: {e}")
|
| 61 |
+
return []
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _count_audio_files(audio_folder: str) -> int:
|
| 65 |
+
"""Count WAV files in a folder."""
|
| 66 |
+
folder = Path(audio_folder)
|
| 67 |
+
if not folder.exists():
|
| 68 |
+
return 0
|
| 69 |
+
return sum(1 for f in folder.iterdir() if f.suffix.lower() == ".wav" and f.is_file())
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _render_progress(config: dict):
    """Show total / labeled / skipped / remaining counts and a progress bar per labeler.

    Args:
        config: Loaded configuration; reads ``shared_output_dir`` and, for each
            entry of ``labelers``, its ``audio_folder`` and ``output_dir``.
    """
    st.subheader("Labeler Progress")

    # The skip CSV is shared by all labelers, so it is read once up front.
    skip_csv_path = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    skip_rows = _read_csv_rows(skip_csv_path)

    for labeler_name, labeler_cfg in config["labelers"].items():
        st.markdown(f"### {labeler_name}")

        total_files = _count_audio_files(labeler_cfg["audio_folder"])
        csv_path = os.path.join(labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv")

        # NOTE(review): these are raw row counts; if a file can appear in more
        # than one row (or in both CSVs) the numbers overcount — confirm
        # against the persistence modules. The clamps below keep the display sane.
        labeled_count = len(_read_csv_rows(csv_path))
        skipped_count = len([r for r in skip_rows if r.get("labeler") == labeler_name])
        handled = labeled_count + skipped_count
        remaining = max(0, total_files - handled)

        metric_cols = st.columns(4)
        values = (
            ("Total", total_files),
            ("Labeled", labeled_count),
            ("Skipped", skipped_count),
            ("Remaining", remaining),
        )
        for column, (caption, value) in zip(metric_cols, values):
            column.metric(caption, value)

        if total_files > 0:
            st.progress(min(handled / total_files, 1.0))
        else:
            st.progress(0.0)

        st.divider()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _render_metadata_viewer(config: dict):
    """Show each labeler's metadata CSV inside a collapsed expander.

    Args:
        config: Loaded configuration; reads ``output_dir`` of every labeler.
    """
    st.subheader("Metadata CSVs")

    for labeler_name, labeler_cfg in config["labelers"].items():
        rows = _read_csv_rows(
            os.path.join(labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv")
        )

        with st.expander(f"{labeler_name} — {len(rows)} entries", expanded=False):
            if not rows:
                st.info("No labels recorded yet.")
            else:
                st.dataframe(rows, use_container_width=True)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _render_skipped_viewer(config: dict):
    """Show the shared skipped-audios CSV with an optional per-labeler filter.

    Args:
        config: Loaded configuration; reads ``shared_output_dir`` and the
            names under ``labelers``.
    """
    st.subheader("Skipped Audios")

    skip_csv_path = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    rows = _read_csv_rows(skip_csv_path)

    if not rows:
        st.info("No skipped audios recorded yet.")
        return

    # "All" disables filtering; every other choice is a configured labeler name.
    choices = ["All", *config["labelers"]]
    selected_labeler = st.selectbox("Filter by labeler", options=choices, key="skip_filter")
    if selected_labeler != "All":
        rows = [entry for entry in rows if entry.get("labeler") == selected_labeler]

    st.dataframe(rows, use_container_width=True)
    st.caption(f"Total skipped: {len(rows)}")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _render_activity_log(config: dict):
    """Show the most recent label and skip events in one table.

    Label events come from each labeler's metadata CSV and carry an empty
    timestamp; skip events come from the shared skip CSV and use its
    ``timestamp`` column.

    Args:
        config: Loaded configuration; reads ``labelers`` and ``shared_output_dir``.
    """
    st.subheader("Activity Log")

    activities = []

    # Label events: no timestamp is read from the metadata rows.
    for labeler_name, labeler_cfg in config["labelers"].items():
        csv_path = os.path.join(labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv")
        activities.extend(
            {
                "labeler": row.get("labeler", labeler_name),
                "action": "labeled",
                "source": row.get("source", ""),
                "details": f"gender={row.get('gender', '')}, pii={row.get('pii', '')}",
                "timestamp": "",
            }
            for row in _read_csv_rows(csv_path)
        )

    # Skip events.
    skip_csv_path = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    activities.extend(
        {
            "labeler": row.get("labeler", ""),
            "action": "skipped",
            "source": row.get("source", ""),
            "details": row.get("reason", ""),
            "timestamp": row.get("timestamp", ""),
        }
        for row in _read_csv_rows(skip_csv_path)
    )

    # Descending string sort; entries without a timestamp compare as "0".
    activities = sorted(
        activities, key=lambda entry: entry["timestamp"] or "0", reverse=True
    )

    if not activities:
        st.info("No activity recorded yet.")
        return

    # Only the first 50 entries of the sorted list are displayed.
    st.dataframe(activities[:50], use_container_width=True)
    st.caption(f"Showing latest {min(50, len(activities))} of {len(activities)} total actions.")
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def _render_downloads(config: dict):
    """Offer the metadata CSVs, the skip CSV and the application log for download.

    Each file gets a download button when it exists and a short
    "not yet created" caption otherwise.

    Args:
        config: Loaded configuration; reads ``labelers`` and ``shared_output_dir``.
            The log directory comes from the ``LOG_DIR`` environment variable.
    """

    def offer(file_path: Path, file_name: str, mime: str, key: str) -> None:
        # One download button (or placeholder caption) for a single file.
        if file_path.exists():
            st.download_button(
                label=f"📥 {file_name}",
                data=file_path.read_bytes(),
                file_name=file_name,
                mime=mime,
                key=key,
            )
        else:
            st.caption(f"{file_name} — not yet created")

    st.subheader("Download Reports")

    # Per-labeler metadata CSVs.
    for labeler_name, labeler_cfg in config["labelers"].items():
        metadata_name = f"{labeler_name}_metadata.csv"
        offer(
            Path(os.path.join(labeler_cfg["output_dir"], metadata_name)),
            metadata_name,
            "text/csv",
            f"dl_metadata_{labeler_name}",
        )

    st.divider()

    # Shared skip CSV.
    offer(
        Path(os.path.join(config["shared_output_dir"], "skipped_audios.csv")),
        "skipped_audios.csv",
        "text/csv",
        "dl_skipped",
    )

    st.divider()

    # Application log file.
    log_dir = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
    offer(Path(log_dir) / "app.log", "app.log", "text/plain", "dl_log")
|
app.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audio Labeling Tool — Streamlit Application Entry Point."""
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
|
| 9 |
+
from admin_panel import render_admin_panel
|
| 10 |
+
from auth import authenticate, login, logout
|
| 11 |
+
from audio_loader import copy_to_clean, load_audio_bytes
|
| 12 |
+
from config import load_config
|
| 13 |
+
from csv_persistence import save_label
|
| 14 |
+
from models import LabelRecord
|
| 15 |
+
from reference import load_reference
|
| 16 |
+
from resume import build_file_list, compute_resume_index
|
| 17 |
+
from skip_persistence import save_skip
|
| 18 |
+
|
| 19 |
+
# --- Logging Setup ---
|
| 20 |
+
# Log directory is taken from the LOG_DIR environment variable and created on
# import if it is missing.
LOG_DIR = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        # Explicit UTF-8 so log lines that contain non-ASCII text (e.g. file
        # names) do not depend on the process locale.
        logging.FileHandler(os.path.join(LOG_DIR, "app.log"), encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# --- Page Config ---
st.set_page_config(page_title="Audio Labeling Tool", layout="wide")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def init_session_state():
    """Seed the session with logged-out defaults without overwriting existing values."""
    defaults = {
        "authenticated": False,
        "username": None,
        "role": None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def render_login():
    """Render the login form and sign the user in on valid credentials.

    On success the session is marked as logged in via ``login`` and the script
    is rerun. Wrong credentials show an "invalid" error. A failure inside
    ``authenticate`` itself is logged and reported as a service problem, so a
    broken backend is not presented to the user as a wrong password.
    """
    st.title("Audio Labeling Tool")
    st.subheader("Login")

    username = st.text_input("Username", key="login_username")
    password = st.text_input("Password", type="password", key="login_password")

    if not st.button("Login"):
        return

    if not username or not password:
        st.error("Please enter both username and password.")
        return

    # Keep the try body minimal: only the credential check is guarded, so
    # login() and st.rerun() are never swallowed by the handler.
    try:
        role = authenticate(username, password)
    except Exception:
        logger.exception("Authentication failed with an unexpected error")
        st.error("Login is temporarily unavailable. Please contact admin.")
        return

    if not role:
        st.error("Invalid username or password.")
        return

    login(username, role)
    st.rerun()
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def get_labeler_config(username: str) -> dict:
    """Return the ``labelers`` entry of the loaded config for ``username``.

    Raises:
        KeyError: If ``username`` has no entry under ``labelers``.
    """
    labelers = load_config()["labelers"]
    return labelers[username]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def initialize_labeling_session(username: str):
    """Prepare session state for labeling; does nothing if already prepared.

    Stores the file list, the labeler's paths, the reference transcriptions
    (or an error message when they cannot be loaded) and the index to resume
    from. The presence of ``file_list`` in the session marks it as initialized.

    Args:
        username: Name of the logged-in labeler, used to look up the config
            entry and to build the metadata CSV file name.
    """
    state = st.session_state
    if "file_list" in state:
        return  # Already initialized

    labeler_cfg = get_labeler_config(username)
    audio_folder = labeler_cfg["audio_folder"]
    reference_json = labeler_cfg["reference_json"]
    output_dir = labeler_cfg["output_dir"]
    csv_path = os.path.join(output_dir, f"{username}_metadata.csv")

    # File list and paths go into the session first.
    file_list = build_file_list(audio_folder)
    session_values = {
        "file_list": file_list,
        "audio_folder": audio_folder,
        "csv_path": csv_path,
        "output_dir": output_dir,
        "clean_audios_dir": labeler_cfg["clean_audios_dir"],
    }
    for key, value in session_values.items():
        state[key] = value

    # Reference transcriptions; a load failure is recorded for the UI to show.
    try:
        reference = load_reference(reference_json)
    except (FileNotFoundError, ValueError) as e:
        state["reference"] = {}
        state["reference_error"] = str(e)
    else:
        state["reference"] = reference
        state["reference_error"] = None

    # Index to start (or resume) from.
    state["current_index"] = compute_resume_index(file_list, csv_path, username)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def render_audio_player(audio_bytes: bytes):
    """Embed the audio as a base64 data URI in an HTML5 ``<audio>`` element.

    Args:
        audio_bytes: Raw content of the WAV file to play.
    """
    # NOTE(review): the inline script looks up an element with id
    # 'playback-rate', which this function does not render; whether scripts
    # passed through st.markdown run at all should be confirmed.
    encoded = base64.b64encode(audio_bytes)
    audio_b64 = encoded.decode("utf-8")
    player_markup = f"""
<audio id="audio-player" controls style="width: 100%;">
<source src="data:audio/wav;base64,{audio_b64}" type="audio/wav">
Your browser does not support the audio element.
</audio>
<script>
var audio = document.getElementById('audio-player');
var rate = document.getElementById('playback-rate');
if (rate) {{
audio.playbackRate = parseFloat(rate.value);
}}
</script>
"""
    st.markdown(player_markup, unsafe_allow_html=True)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def render_labeling_ui():
    """Render the main labeling interface for the logged-in labeler.

    Flow: header with logout, session initialization, reference and
    completion checks, audio player with speed slider, transcription and
    metadata inputs, then the Previous / Next / Apply / Skip buttons and the
    skip-reason dialog.

    Apply saves the label, copies the audio to the clean folder and advances
    only if both steps succeed. Skip opens a reason dialog; confirming stores
    the skip and advances. Skip is available on every item, including the
    last one, and the dialog is closed whenever the current item changes.
    """
    username = st.session_state["username"]

    # Header with logout
    col_title, col_logout = st.columns([4, 1])
    with col_title:
        st.title("Audio Labeling Tool")
    with col_logout:
        if st.button("Logout"):
            logout()
            st.rerun()

    # Initialize session
    initialize_labeling_session(username)

    # Check for reference loading error
    if st.session_state.get("reference_error"):
        st.error(
            "Reference file is corrupted or missing. Please contact admin."
        )
        st.stop()

    file_list = st.session_state["file_list"]
    current_index = st.session_state["current_index"]
    audio_folder = st.session_state["audio_folder"]
    reference = st.session_state["reference"]

    # Handle completion
    if not file_list:
        st.warning("No audio files found in your assigned folder.")
        st.stop()

    if current_index >= len(file_list):
        st.success("All items have been labeled! You're done.")
        if st.button("← Go to last item"):
            st.session_state["current_index"] = len(file_list) - 1
            st.rerun()
        st.stop()

    # Current file info
    current_filename = file_list[current_index]

    # Position indicator
    st.markdown(f"**{current_index + 1} / {len(file_list)}** — `{current_filename}`")

    # Audio player
    try:
        audio_bytes = load_audio_bytes(audio_folder, current_filename)
        render_audio_player(audio_bytes)
    except FileNotFoundError:
        st.error(f"Audio file not found: {current_filename}. Please contact admin.")

    # Speed control
    speed = st.select_slider(
        "Playback Speed",
        options=[0.5, 0.75, 1.0, 1.25, 1.5, 2.0],
        value=1.0,
        key="speed_slider",
    )
    # Update playback rate via JS
    # NOTE(review): confirm that a <script> passed through st.markdown is
    # actually executed by the Streamlit version in use.
    st.markdown(
        f"""<script>
var audio = document.getElementById('audio-player');
if (audio) {{ audio.playbackRate = {speed}; }}
</script>""",
        unsafe_allow_html=True,
    )

    st.divider()

    # Transcription
    default_transcription = reference.get(current_filename, "")
    if not default_transcription and current_filename not in reference:
        st.warning(f"No transcription found for: {current_filename}")

    # Widget keys include the index so every item gets its own widget state.
    transcription = st.text_area(
        "Transcription",
        value=default_transcription,
        height=100,
        key=f"transcription_{current_index}",
    )

    # Metadata
    col_gender, col_pii = st.columns(2)
    with col_gender:
        gender = st.radio(
            "Gender",
            options=["male", "female"],
            key=f"gender_{current_index}",
        )
    with col_pii:
        pii = st.checkbox("Contains PII", key=f"pii_{current_index}")

    st.divider()

    # Navigation and action buttons
    col_prev, col_next, col_apply, col_skip = st.columns(4)

    with col_prev:
        prev_disabled = current_index <= 0
        if st.button("← Previous", disabled=prev_disabled):
            st.session_state["current_index"] = current_index - 1
            # Close a pending skip dialog: it belonged to the previous item.
            st.session_state["show_skip_reason"] = False
            st.rerun()

    with col_next:
        next_disabled = current_index >= len(file_list) - 1
        if st.button("Next →", disabled=next_disabled):
            st.session_state["current_index"] = current_index + 1
            st.session_state["show_skip_reason"] = False
            st.rerun()

    with col_apply:
        if st.button("✓ Apply", type="primary"):
            # Build record
            record = LabelRecord(
                source=current_filename,
                transcription=transcription,
                gender=gender,
                pii=pii,
                labeler=username,
            )

            csv_path = st.session_state["csv_path"]
            clean_audios_dir = st.session_state["clean_audios_dir"]

            try:
                # Save to CSV
                save_label(record, csv_path)
                # Copy audio to clean folder
                copy_to_clean(audio_folder, current_filename, clean_audios_dir)
                # Advance pointer only on success
                st.session_state["current_index"] = current_index + 1
                st.session_state["show_skip_reason"] = False
                st.rerun()
            except IOError as e:
                st.error(str(e))
                # Pointer NOT advanced

    with col_skip:
        # Skip stays enabled on the last item too; advancing past the end
        # lands on the completion screen handled above.
        if st.button("Skip ✗"):
            st.session_state["show_skip_reason"] = True
            st.rerun()

    # Skip reason dialog
    if st.session_state.get("show_skip_reason", False):
        st.divider()
        st.markdown("**Why are you skipping this audio?**")

        config = load_config()
        skip_reasons = config.get("skip_reasons", ["Other"])

        reason_choice = st.selectbox(
            "Select reason",
            options=skip_reasons,
            key=f"skip_reason_select_{current_index}",
        )

        # "Other" requires a free-text explanation.
        custom_reason = ""
        if reason_choice == "Other":
            custom_reason = st.text_input(
                "Please specify:", key=f"skip_custom_reason_{current_index}"
            )

        col_confirm, col_cancel = st.columns(2)
        with col_confirm:
            if st.button("Confirm Skip"):
                final_reason = custom_reason if reason_choice == "Other" else reason_choice
                if reason_choice == "Other" and not custom_reason.strip():
                    st.error("Please provide a reason.")
                else:
                    shared_output_dir = config["shared_output_dir"]
                    skip_csv_path = os.path.join(shared_output_dir, "skipped_audios.csv")
                    try:
                        save_skip(username, current_filename, final_reason, skip_csv_path)
                        st.session_state["show_skip_reason"] = False
                        st.session_state["current_index"] = current_index + 1
                        st.rerun()
                    except IOError as e:
                        st.error(str(e))

        with col_cancel:
            if st.button("Cancel"):
                st.session_state["show_skip_reason"] = False
                st.rerun()
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def main():
    """Dispatch to the login view, the admin panel or the labeling UI.

    Unauthenticated sessions get the login view; authenticated sessions with
    role ``"admin"`` get the admin panel; every other authenticated session
    gets the labeling interface.
    """
    init_session_state()

    if not st.session_state["authenticated"]:
        render_login()
        return

    if st.session_state.get("role") == "admin":
        render_admin_panel()
    else:
        render_labeling_ui()


if __name__ == "__main__":
    main()
|
audio_loader.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lazy per-item audio loading and clean audio copy."""
|
| 2 |
+
|
| 3 |
+
import shutil
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_audio_bytes(folder_path: str, filename: str) -> bytes:
    """Load audio bytes for a single file on demand.

    Args:
        folder_path: Path to the audio folder.
        filename: Name of the WAV file.

    Returns:
        Raw bytes of the audio file.

    Raises:
        FileNotFoundError: If the audio file does not exist. The message
            contains only the file name, not the folder path.
    """
    audio_path = Path(folder_path) / filename
    # Read first and translate the error, instead of exists()-then-read: the
    # file can disappear between the two calls.
    try:
        return audio_path.read_bytes()
    except FileNotFoundError:
        raise FileNotFoundError(f"Audio file not found: {filename}") from None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def copy_to_clean(source_folder: str, filename: str, clean_audios_dir: str) -> None:
    """Copy accepted audio to the clean audios folder.

    Creates the clean audios directory if it doesn't exist.

    Args:
        source_folder: Path to the source audio folder.
        filename: Name of the WAV file to copy.
        clean_audios_dir: Path to the destination clean audios folder.

    Raises:
        IOError: If the destination directory cannot be created or the
            copy fails. The underlying error is attached as the cause.
    """
    src = Path(source_folder) / filename
    dst_dir = Path(clean_audios_dir)

    try:
        # Directory creation is part of the guarded section so that a
        # failure here is logged and reported with the same message as a
        # failed copy, instead of escaping as a raw OS error.
        dst_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dst_dir / filename)
    except (OSError, ValueError) as e:
        # OSError covers missing files, permissions and shutil's own errors;
        # ValueError covers paths the OS layer rejects outright (e.g. an
        # embedded NUL byte).
        logger.error(f"Failed to copy '{filename}' to clean audios: {e}")
        raise IOError(f"Failed to copy audio file '{filename}'.") from e

    logger.info(f"Copied '{filename}' to clean audios: {clean_audios_dir}")
|
auth.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Authentication logic for the Audio Labeling Tool."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
|
| 7 |
+
from config import load_config
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def authenticate(username: str, password: str) -> str | None:
    """Validate credentials against config.

    Passwords are compared with ``hmac.compare_digest`` so that the time a
    check takes does not reveal how much of a guessed password is correct.

    Args:
        username: Username entered on the login form.
        password: Password entered on the login form.

    Returns:
        "admin" if admin credentials, "labeler" if labeler credentials, None if invalid.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the module's import section.
    import hmac

    def _password_matches(supplied, expected) -> bool:
        # A non-string on either side never matches, exactly as with `==`
        # (e.g. an unquoted numeric password in the YAML file).
        if not isinstance(supplied, str) or not isinstance(expected, str):
            return False
        # Compare as bytes: compare_digest only accepts ASCII-only str.
        return hmac.compare_digest(
            supplied.encode("utf-8"), expected.encode("utf-8")
        )

    config = load_config()

    # NOTE(review): the configured passwords are compared as-is, i.e. they are
    # stored in plaintext in the config file. Storing salted hashes would be
    # safer; that needs a config format change and is out of scope here.

    # Check admin
    admin_cfg = config["admin"]
    if username == admin_cfg["username"] and _password_matches(
        password, admin_cfg["password"]
    ):
        return "admin"

    # Check labelers
    labeler = config["labelers"].get(username)
    if labeler is not None and _password_matches(password, labeler["password"]):
        return "labeler"

    return None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def login(username: str, role: str) -> None:
    """Record a successful login in the Streamlit session.

    Args:
        username: Name of the user who just authenticated.
        role: Role stored for the session alongside the username.
    """
    session = st.session_state
    session_values = (
        ("authenticated", True),
        ("username", username),
        ("role", role),
    )
    for key, value in session_values:
        session[key] = value
    logger.info(f"User '{username}' logged in as '{role}'.")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def logout() -> None:
    """Empty the session and mark it as unauthenticated."""
    session = st.session_state
    # Read the name first; it is gone once the session has been emptied.
    username = session.get("username", "unknown")

    # Iterate over a snapshot of the keys so deleting entries does not
    # disturb the iteration.
    for key in tuple(session.keys()):
        del session[key]

    session["authenticated"] = False
    logger.info(f"User '{username}' logged out.")
|
config.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration loading and validation for the Audio Labeling Tool."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
DEFAULT_CONFIG_PATH = "/app/config.yaml"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_config() -> dict:
    """Load application configuration from YAML file.

    Config path resolution:
    1. ALT_CONFIG_PATH environment variable
    2. Default path: /app/config.yaml

    Returns:
        The parsed and validated configuration mapping.

    Raises:
        FileNotFoundError: If the resolved config path does not exist.
        ValueError: If the file does not contain a YAML mapping, or the
            mapping fails structural validation.
    """
    config_path = os.environ.get("ALT_CONFIG_PATH", DEFAULT_CONFIG_PATH)
    path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load yields None for an empty document and a list/scalar for
    # non-mapping YAML. Reject those here with a clear message rather than
    # letting the key checks below fail with a TypeError.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file must contain a YAML mapping: {config_path}"
        )

    _validate_config(config)
    return config
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _validate_config(config: dict) -> None:
    """Validate config structure.

    Checks, in order: the ``labelers`` section (exactly two entries, each
    with all required settings), the ``admin`` section (``username`` and
    ``password``), and the top-level ``shared_output_dir`` key. Only the
    presence of keys is checked, not their values.

    Args:
        config: Parsed configuration.

    Raises:
        ValueError: If a required section or key is missing, or a section
            that must be a mapping is something else (for example ``null``).
    """
    required_labeler_keys = (
        "password",
        "audio_folder",
        "reference_json",
        "output_dir",
        "clean_audios_dir",
    )

    if not isinstance(config, dict):
        raise ValueError("Config must be a mapping")

    if "labelers" not in config:
        raise ValueError("Config must contain 'labelers' key")
    labelers = config["labelers"]
    # An empty `labelers:` key parses as None; report it as a config error
    # instead of crashing on len()/items().
    if not isinstance(labelers, dict):
        raise ValueError("'labelers' must be a mapping of username to settings")
    if len(labelers) != 2:
        raise ValueError("Exactly 2 labelers must be configured")

    for name, labeler_cfg in labelers.items():
        if not isinstance(labeler_cfg, dict):
            raise ValueError(f"Labeler '{name}' must be a mapping of settings")
        # Keys are checked in a fixed order so the first missing one is the
        # one reported.
        for key in required_labeler_keys:
            if key not in labeler_cfg:
                raise ValueError(f"Labeler '{name}' missing '{key}'")

    if "admin" not in config:
        raise ValueError("Config must contain 'admin' key")
    admin_cfg = config["admin"]
    if (
        not isinstance(admin_cfg, dict)
        or "username" not in admin_cfg
        or "password" not in admin_cfg
    ):
        raise ValueError("Admin must have 'username' and 'password'")

    if "shared_output_dir" not in config:
        raise ValueError("Config must contain 'shared_output_dir'")
|
config.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
labelers:
|
| 2 |
+
labeler_a:
|
| 3 |
+
password: "password_a"
|
| 4 |
+
audio_folder: "/data/audio/folder_A"
|
| 5 |
+
reference_json: "/data/references/folder_A.json"
|
| 6 |
+
output_dir: "/data/output/labeler_A"
|
| 7 |
+
clean_audios_dir: "/data/output/labeler_A/audios_A"
|
| 8 |
+
labeler_b:
|
| 9 |
+
password: "password_b"
|
| 10 |
+
audio_folder: "/data/audio/folder_B"
|
| 11 |
+
reference_json: "/data/references/folder_B.json"
|
| 12 |
+
output_dir: "/data/output/labeler_B"
|
| 13 |
+
clean_audios_dir: "/data/output/labeler_B/audios_B"
|
| 14 |
+
|
| 15 |
+
admin:
|
| 16 |
+
username: "admin"
|
| 17 |
+
password: "admin_password"
|
| 18 |
+
|
| 19 |
+
skip_reasons:
|
| 20 |
+
- "Noisy audio"
|
| 21 |
+
- "Too short"
|
| 22 |
+
- "Unintelligible"
|
| 23 |
+
- "Wrong language"
|
| 24 |
+
- "Silence / no speech"
|
| 25 |
+
- "Corrupted file"
|
| 26 |
+
- "Other"
|
| 27 |
+
|
| 28 |
+
shared_output_dir: "/data/output"
|
csv_persistence.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CSV persistence layer with file locking and atomic writes."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import csv
|
| 5 |
+
import tempfile
|
| 6 |
+
import logging
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
from filelock import FileLock, Timeout
|
| 10 |
+
|
| 11 |
+
from models import LabelRecord
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
CSV_COLUMNS = ["source", "transcription", "gender", "pii", "labeler"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def save_label(record: LabelRecord, csv_path: str) -> None:
    """Save a label record to CSV with file locking and atomic write.

    Uses filelock for serialization and temp-file-then-rename for atomicity.
    Implements upsert: overwrites existing row for same source+labeler,
    or appends if new. The directory holding the CSV is created if needed.

    Args:
        record: The label record to save.
        csv_path: Path to the target CSV file.

    Raises:
        IOError: If the write operation fails or the lock cannot be
            acquired within 10 seconds.
    """
    lock_path = csv_path + ".lock"
    lock = FileLock(lock_path, timeout=10)
    dir_name = os.path.dirname(csv_path) or "."

    try:
        # The lock file sits beside the CSV, so the directory has to exist
        # before the lock is taken. On a first save into a fresh output
        # directory nothing has created it yet, and we do not rely on
        # filelock doing so.
        os.makedirs(dir_name, exist_ok=True)

        with lock:
            # Read existing data
            rows: list[dict] = []
            path = Path(csv_path)
            if path.exists():
                with open(path, "r", encoding="utf-8", newline="") as f:
                    rows = list(csv.DictReader(f))

            new_row = {
                "source": record.source,
                "transcription": record.transcription,
                "gender": record.gender,
                "pii": str(record.pii),  # stored as "True"/"False" text
                "labeler": record.labeler,
            }

            # Upsert: overwrite existing row for this source+labeler, or append
            for i, row in enumerate(rows):
                if row["source"] == record.source and row["labeler"] == record.labeler:
                    rows[i] = new_row
                    break
            else:
                rows.append(new_row)

            # Write to a temp file in the same directory, then rename over
            # the target so readers never observe a half-written CSV.
            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
            try:
                with os.fdopen(fd, "w", encoding="utf-8", newline="") as f:
                    writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
                    writer.writeheader()
                    writer.writerows(rows)
                os.replace(tmp_path, csv_path)
                logger.info(f"Saved label for '{record.source}' by '{record.labeler}'")
            except Exception:
                # Clean up temp file on failure
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise

    except Timeout:
        logger.error(f"Lock timeout for CSV: {csv_path}")
        raise IOError("Failed to save annotation (file locked). Please try again.")
    except IOError:
        # IOError is an alias of OSError: OS-level failures propagate with
        # their original message.
        raise
    except Exception as e:
        logger.error(f"Failed to save label for {record.source}: {e}")
        raise IOError("Failed to save annotation. Please try again.") from e
|
data/audio/folder_A/10.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce48e25a4d9cbe9431b0b689b15ed16ea7c5e6e350a5b93fb89343b67f1dc5ac
|
| 3 |
+
size 347244
|
data/audio/folder_A/11.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:306a6cc6e882a689a93d90ccb1f1e62f7ce07437ab127499518760b45890de53
|
| 3 |
+
size 26284
|
data/audio/folder_B/12.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc6763766efe21c680ea048e7d1e0e6a51ec61d29fcac1b82c9d0bc18fdaa5d5
|
| 3 |
+
size 45484
|
data/audio/folder_B/13.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5fea62d7359eccc0ec083b7468bf52369a53ead8ffcd74c435915c50e772d2e7
|
| 3 |
+
size 69484
|
data/audio/references/folder_A.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"10.wav": "some transcription",
|
| 3 |
+
"11.wav": "another transcription"
|
| 4 |
+
}
|
data/audio/references/folder_B.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clip12_001.wav": "12 transcription",
|
| 3 |
+
"clip13_002.wav": "13 transcription"
|
| 4 |
+
}
|
docker-compose.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.8"
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
audio-labeling-tool:
|
| 5 |
+
build: .
|
| 6 |
+
ports:
|
| 7 |
+
- "8501:8501"
|
| 8 |
+
volumes:
|
| 9 |
+
- /data/audio:/data/audio:ro
|
| 10 |
+
- /data/references:/data/references:ro
|
| 11 |
+
- /data/output:/data/output:rw
|
| 12 |
+
- ./config.yaml:/app/config.yaml:ro
|
| 13 |
+
environment:
|
| 14 |
+
- ALT_CONFIG_PATH=/app/config.yaml
|
| 15 |
+
- LOG_DIR=/app/logs
|
| 16 |
+
restart: unless-stopped
|
models.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for the Audio Labeling Tool."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import TypedDict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LabelRow(TypedDict):
    """Shape of one row in the metadata CSV; every field is held as text."""

    # Audio filename only (e.g., "clip_001.wav")
    source: str
    # Corrected transcription text
    transcription: str
    # "male" or "female"
    gender: str
    # "True" or "False" (string representation in CSV)
    pii: str
    # Username of the labeler
    labeler: str
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class LabelRecord:
    """A single label held in memory before it is saved.

    Fields follow the CSV column order. ``pii`` is a real ``bool`` here.
    """

    source: str
    transcription: str
    gender: str
    pii: bool
    labeler: str
|
reference.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reference JSON loading and validation."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_reference(json_path: str) -> dict[str, str]:
    """Read a reference JSON file and check that it is a flat string map.

    Args:
        json_path: Path to the reference JSON file.

    Returns:
        Dictionary mapping audio filenames to transcription strings.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        ValueError: If the JSON is malformed or not a flat dict[str, str].
    """
    ref_file = Path(json_path)
    if not ref_file.exists():
        raise FileNotFoundError(f"Reference file not found: {json_path}")

    try:
        data = json.loads(ref_file.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        raise ValueError(f"Malformed JSON in {json_path}: unable to parse")

    if not isinstance(data, dict):
        type_name = type(data).__name__
        raise ValueError(f"Reference JSON must be a flat dict, got {type_name}")

    # Any nested object, list, number, bool or null value fails this check.
    only_strings = all(
        isinstance(name, str) and isinstance(text, str)
        for name, text in data.items()
    )
    if not only_strings:
        raise ValueError("All keys and values in reference JSON must be strings")

    logger.info(f"Loaded reference JSON: {json_path} ({len(data)} entries)")
    return data
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28
|
| 2 |
+
filelock>=3.12
|
| 3 |
+
pyyaml>=6.0
|
resume.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Checkpoint and resume logic."""
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_file_list(audio_folder: str) -> list[str]:
    """Build an ordered list of WAV filenames from the assigned audio folder.

    Only regular files whose extension is ".wav" (in any letter case) are
    included. Names are sorted with plain string ordering, which is
    case-sensitive and not numeric ("10.wav" sorts before "9.wav").

    Args:
        audio_folder: Path to the labeler's audio folder.

    Returns:
        Sorted list of WAV filenames; empty if the folder is missing or the
        path is not a directory.
    """
    folder_path = Path(audio_folder)
    # is_dir() instead of exists(): a path that exists but is a regular file
    # would otherwise reach iterdir() and raise NotADirectoryError.
    if not folder_path.is_dir():
        logger.warning(f"Audio folder not found: {audio_folder}")
        return []

    wav_files = sorted(
        entry.name
        for entry in folder_path.iterdir()
        if entry.suffix.lower() == ".wav" and entry.is_file()
    )
    logger.info(f"Found {len(wav_files)} WAV files in {audio_folder}")
    return wav_files
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def compute_resume_index(
    file_list: list[str],
    csv_path: str,
    labeler: str,
) -> int:
    """Find where a labeler should resume in their file list.

    Args:
        file_list: Ordered list of audio filenames.
        csv_path: Path to the labeler's CSV file.
        labeler: The labeler's username.

    Returns:
        Index of the first unlabeled file, or len(file_list) if all are labeled.
    """
    done: set[str] = set()

    csv_file = Path(csv_path)
    if csv_file.exists():
        try:
            with open(csv_file, "r", encoding="utf-8", newline="") as handle:
                for record in csv.DictReader(handle):
                    # Rows written by other labelers are ignored.
                    if record.get("labeler") == labeler:
                        done.add(record["source"])
        except Exception as e:
            # Best effort: an unreadable CSV is logged, and whatever was
            # collected before the failure is still used.
            logger.warning(f"Could not read CSV for resume: {e}")

    # First position whose file has no saved label; falls back to the list
    # length when every file is labeled.
    return next(
        (pos for pos, name in enumerate(file_list) if name not in done),
        len(file_list),
    )
|
skip_persistence.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Skip tracking persistence layer."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import csv
|
| 5 |
+
import tempfile
|
| 6 |
+
import logging
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from filelock import FileLock, Timeout
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
SKIP_CSV_COLUMNS = ["labeler", "source", "reason", "timestamp"]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def save_skip(labeler: str, source: str, reason: str, csv_path: str) -> None:
    """Save a skip record to the shared skipped_audios.csv.

    Uses filelock + temp-file-then-rename for atomicity. Implements upsert:
    a record for the same labeler+source replaces the earlier one. The
    directory holding the CSV is created if needed.

    Args:
        labeler: Username of the labeler who skipped.
        source: Audio filename.
        reason: Reason for skipping.
        csv_path: Path to the shared skipped_audios.csv.

    Raises:
        IOError: If the write operation fails or the lock cannot be
            acquired within 10 seconds.
    """
    lock_path = csv_path + ".lock"
    lock = FileLock(lock_path, timeout=10)
    dir_name = os.path.dirname(csv_path) or "."

    try:
        # The lock file sits beside the CSV, so the directory has to exist
        # before the lock is taken. On a first skip into a fresh output
        # directory nothing has created it yet, and we do not rely on
        # filelock doing so.
        os.makedirs(dir_name, exist_ok=True)

        with lock:
            rows: list[dict] = []
            path = Path(csv_path)
            if path.exists():
                with open(path, "r", encoding="utf-8", newline="") as f:
                    rows = list(csv.DictReader(f))

            new_row = {
                "labeler": labeler,
                "source": source,
                "reason": reason,
                # Local time without timezone information.
                "timestamp": datetime.now().isoformat(),
            }

            # Upsert: overwrite if same labeler+source already skipped
            for i, row in enumerate(rows):
                if row["labeler"] == labeler and row["source"] == source:
                    rows[i] = new_row
                    break
            else:
                rows.append(new_row)

            # Atomic write: temp file in the same directory, then rename
            # over the target so readers never observe a half-written CSV.
            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
            try:
                with os.fdopen(fd, "w", encoding="utf-8", newline="") as f:
                    writer = csv.DictWriter(f, fieldnames=SKIP_CSV_COLUMNS)
                    writer.writeheader()
                    writer.writerows(rows)
                os.replace(tmp_path, csv_path)
                logger.info(f"Saved skip for '{source}' by '{labeler}': {reason}")
            except Exception:
                # Clean up temp file on failure
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise

    except Timeout:
        logger.error(f"Lock timeout for skip CSV: {csv_path}")
        raise IOError("Failed to save skip record. Please try again.")
    except IOError:
        # IOError is an alias of OSError: OS-level failures propagate with
        # their original message.
        raise
    except Exception as e:
        logger.error(f"Failed to save skip for {source}: {e}")
        raise IOError("Failed to save skip record. Please try again.") from e
|