Fagan Valiyev committed on
Commit
d7efa84
·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.wav filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ EXPOSE 8501
11
+
12
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Labeling Tool
2
+
3
+ A Streamlit application for transcription correction and metadata annotation by data labelers.
4
+
5
+ ## Requirements
6
+
7
+ - Python 3.10+
8
+ - Docker & Docker Compose (for deployment)
9
+
10
+ ## Filesystem Structure
11
+
12
+ ```
13
+ /data/
14
+ ├── audio/
15
+ │ ├── folder_A/ # Labeler A's source WAV files
16
+ │ └── folder_B/ # Labeler B's source WAV files
17
+ ├── references/
18
+ │ ├── folder_A.json # {filename: transcription} for folder_A
19
+ │ └── folder_B.json # {filename: transcription} for folder_B
20
+ └── output/
21
+ ├── labeler_A/
22
+ │ ├── audios_A/ # Clean audios (copied on accept)
23
+ │ └── labeler_a_metadata.csv
24
+ └── labeler_B/
25
+ ├── audios_B/ # Clean audios (copied on accept)
26
+ └── labeler_b_metadata.csv
27
+ ```
28
+
29
+ ## Configuration
30
+
31
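+ Edit `config.yaml` to set labeler credentials and paths. The snippet below shows only the `labelers` section; the file must also define `admin` (`username` and `password`) and `shared_output_dir`, and may define `skip_reasons`: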
32
+
33
+ ```yaml
34
+ labelers:
35
+ labeler_a:
36
+ password: "your_password"
37
+ audio_folder: "/data/audio/folder_A"
38
+ reference_json: "/data/references/folder_A.json"
39
+ output_dir: "/data/output/labeler_A"
40
+ clean_audios_dir: "/data/output/labeler_A/audios_A"
41
+ labeler_b:
42
+ password: "your_password"
43
+ audio_folder: "/data/audio/folder_B"
44
+ reference_json: "/data/references/folder_B.json"
45
+ output_dir: "/data/output/labeler_B"
46
+ clean_audios_dir: "/data/output/labeler_B/audios_B"
47
+ ```
48
+
49
+ ## Environment Variables
50
+
51
+ | Variable | Purpose | Default |
52
+ |----------|---------|---------|
53
+ | `ALT_CONFIG_PATH` | Path to config.yaml | `/app/config.yaml` |
54
+ | `LOG_DIR` | Directory for log files | `/var/log/audio_labeling_tool` |
55
+
56
+ ## Deployment with Docker
57
+
58
+ ```bash
59
+ # Build and run
60
+ docker-compose up -d --build
61
+
62
+ # View logs
63
+ docker-compose logs -f
64
+ ```
65
+
66
+ The app will be available at `http://<server-ip>:8501`.
67
+
68
+ ## Volume Mounts
69
+
70
+ | Host Path | Container Path | Mode |
71
+ |-----------|---------------|------|
72
+ | `/data/audio` | `/data/audio` | read-only |
73
+ | `/data/references` | `/data/references` | read-only |
74
+ | `/data/output` | `/data/output` | read-write |
75
+ | `./config.yaml` | `/app/config.yaml` | read-only |
76
+
77
+ ## Filesystem Permissions
78
+
79
+ - Audio folders: read-only for the app process
80
+ - Output directory: read-write for the app process
81
+ - Config file: read-only
82
+
83
+ ## Local Development
84
+
85
+ ```bash
86
+ pip install -r requirements.txt
87
+ export ALT_CONFIG_PATH=./config.yaml
88
+ export LOG_DIR=./logs
89
+ streamlit run app.py
90
+ ```
admin_panel.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Admin panel for tracking labeler progress and activity."""
2
+
3
+ import csv
4
+ import logging
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import streamlit as st
9
+
10
+ from config import load_config
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def render_admin_panel():
    """Draw the admin dashboard: a title/logout header followed by one tab per report."""
    cfg = load_config()

    # Header row: wide column for the title, narrow column for the logout button.
    title_col, logout_col = st.columns([4, 1])
    with title_col:
        st.title("Admin Panel")
    with logout_col:
        from auth import logout

        logout_clicked = st.button("Logout")
        if logout_clicked:
            logout()
            st.rerun()

    # Each tab label is paired with the function that fills it, in display order.
    views = (
        ("Progress", _render_progress),
        ("Metadata CSVs", _render_metadata_viewer),
        ("Skipped Audios", _render_skipped_viewer),
        ("Activity Log", _render_activity_log),
        ("Downloads", _render_downloads),
    )
    tabs = st.tabs([label for label, _ in views])
    for tab, (_, render_view) in zip(tabs, views):
        with tab:
            render_view(cfg)
48
+
49
+
50
def _read_csv_rows(csv_path: str) -> list[dict]:
    """Load every data row of a CSV file as dicts keyed by the header row.

    Returns an empty list when the file does not exist. Any error while
    opening or parsing the file is logged as a warning and also yields an
    empty list, so callers never have to handle a read failure.
    """
    source = Path(csv_path)
    if not source.exists():
        return []

    try:
        with open(source, "r", encoding="utf-8", newline="") as handle:
            return list(csv.DictReader(handle))
    except Exception as e:
        logger.warning(f"Could not read CSV {csv_path}: {e}")
        return []
62
+
63
+
64
def _count_audio_files(audio_folder: str) -> int:
    """Count the WAV files directly inside *audio_folder*.

    Only regular files whose suffix is ``.wav`` (case-insensitive) are
    counted; sub-directories are not descended into.

    Returns:
        The number of WAV files, or 0 when the path is missing or is not a
        directory.
    """
    folder = Path(audio_folder)
    # is_dir() rather than exists(): a path that points at a regular file
    # would otherwise reach iterdir() and raise NotADirectoryError.
    if not folder.is_dir():
        return 0
    return sum(
        1
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".wav"
    )
70
+
71
+
72
def _render_progress(config: dict):
    """Render per-labeler progress metrics and a progress bar.

    For each labeler in ``config["labelers"]`` this shows the number of WAV
    files in the assigned folder, the number of metadata rows, the number of
    distinct skipped files and the number of files still untouched.

    A file counts as handled once, even if it appears several times in the
    skip CSV or was skipped and labeled afterwards; otherwise "Remaining" and
    the progress bar would over-report completed work.

    Args:
        config: Loaded application config; reads ``shared_output_dir`` and,
            per labeler, ``audio_folder`` and ``output_dir``.
    """
    st.subheader("Labeler Progress")

    skip_csv_path = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    skip_rows = _read_csv_rows(skip_csv_path)

    for labeler_name, labeler_cfg in config["labelers"].items():
        st.markdown(f"### {labeler_name}")

        total_files = _count_audio_files(labeler_cfg["audio_folder"])
        csv_path = os.path.join(labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv")
        labeled_rows = _read_csv_rows(csv_path)
        labeled_count = len(labeled_rows)

        # Work with sets of filenames so duplicates never inflate the totals.
        labeled_sources = {row.get("source") for row in labeled_rows}
        skipped_sources = {
            row.get("source") for row in skip_rows if row.get("labeler") == labeler_name
        }
        skipped_count = len(skipped_sources)
        handled_count = len(labeled_sources | skipped_sources)

        remaining = max(0, total_files - handled_count)

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Total", total_files)
        col2.metric("Labeled", labeled_count)
        col3.metric("Skipped", skipped_count)
        col4.metric("Remaining", remaining)

        if total_files > 0:
            # Clamp: the CSVs may reference files no longer in the folder.
            st.progress(min(handled_count / total_files, 1.0))
        else:
            st.progress(0.0)

        st.divider()
106
+
107
+
108
def _render_metadata_viewer(config: dict):
    """Show each labeler's metadata CSV inside its own collapsed expander."""
    st.subheader("Metadata CSVs")

    for name, cfg in config["labelers"].items():
        metadata_csv = os.path.join(cfg["output_dir"], f"{name}_metadata.csv")
        entries = _read_csv_rows(metadata_csv)

        with st.expander(f"{name} — {len(entries)} entries", expanded=False):
            if not entries:
                st.info("No labels recorded yet.")
            else:
                st.dataframe(entries, use_container_width=True)
121
+
122
+
123
def _render_skipped_viewer(config: dict):
    """Show the shared skipped-audios CSV with an optional per-labeler filter."""
    st.subheader("Skipped Audios")

    skipped_csv = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    entries = _read_csv_rows(skipped_csv)

    # Nothing recorded: show the placeholder and leave the filter out entirely.
    if not entries:
        st.info("No skipped audios recorded yet.")
        return

    choices = ["All", *config["labelers"].keys()]
    chosen = st.selectbox("Filter by labeler", options=choices, key="skip_filter")

    if chosen != "All":
        entries = [entry for entry in entries if entry.get("labeler") == chosen]

    st.dataframe(entries, use_container_width=True)
    st.caption(f"Total skipped: {len(entries)}")
145
+
146
+
147
def _render_activity_log(config: dict):
    """Show a combined table of label and skip actions, newest skips first.

    Label rows come from each labeler's metadata CSV and carry an empty
    timestamp; skip rows come from the shared skip CSV and keep theirs.
    At most the first 50 entries after sorting are displayed.
    """
    st.subheader("Activity Log")

    events = []

    # Label events: the timestamp is left blank for these rows.
    for name, cfg in config["labelers"].items():
        metadata_csv = os.path.join(cfg["output_dir"], f"{name}_metadata.csv")
        events.extend(
            {
                "labeler": entry.get("labeler", name),
                "action": "labeled",
                "source": entry.get("source", ""),
                "details": f"gender={entry.get('gender', '')}, pii={entry.get('pii', '')}",
                "timestamp": "",
            }
            for entry in _read_csv_rows(metadata_csv)
        )

    # Skip events: these rows provide a timestamp column.
    skipped_csv = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    events.extend(
        {
            "labeler": entry.get("labeler", ""),
            "action": "skipped",
            "source": entry.get("source", ""),
            "details": entry.get("reason", ""),
            "timestamp": entry.get("timestamp", ""),
        }
        for entry in _read_csv_rows(skipped_csv)
    )

    # Descending by timestamp; blank timestamps sort as "0" and so land last.
    events = sorted(events, key=lambda ev: ev["timestamp"] or "0", reverse=True)

    if not events:
        st.info("No activity recorded yet.")
        return

    st.dataframe(events[:50], use_container_width=True)
    st.caption(f"Showing latest {min(50, len(events))} of {len(events)} total actions.")
189
+
190
+
191
def _render_file_download(path: Path, file_name: str, mime: str, key: str) -> None:
    """Show a download button for *path*, or a caption when it is absent or unreadable."""
    if not path.exists():
        st.caption(f"{file_name} — not yet created")
        return
    try:
        data = path.read_bytes()
    except OSError as e:
        # A permission or I/O problem on one file must not take down the whole tab.
        logger.warning(f"Could not read {path} for download: {e}")
        st.caption(f"{file_name} — could not be read")
        return
    st.download_button(
        label=f"📥 {file_name}",
        data=data,
        file_name=file_name,
        mime=mime,
        key=key,
    )


def _render_downloads(config: dict):
    """Render download buttons for the report CSVs and the application log.

    Offers, in order: one metadata CSV per labeler, the shared
    ``skipped_audios.csv``, and ``app.log`` from the directory named by the
    ``LOG_DIR`` environment variable (default ``/var/log/audio_labeling_tool``).
    Files that do not exist yet are listed with a "not yet created" caption.

    Args:
        config: Loaded application config; reads ``labelers`` (each with
            ``output_dir``) and ``shared_output_dir``.
    """
    st.subheader("Download Reports")

    # Metadata CSVs
    for labeler_name, labeler_cfg in config["labelers"].items():
        file_name = f"{labeler_name}_metadata.csv"
        _render_file_download(
            Path(os.path.join(labeler_cfg["output_dir"], file_name)),
            file_name,
            "text/csv",
            f"dl_metadata_{labeler_name}",
        )

    st.divider()

    # Skipped CSV
    _render_file_download(
        Path(os.path.join(config["shared_output_dir"], "skipped_audios.csv")),
        "skipped_audios.csv",
        "text/csv",
        "dl_skipped",
    )

    st.divider()

    # App log file
    log_dir = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
    _render_file_download(Path(log_dir) / "app.log", "app.log", "text/plain", "dl_log")
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio Labeling Tool — Streamlit Application Entry Point."""
2
+
3
+ import base64
4
+ import logging
5
+ import os
6
+
7
+ import streamlit as st
8
+
9
+ from admin_panel import render_admin_panel
10
+ from auth import authenticate, login, logout
11
+ from audio_loader import copy_to_clean, load_audio_bytes
12
+ from config import load_config
13
+ from csv_persistence import save_label
14
+ from models import LabelRecord
15
+ from reference import load_reference
16
+ from resume import build_file_list, compute_resume_index
17
+ from skip_persistence import save_skip
18
+
19
# --- Logging Setup ---
LOG_DIR = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
os.makedirs(LOG_DIR, exist_ok=True)

# Streamlit re-executes this script on every interaction. basicConfig() does
# nothing once the root logger has handlers, so building the handlers
# unconditionally would open app.log again on each rerun only to discard it.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        handlers=[
            logging.FileHandler(os.path.join(LOG_DIR, "app.log")),
            logging.StreamHandler(),
        ],
    )
logger = logging.getLogger(__name__)
32
+
33
+ # --- Page Config ---
34
+ st.set_page_config(page_title="Audio Labeling Tool", layout="wide")
35
+
36
+
37
def init_session_state():
    """Seed the session with logged-out defaults for any key not yet present."""
    defaults = {
        "authenticated": False,
        "username": None,
        "role": None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
45
+
46
+
47
def render_login():
    """Render the login form and sign the user in on valid credentials.

    Shows username and password inputs. When "Login" is pressed with both
    fields filled, the credentials are checked with ``authenticate``; on
    success the session is marked logged in and the script reruns.

    An unexpected failure during the check (for example an unreadable or
    invalid configuration) is logged with its traceback and reported as a
    service problem, rather than being presented as a wrong password.
    """
    st.title("Audio Labeling Tool")
    st.subheader("Login")

    username = st.text_input("Username", key="login_username")
    password = st.text_input("Password", type="password", key="login_password")

    if not st.button("Login"):
        return

    if not username or not password:
        st.error("Please enter both username and password.")
        return

    # Only the credential check is guarded, so the rerun below is never
    # caught by this handler.
    try:
        role = authenticate(username, password)
    except Exception:
        logger.exception("Authentication could not be completed.")
        st.error("Login is currently unavailable. Please contact admin.")
        return

    if role:
        login(username, role)
        st.rerun()
    else:
        st.error("Invalid username or password.")
69
+
70
+
71
def get_labeler_config(username: str) -> dict:
    """Return the ``labelers`` entry for *username* from the loaded config."""
    labelers = load_config()["labelers"]
    return labelers[username]
75
+
76
+
77
def initialize_labeling_session(username: str):
    """Prepare the labeling session state for *username* (runs once per session).

    Builds the file list, loads the reference transcriptions and computes the
    resume index, then stores them in ``st.session_state`` together with the
    labeler's paths. A missing or invalid reference file is not raised: the
    reference becomes ``{}`` and the message is stored under
    ``reference_error``.

    All values are computed before anything is written, and ``file_list`` (the
    "already initialized" marker) is written last. If a step raises, the next
    rerun starts again from scratch instead of finding a half-filled session.

    Args:
        username: Key of the labeler in the config's ``labelers`` mapping.
    """
    if "file_list" in st.session_state:
        return  # Already initialized

    labeler_cfg = get_labeler_config(username)
    audio_folder = labeler_cfg["audio_folder"]
    reference_json = labeler_cfg["reference_json"]
    output_dir = labeler_cfg["output_dir"]
    clean_audios_dir = labeler_cfg["clean_audios_dir"]
    csv_path = os.path.join(output_dir, f"{username}_metadata.csv")

    # Build file list
    file_list = build_file_list(audio_folder)

    # Load reference JSON
    try:
        reference = load_reference(reference_json)
        reference_error = None
    except (FileNotFoundError, ValueError) as e:
        reference = {}
        reference_error = str(e)

    # Compute resume index
    resume_index = compute_resume_index(file_list, csv_path, username)

    # Commit everything; the marker key goes last.
    st.session_state["audio_folder"] = audio_folder
    st.session_state["csv_path"] = csv_path
    st.session_state["output_dir"] = output_dir
    st.session_state["clean_audios_dir"] = clean_audios_dir
    st.session_state["reference"] = reference
    st.session_state["reference_error"] = reference_error
    st.session_state["current_index"] = resume_index
    st.session_state["file_list"] = file_list
108
+
109
+
110
def render_audio_player(audio_bytes: bytes) -> None:
    """Render an HTML5 audio player for the given WAV bytes.

    The audio is base64-encoded and embedded as a ``data:audio/wav`` URI in an
    ``<audio id="audio-player">`` element, followed by a script that copies the
    value of an element with id ``playback-rate`` to the player's playback rate.

    Args:
        audio_bytes: Raw contents of the audio file to play.
    """
    # Inline the whole file as base64 so the browser needs no separate URL.
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
    # NOTE(review): no element with id 'playback-rate' is created in this
    # block, and whether a <script> passed through st.markdown executes at all
    # should be confirmed against the Streamlit version in use.
    audio_html = f"""
    <audio id="audio-player" controls style="width: 100%;">
    <source src="data:audio/wav;base64,{audio_b64}" type="audio/wav">
    Your browser does not support the audio element.
    </audio>
    <script>
    var audio = document.getElementById('audio-player');
    var rate = document.getElementById('playback-rate');
    if (rate) {{
    audio.playbackRate = parseFloat(rate.value);
    }}
    </script>
    """
    st.markdown(audio_html, unsafe_allow_html=True)
127
+
128
+
129
def render_labeling_ui():
    """Render the main labeling interface for the logged-in labeler.

    Flow per rerun:
      1. Header with a logout button.
      2. One-time session initialization; stops with an error if the reference
         file could not be loaded, or a warning if there are no audio files.
      3. If the pointer is past the last file, a completion message.
      4. Otherwise the audio player, speed slider, transcription box and
         gender/PII inputs for the current file.
      5. Previous / Next / Apply / Skip buttons, plus the skip-reason dialog.

    Apply writes the label and copies the audio, advancing only if both
    succeed. Skip records the reason in the shared skip CSV and advances.

    Compared with the first version of this function:
      * Skip is available on the last file too. Apply already advances past the
        end into the completion view, and a bad final file could otherwise
        never be skipped.
      * Moving to another file (Previous, Next, Apply) closes the skip dialog,
        so a dialog opened for one file is not confirmed against another.
    """
    username = st.session_state["username"]

    # Header with logout
    col_title, col_logout = st.columns([4, 1])
    with col_title:
        st.title("Audio Labeling Tool")
    with col_logout:
        if st.button("Logout"):
            logout()
            st.rerun()

    # Initialize session
    initialize_labeling_session(username)

    # Check for reference loading error
    if st.session_state.get("reference_error"):
        st.error(
            "Reference file is corrupted or missing. Please contact admin."
        )
        st.stop()

    file_list = st.session_state["file_list"]
    current_index = st.session_state["current_index"]
    audio_folder = st.session_state["audio_folder"]
    reference = st.session_state["reference"]

    # Handle completion
    if not file_list:
        st.warning("No audio files found in your assigned folder.")
        st.stop()

    if current_index >= len(file_list):
        st.success("All items have been labeled! You're done.")
        if st.button("← Go to last item"):
            st.session_state["current_index"] = len(file_list) - 1
            st.rerun()
        st.stop()

    # Current file info
    current_filename = file_list[current_index]

    # Position indicator
    st.markdown(f"**{current_index + 1} / {len(file_list)}** — `{current_filename}`")

    # Audio player; a missing file is reported but the rest of the form still
    # renders so the labeler can skip the item.
    try:
        audio_bytes = load_audio_bytes(audio_folder, current_filename)
        render_audio_player(audio_bytes)
    except FileNotFoundError:
        st.error(f"Audio file not found: {current_filename}. Please contact admin.")

    # Speed control
    speed = st.select_slider(
        "Playback Speed",
        options=[0.5, 0.75, 1.0, 1.25, 1.5, 2.0],
        value=1.0,
        key="speed_slider",
    )
    # Update playback rate via JS
    # NOTE(review): confirm that scripts injected through st.markdown actually
    # run in the deployed Streamlit version.
    st.markdown(
        f"""<script>
        var audio = document.getElementById('audio-player');
        if (audio) {{ audio.playbackRate = {speed}; }}
        </script>""",
        unsafe_allow_html=True,
    )

    st.divider()

    # Transcription: pre-filled from the reference; warn when there is no entry.
    default_transcription = reference.get(current_filename, "")
    if current_filename not in reference:
        st.warning(f"No transcription found for: {current_filename}")

    # Widget keys include the index so each file gets fresh widgets.
    transcription = st.text_area(
        "Transcription",
        value=default_transcription,
        height=100,
        key=f"transcription_{current_index}",
    )

    # Metadata
    col_gender, col_pii = st.columns(2)
    with col_gender:
        gender = st.radio(
            "Gender",
            options=["male", "female"],
            key=f"gender_{current_index}",
        )
    with col_pii:
        pii = st.checkbox("Contains PII", key=f"pii_{current_index}")

    st.divider()

    # Navigation and action buttons
    col_prev, col_next, col_apply, col_skip = st.columns(4)

    with col_prev:
        prev_disabled = current_index <= 0
        if st.button("← Previous", disabled=prev_disabled):
            st.session_state["show_skip_reason"] = False
            st.session_state["current_index"] = current_index - 1
            st.rerun()

    with col_next:
        next_disabled = current_index >= len(file_list) - 1
        if st.button("Next →", disabled=next_disabled):
            st.session_state["show_skip_reason"] = False
            st.session_state["current_index"] = current_index + 1
            st.rerun()

    with col_apply:
        if st.button("✓ Apply", type="primary"):
            # Build record
            record = LabelRecord(
                source=current_filename,
                transcription=transcription,
                gender=gender,
                pii=pii,
                labeler=username,
            )

            csv_path = st.session_state["csv_path"]
            clean_audios_dir = st.session_state["clean_audios_dir"]

            try:
                # Save to CSV
                save_label(record, csv_path)
                # Copy audio to clean folder
                copy_to_clean(audio_folder, current_filename, clean_audios_dir)
                # Advance pointer only on success
                st.session_state["show_skip_reason"] = False
                st.session_state["current_index"] = current_index + 1
                st.rerun()
            except IOError as e:
                st.error(str(e))
                # Pointer NOT advanced

    with col_skip:
        # Enabled on every file, including the last one.
        if st.button("Skip ✗"):
            st.session_state["show_skip_reason"] = True
            st.rerun()

    # Skip reason dialog
    if st.session_state.get("show_skip_reason", False):
        st.divider()
        st.markdown("**Why are you skipping this audio?**")

        config = load_config()
        skip_reasons = config.get("skip_reasons", ["Other"])

        reason_choice = st.selectbox(
            "Select reason",
            options=skip_reasons,
            key=f"skip_reason_select_{current_index}",
        )

        # "Other" requires a free-text explanation.
        custom_reason = ""
        if reason_choice == "Other":
            custom_reason = st.text_input(
                "Please specify:", key=f"skip_custom_reason_{current_index}"
            )

        col_confirm, col_cancel = st.columns(2)
        with col_confirm:
            if st.button("Confirm Skip"):
                final_reason = custom_reason if reason_choice == "Other" else reason_choice
                if reason_choice == "Other" and not custom_reason.strip():
                    st.error("Please provide a reason.")
                else:
                    shared_output_dir = config["shared_output_dir"]
                    skip_csv_path = os.path.join(shared_output_dir, "skipped_audios.csv")
                    try:
                        save_skip(username, current_filename, final_reason, skip_csv_path)
                        st.session_state["show_skip_reason"] = False
                        st.session_state["current_index"] = current_index + 1
                        st.rerun()
                    except IOError as e:
                        st.error(str(e))

        with col_cancel:
            if st.button("Cancel"):
                st.session_state["show_skip_reason"] = False
                st.rerun()
313
+
314
+
315
def main():
    """Entry point: show the login view, the admin panel or the labeling UI."""
    init_session_state()

    # Not logged in yet: only the login form is shown.
    if not st.session_state["authenticated"]:
        render_login()
        return

    if st.session_state.get("role") == "admin":
        render_admin_panel()
    else:
        render_labeling_ui()
327
+
328
+
329
+ if __name__ == "__main__":
330
+ main()
audio_loader.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lazy per-item audio loading and clean audio copy."""
2
+
3
+ import shutil
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def load_audio_bytes(folder_path: str, filename: str) -> bytes:
    """Load audio bytes for a single file on demand.

    Args:
        folder_path: Path to the audio folder.
        filename: Name of the WAV file.

    Returns:
        Raw bytes of the audio file.

    Raises:
        FileNotFoundError: If no regular file with that name exists in the
            folder (this includes the name referring to a directory).
    """
    audio_path = Path(folder_path) / filename
    # is_file() rather than exists(): a directory would pass exists() and then
    # fail in read_bytes() with an error callers are not prepared for.
    if not audio_path.is_file():
        raise FileNotFoundError(f"Audio file not found: {filename}")

    return audio_path.read_bytes()
28
+
29
+
30
def copy_to_clean(source_folder: str, filename: str, clean_audios_dir: str) -> None:
    """Copy accepted audio to the clean audios folder.

    Creates the clean audios directory if it doesn't exist. Directory creation
    happens inside the same error handling as the copy, so every failure is
    logged and surfaces with the same message.

    Args:
        source_folder: Path to the source audio folder.
        filename: Name of the WAV file to copy.
        clean_audios_dir: Path to the destination clean audios folder.

    Raises:
        IOError: If the destination cannot be created or the copy fails.
    """
    src = Path(source_folder) / filename
    dst_dir = Path(clean_audios_dir)
    dst = dst_dir / filename

    try:
        dst_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(str(src), str(dst))
        logger.info(f"Copied '{filename}' to clean audios: {clean_audios_dir}")
    except Exception as e:
        logger.error(f"Failed to copy '{filename}' to clean audios: {e}")
        raise IOError(f"Failed to copy audio file '{filename}'.") from e
auth.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Authentication logic for the Audio Labeling Tool."""
2
+
3
+ import logging
4
+
5
+ import streamlit as st
6
+
7
+ from config import load_config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def authenticate(username: str, password: str) -> str | None:
    """Validate credentials against config.

    Admin credentials are checked first, then the labeler whose config key
    equals *username*. Secrets are compared with ``hmac.compare_digest`` so
    the comparison time does not depend on how many leading characters match.
    Configured values are compared by their string form, so a YAML scalar that
    was parsed as a number (e.g. an unquoted numeric password) still matches
    the text the user typed.

    Args:
        username: Username entered in the login form.
        password: Password entered in the login form.

    Returns:
        "admin" if admin credentials, "labeler" if labeler credentials, None if invalid.
    """
    import hmac

    def _matches(expected, supplied) -> bool:
        # compare_digest needs bytes (or ASCII-only str); encode to allow any text.
        return hmac.compare_digest(
            str(expected).encode("utf-8"), str(supplied).encode("utf-8")
        )

    config = load_config()

    # Check admin; both comparisons always run so neither field is revealed
    # by short-circuit timing.
    admin_cfg = config["admin"]
    user_ok = _matches(admin_cfg["username"], username)
    pass_ok = _matches(admin_cfg["password"], password)
    if user_ok and pass_ok:
        return "admin"

    # Check labelers
    labeler = config["labelers"].get(username)
    if labeler is not None and _matches(labeler["password"], password):
        return "labeler"

    return None
31
+
32
+
33
def login(username: str, role: str) -> None:
    """Record a successful login in the session state and log it."""
    session_values = (
        ("authenticated", True),
        ("username", username),
        ("role", role),
    )
    for key, value in session_values:
        st.session_state[key] = value
    logger.info(f"User '{username}' logged in as '{role}'.")
39
+
40
+
41
def logout() -> None:
    """Wipe the whole session state and mark the session as logged out."""
    # Capture the name first; it is gone once the state is cleared.
    departing_user = st.session_state.get("username", "unknown")

    # Snapshot the keys so entries can be deleted while looping.
    stale_keys = list(st.session_state.keys())
    for stale_key in stale_keys:
        del st.session_state[stale_key]

    st.session_state["authenticated"] = False
    logger.info(f"User '{departing_user}' logged out.")
config.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration loading and validation for the Audio Labeling Tool."""
2
+
3
+ import os
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ DEFAULT_CONFIG_PATH = "/app/config.yaml"
12
+
13
+
14
def load_config() -> dict:
    """Read, validate and return the application configuration.

    The file location is taken from the ALT_CONFIG_PATH environment variable
    when set, otherwise from DEFAULT_CONFIG_PATH.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the parsed content fails validation.
    """
    config_path = os.environ.get("ALT_CONFIG_PATH", DEFAULT_CONFIG_PATH)
    config_file = Path(config_path)

    if not config_file.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with open(config_file, "r", encoding="utf-8") as stream:
        loaded = yaml.safe_load(stream)

    _validate_config(loaded)
    return loaded
32
+
33
+
34
def _validate_config(config: dict) -> None:
    """Validate config structure.

    Checks, in order: the top level is a mapping; ``labelers`` exists and is a
    mapping with exactly two entries; every labeler is a mapping containing
    ``password``, ``audio_folder``, ``reference_json``, ``output_dir`` and
    ``clean_audios_dir``; ``admin`` exists with ``username`` and ``password``;
    ``shared_output_dir`` exists.

    Wrongly shaped input (an empty YAML file parses to ``None``; a section
    written as a list or scalar) is reported as ``ValueError`` like every
    other validation failure, instead of leaking ``TypeError`` or
    ``AttributeError``.

    Args:
        config: The parsed configuration object.

    Raises:
        ValueError: On the first structural problem found.
    """
    if not isinstance(config, dict):
        raise ValueError("Config must be a mapping of settings")

    if "labelers" not in config:
        raise ValueError("Config must contain 'labelers' key")
    labelers = config["labelers"]
    if not isinstance(labelers, dict) or len(labelers) != 2:
        raise ValueError("Exactly 2 labelers must be configured")

    required_labeler_keys = (
        "password",
        "audio_folder",
        "reference_json",
        "output_dir",
        "clean_audios_dir",
    )
    for name, labeler_cfg in labelers.items():
        if not isinstance(labeler_cfg, dict):
            raise ValueError(f"Labeler '{name}' must be a mapping of settings")
        for key in required_labeler_keys:
            if key not in labeler_cfg:
                raise ValueError(f"Labeler '{name}' missing '{key}'")

    if "admin" not in config:
        raise ValueError("Config must contain 'admin' key")
    admin_cfg = config["admin"]
    if (
        not isinstance(admin_cfg, dict)
        or "username" not in admin_cfg
        or "password" not in admin_cfg
    ):
        raise ValueError("Admin must have 'username' and 'password'")

    if "shared_output_dir" not in config:
        raise ValueError("Config must contain 'shared_output_dir'")
config.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ labelers:
2
+ labeler_a:
3
+ password: "password_a"
4
+ audio_folder: "/data/audio/folder_A"
5
+ reference_json: "/data/references/folder_A.json"
6
+ output_dir: "/data/output/labeler_A"
7
+ clean_audios_dir: "/data/output/labeler_A/audios_A"
8
+ labeler_b:
9
+ password: "password_b"
10
+ audio_folder: "/data/audio/folder_B"
11
+ reference_json: "/data/references/folder_B.json"
12
+ output_dir: "/data/output/labeler_B"
13
+ clean_audios_dir: "/data/output/labeler_B/audios_B"
14
+
15
+ admin:
16
+ username: "admin"
17
+ password: "admin_password"
18
+
19
+ skip_reasons:
20
+ - "Noisy audio"
21
+ - "Too short"
22
+ - "Unintelligible"
23
+ - "Wrong language"
24
+ - "Silence / no speech"
25
+ - "Corrupted file"
26
+ - "Other"
27
+
28
+ shared_output_dir: "/data/output"
csv_persistence.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV persistence layer with file locking and atomic writes."""
2
+
3
+ import os
4
+ import csv
5
+ import tempfile
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from filelock import FileLock, Timeout
10
+
11
+ from models import LabelRecord
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ CSV_COLUMNS = ["source", "transcription", "gender", "pii", "labeler"]
16
+
17
+
18
def save_label(record: LabelRecord, csv_path: str) -> None:
    """Save a label record to CSV with file locking and atomic write.

    Uses filelock for serialization and temp-file-then-rename for atomicity.
    Implements upsert: overwrites existing row for same source+labeler,
    or appends if new.

    The target directory is created before the lock is taken, because the
    lock file (``<csv_path>.lock``) lives in that directory and cannot be
    created on a fresh output tree otherwise.

    Args:
        record: The label record to save.
        csv_path: Path to the target CSV file.

    Raises:
        IOError: If the write operation fails or the lock cannot be obtained
            within 10 seconds.
    """
    lock_path = csv_path + ".lock"
    lock = FileLock(lock_path, timeout=10)
    dir_name = os.path.dirname(csv_path) or "."

    try:
        os.makedirs(dir_name, exist_ok=True)

        with lock:
            # Read existing data
            rows: list[dict] = []
            path = Path(csv_path)
            if path.exists():
                with open(path, "r", encoding="utf-8", newline="") as f:
                    rows = list(csv.DictReader(f))

            # Upsert: overwrite existing row for this source+labeler, or append
            new_row = {
                "source": record.source,
                "transcription": record.transcription,
                "gender": record.gender,
                "pii": str(record.pii),
                "labeler": record.labeler,
            }

            for i, row in enumerate(rows):
                # .get(): a row lacking these columns simply never matches.
                if row.get("source") == record.source and row.get("labeler") == record.labeler:
                    rows[i] = new_row
                    break
            else:
                rows.append(new_row)

            # Write to a temp file in the same directory, then rename over the
            # target, so readers never observe a partially written CSV.
            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
            try:
                with os.fdopen(fd, "w", encoding="utf-8", newline="") as f:
                    writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
                    writer.writeheader()
                    writer.writerows(rows)
                os.replace(tmp_path, csv_path)
                logger.info(f"Saved label for '{record.source}' by '{record.labeler}'")
            except Exception:
                # Clean up temp file on failure
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise

    except Timeout:
        logger.error(f"Lock timeout for CSV: {csv_path}")
        raise IOError("Failed to save annotation (file locked). Please try again.")
    except IOError:
        raise
    except Exception as e:
        logger.error(f"Failed to save label for {record.source}: {e}")
        raise IOError("Failed to save annotation. Please try again.") from e
data/audio/folder_A/10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce48e25a4d9cbe9431b0b689b15ed16ea7c5e6e350a5b93fb89343b67f1dc5ac
3
+ size 347244
data/audio/folder_A/11.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:306a6cc6e882a689a93d90ccb1f1e62f7ce07437ab127499518760b45890de53
3
+ size 26284
data/audio/folder_B/12.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc6763766efe21c680ea048e7d1e0e6a51ec61d29fcac1b82c9d0bc18fdaa5d5
3
+ size 45484
data/audio/folder_B/13.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fea62d7359eccc0ec083b7468bf52369a53ead8ffcd74c435915c50e772d2e7
3
+ size 69484
data/audio/references/folder_A.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "10.wav": "some transcription",
3
+ "11.wav": "another transcription"
4
+ }
data/audio/references/folder_B.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "clip12_001.wav": "12 transcription",
3
+ "clip13_002.wav": "13 transcription"
4
+ }
docker-compose.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.8"
2
+
3
+ services:
4
+ audio-labeling-tool:
5
+ build: .
6
+ ports:
7
+ - "8501:8501"
8
+ volumes:
9
+ - /data/audio:/data/audio:ro
10
+ - /data/references:/data/references:ro
11
+ - /data/output:/data/output:rw
12
+ - ./config.yaml:/app/config.yaml:ro
13
+ environment:
14
+ - ALT_CONFIG_PATH=/app/config.yaml
15
+ - LOG_DIR=/app/logs
16
+ restart: unless-stopped
models.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for the Audio Labeling Tool."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import TypedDict
5
+
6
+
7
class LabelRow(TypedDict):
    """One row of the metadata CSV, with every column held as text."""

    # Audio filename only, without any directory part (e.g., "clip_001.wav").
    source: str
    # Corrected transcription text.
    transcription: str
    # "male" or "female".
    gender: str
    # "True" or "False" -- the string form of the boolean, as written to CSV.
    pii: str
    # Username of the labeler.
    labeler: str
15
+
16
+
17
@dataclass
class LabelRecord:
    """In-memory label, assembled before it is saved.

    Unlike the CSV row representation, ``pii`` is kept as a real boolean here.
    """

    source: str  # audio filename
    transcription: str  # transcription text for the clip
    gender: str
    pii: bool
    labeler: str  # username of the labeler
reference.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Reference JSON loading and validation."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def load_reference(json_path: str) -> dict[str, str]:
    """Load and validate a reference JSON file.

    The file must hold a single flat JSON object whose keys and values are
    all strings.

    Args:
        json_path: Path to the reference JSON file.

    Returns:
        Dictionary mapping audio filenames to transcription strings.

    Raises:
        FileNotFoundError: If json_path is not an existing regular file.
        ValueError: If the file cannot be decoded/parsed as UTF-8 JSON, or
            is not a flat dict[str, str]. For parse failures the underlying
            decoder error is attached as ``__cause__``.
    """
    path = Path(json_path)
    # is_file() rather than exists(): a directory at this path should be
    # reported as a missing reference file, not blow up inside open().
    if not path.is_file():
        raise FileNotFoundError(f"Reference file not found: {json_path}")

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Chain the original error so its position info stays in the traceback.
        raise ValueError(f"Malformed JSON in {json_path}: unable to parse") from e

    if not isinstance(data, dict):
        raise ValueError(
            f"Reference JSON must be a flat dict, got {type(data).__name__}"
        )

    for key, value in data.items():
        if not isinstance(key, str) or not isinstance(value, str):
            # Name the entry so a bad file can be fixed without bisecting it.
            raise ValueError(
                "All keys and values in reference JSON must be strings "
                f"(offending entry: {key!r})"
            )

    logger.info(f"Loaded reference JSON: {json_path} ({len(data)} entries)")
    return data
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit>=1.28
2
+ filelock>=3.12
3
+ pyyaml>=6.0
resume.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Checkpoint and resume logic."""
2
+
3
+ import csv
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def build_file_list(audio_folder: str) -> list[str]:
    """Build an ordered list of WAV filenames from the assigned audio folder.

    Only regular files directly inside the folder are listed; the ".wav"
    extension is matched case-insensitively. Names are sorted with Python's
    default string ordering.

    Args:
        audio_folder: Path to the labeler's audio folder.

    Returns:
        Sorted list of WAV filenames (names only, no directory part). Empty
        if the folder does not exist or the path is not a directory.
    """
    folder_path = Path(audio_folder)
    # is_dir() rather than exists(): a regular file at this path would pass
    # an exists() check and then make iterdir() raise NotADirectoryError.
    if not folder_path.is_dir():
        logger.warning(f"Audio folder not found: {audio_folder}")
        return []

    wav_files = sorted(
        entry.name
        for entry in folder_path.iterdir()
        # Cheap suffix test first; is_file() touches the filesystem.
        if entry.suffix.lower() == ".wav" and entry.is_file()
    )
    logger.info(f"Found {len(wav_files)} WAV files in {audio_folder}")
    return wav_files
33
+
34
+
35
def compute_resume_index(
    file_list: list[str],
    csv_path: str,
    labeler: str,
) -> int:
    """Find the position at which the labeler should resume.

    Collects every ``source`` recorded in the CSV for the given labeler and
    returns the position of the first entry of ``file_list`` not among them.
    A missing CSV means nothing is labeled yet; a CSV that cannot be read is
    logged as a warning and whatever was read up to that point is used.

    Args:
        file_list: Ordered list of audio filenames.
        csv_path: Path to the labeler's CSV file.
        labeler: The labeler's username.

    Returns:
        Index of the first unlabeled file, or len(file_list) if all are labeled.
    """
    done: set[str] = set()
    csv_file = Path(csv_path)

    if csv_file.exists():
        try:
            with open(csv_file, "r", encoding="utf-8", newline="") as f:
                # update() consumes the generator item by item, so rows read
                # before a failure are still kept.
                done.update(
                    row["source"]
                    for row in csv.DictReader(f)
                    if row.get("labeler") == labeler
                )
        except Exception as e:
            logger.warning(f"Could not read CSV for resume: {e}")

    first_pending = (
        position
        for position, name in enumerate(file_list)
        if name not in done
    )
    # Fall back to len(file_list) when every item is already labeled.
    return next(first_pending, len(file_list))
skip_persistence.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Skip tracking persistence layer."""
2
+
3
+ import os
4
+ import csv
5
+ import tempfile
6
+ import logging
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+
10
+ from filelock import FileLock, Timeout
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ SKIP_CSV_COLUMNS = ["labeler", "source", "reason", "timestamp"]
15
+
16
+
17
def save_skip(labeler: str, source: str, reason: str, csv_path: str) -> None:
    """Save a skip record to the shared skipped_audios.csv.

    Uses filelock + temp-file-then-rename for atomicity. The whole CSV is
    re-read and rewritten under the lock; an existing row with the same
    labeler and source is overwritten, otherwise a new row is appended.

    Args:
        labeler: Username of the labeler who skipped.
        source: Audio filename.
        reason: Reason for skipping.
        csv_path: Path to the shared skipped_audios.csv.

    Raises:
        IOError: If the lock cannot be acquired within 10 seconds or the
            read/write fails for any reason. Every failure is logged before
            it is raised.
    """
    lock_path = csv_path + ".lock"
    lock = FileLock(lock_path, timeout=10)
    dir_name = os.path.dirname(csv_path) or "."

    try:
        # The lock file sits next to the CSV, so the directory has to exist
        # before the lock is acquired -- creating it afterwards is too late
        # on the very first write into a fresh output directory.
        os.makedirs(dir_name, exist_ok=True)

        with lock:
            rows: list[dict] = []
            path = Path(csv_path)
            if path.exists():
                with open(path, "r", encoding="utf-8", newline="") as f:
                    rows = list(csv.DictReader(f))

            new_row = {
                "labeler": labeler,
                "source": source,
                "reason": reason,
                # NOTE: naive local time, kept as-is so new rows match the
                # timestamp format of rows already on disk.
                "timestamp": datetime.now().isoformat(),
            }

            # Upsert: overwrite if same labeler+source already skipped
            for i, row in enumerate(rows):
                if row["labeler"] == labeler and row["source"] == source:
                    rows[i] = new_row
                    break
            else:
                rows.append(new_row)

            # Atomic write: fill a temp file in the same directory, then
            # rename it over the target so readers never see a partial CSV.
            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
            try:
                with os.fdopen(fd, "w", encoding="utf-8", newline="") as f:
                    writer = csv.DictWriter(f, fieldnames=SKIP_CSV_COLUMNS)
                    writer.writeheader()
                    writer.writerows(rows)
                os.replace(tmp_path, csv_path)
            except Exception:
                # Do not leave stray temp files behind on failure.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise

            logger.info(f"Saved skip for '{source}' by '{labeler}': {reason}")

    except Timeout as e:
        logger.error(f"Lock timeout for skip CSV: {csv_path}")
        raise IOError("Failed to save skip record. Please try again.") from e
    except IOError as e:
        # Already the documented exception type: log it (it used to pass
        # through silently) and re-raise unchanged.
        logger.error(f"Failed to save skip for {source}: {e}")
        raise
    except Exception as e:
        logger.error(f"Failed to save skip for {source}: {e}")
        raise IOError("Failed to save skip record. Please try again.") from e