Spaces:

valiyevfagan
/

test_ui2

Sleeping

App Files Files Community

Fagan Valiyev committed on May 8

Commit

d7efa84

0 Parent(s):

initial

Browse files

Files changed (22) hide show

.gitattributes +1 -0
Dockerfile +12 -0
README.md +90 -0
admin_panel.py +244 -0
app.py +330 -0
audio_loader.py +53 -0
auth.py +47 -0
config.py +56 -0
config.yaml +28 -0
csv_persistence.py +88 -0
data/audio/folder_A/10.wav +3 -0
data/audio/folder_A/11.wav +3 -0
data/audio/folder_B/12.wav +3 -0
data/audio/folder_B/13.wav +3 -0
data/audio/references/folder_A.json +4 -0
data/audio/references/folder_B.json +4 -0
docker-compose.yaml +16 -0
models.py +25 -0
reference.py +43 -0
requirements.txt +3 -0
resume.py +67 -0
skip_persistence.py +84 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.wav filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,12 @@

+FROM python:3.10-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 8501
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]

README.md ADDED Viewed

	@@ -0,0 +1,90 @@

+# Audio Labeling Tool
+A Streamlit application for transcription correction and metadata annotation by data labelers.
+## Requirements
+- Python 3.10+
+- Docker & Docker Compose (for deployment)
+## Filesystem Structure
+```
+/data/
+├── audio/
+│   ├── folder_A/           # Labeler A's source WAV files
+│   └── folder_B/           # Labeler B's source WAV files
+├── references/
+│   ├── folder_A.json       # {filename: transcription} for folder_A
+│   └── folder_B.json       # {filename: transcription} for folder_B
+└── output/
+    ├── labeler_A/
+    │   ├── audios_A/       # Clean audios (copied on accept)
+    │   └── labeler_a_metadata.csv
+    └── labeler_B/
+        ├── audios_B/       # Clean audios (copied on accept)
+        └── labeler_b_metadata.csv
+```
+## Configuration
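+Edit `config.yaml` to set labeler credentials and paths. The file must also define the `admin` credentials (`username`, `password`) and `shared_output_dir`, as in the bundled `config.yaml`; the example below shows only the `labelers` section: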
+```yaml
+labelers:
+  labeler_a:
+    password: "your_password"
+    audio_folder: "/data/audio/folder_A"
+    reference_json: "/data/references/folder_A.json"
+    output_dir: "/data/output/labeler_A"
+    clean_audios_dir: "/data/output/labeler_A/audios_A"
+  labeler_b:
+    password: "your_password"
+    audio_folder: "/data/audio/folder_B"
+    reference_json: "/data/references/folder_B.json"
+    output_dir: "/data/output/labeler_B"
+    clean_audios_dir: "/data/output/labeler_B/audios_B"
+```
+## Environment Variables
+| Variable | Purpose | Default |
+|----------|---------|---------|
+| `ALT_CONFIG_PATH` | Path to config.yaml | `/app/config.yaml` |
+| `LOG_DIR` | Directory for log files | `/var/log/audio_labeling_tool` |
+## Deployment with Docker
+```bash
+# Build and run
+docker-compose up -d --build
+# View logs
+docker-compose logs -f
+```
+The app will be available at `http://<server-ip>:8501`.
+## Volume Mounts
+| Host Path | Container Path | Mode |
+|-----------|---------------|------|
+| `/data/audio` | `/data/audio` | read-only |
+| `/data/references` | `/data/references` | read-only |
+| `/data/output` | `/data/output` | read-write |
+| `./config.yaml` | `/app/config.yaml` | read-only |
+## Filesystem Permissions
+- Audio folders: read-only for the app process
+- Output directory: read-write for the app process
+- Config file: read-only
+## Local Development
+```bash
+pip install -r requirements.txt
+export ALT_CONFIG_PATH=./config.yaml
+export LOG_DIR=./logs
+streamlit run app.py
+```

admin_panel.py ADDED Viewed

	@@ -0,0 +1,244 @@

+"""Admin panel for tracking labeler progress and activity."""
+import csv
+import logging
+import os
+from pathlib import Path
+import streamlit as st
+from config import load_config
+logger = logging.getLogger(__name__)
def render_admin_panel():
    """Draw the admin dashboard: a title row with a logout button, then one tab per report.

    The configuration is loaded once per call and passed to every tab renderer.
    """
    config = load_config()

    # Title on the left, logout button on the right (4:1 width ratio).
    title_col, logout_col = st.columns([4, 1])
    title_col.title("Admin Panel")

    # Deferred import: resolved when the panel is rendered.
    from auth import logout

    if logout_col.button("Logout"):
        logout()
        st.rerun()

    # (tab label, renderer) pairs, in display order.
    sections = (
        ("Progress", _render_progress),
        ("Metadata CSVs", _render_metadata_viewer),
        ("Skipped Audios", _render_skipped_viewer),
        ("Activity Log", _render_activity_log),
        ("Downloads", _render_downloads),
    )
    tabs = st.tabs([label for label, _ in sections])
    for tab, (_, render_section) in zip(tabs, sections):
        with tab:
            render_section(config)
def _read_csv_rows(csv_path: str) -> list[dict]:
    """Return every data row of ``csv_path`` as a dict keyed by the header row.

    A missing file yields an empty list. Any error raised while opening or
    parsing the file is logged as a warning and also yields an empty list.
    """
    source = Path(csv_path)
    if not source.exists():
        return []
    try:
        with source.open("r", encoding="utf-8", newline="") as handle:
            return list(csv.DictReader(handle))
    except Exception as e:
        logger.warning(f"Could not read CSV {csv_path}: {e}")
        return []
def _count_audio_files(audio_folder: str) -> int:
    """Count the regular files with a ``.wav`` suffix directly inside ``audio_folder``.

    The suffix match is case-insensitive and sub-directories are not searched.

    Args:
        audio_folder: Path of the folder to inspect.

    Returns:
        The number of WAV files, or 0 when ``audio_folder`` does not exist or
        is not a directory.
    """
    folder = Path(audio_folder)
    # is_dir() rather than exists(): a path that points at a regular file
    # would otherwise make iterdir() raise NotADirectoryError.
    if not folder.is_dir():
        return 0
    return sum(
        1
        for entry in folder.iterdir()
        if entry.suffix.lower() == ".wav" and entry.is_file()
    )
def _render_progress(config: dict):
    """Show per-labeler totals (total / labeled / skipped / remaining) and a progress bar.

    Labeled counts are the row counts of each labeler's metadata CSV; skipped
    counts are the rows of the shared skip CSV whose ``labeler`` column matches.
    """
    st.subheader("Labeler Progress")
    skip_rows = _read_csv_rows(
        os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    )

    for labeler_name, labeler_cfg in config["labelers"].items():
        st.markdown(f"### {labeler_name}")

        total_files = _count_audio_files(labeler_cfg["audio_folder"])
        metadata_csv = os.path.join(
            labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv"
        )
        labeled_count = len(_read_csv_rows(metadata_csv))
        skipped_count = len(
            [row for row in skip_rows if row.get("labeler") == labeler_name]
        )
        handled = labeled_count + skipped_count
        # Clamp at zero in case the CSVs hold more rows than there are files.
        remaining = max(0, total_files - handled)

        figures = (
            ("Total", total_files),
            ("Labeled", labeled_count),
            ("Skipped", skipped_count),
            ("Remaining", remaining),
        )
        for column, (caption, value) in zip(st.columns(4), figures):
            column.metric(caption, value)

        # The bar is capped at 100% and shown empty when the folder has no files.
        fraction = min(handled / total_files, 1.0) if total_files > 0 else 0.0
        st.progress(fraction)
        st.divider()
def _render_metadata_viewer(config: dict):
    """Show each labeler's metadata CSV inside a collapsed expander."""
    st.subheader("Metadata CSVs")
    for labeler_name, labeler_cfg in config["labelers"].items():
        rows = _read_csv_rows(
            os.path.join(labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv")
        )
        with st.expander(f"{labeler_name} — {len(rows)} entries", expanded=False):
            if not rows:
                st.info("No labels recorded yet.")
                continue
            st.dataframe(rows, use_container_width=True)
def _render_skipped_viewer(config: dict):
    """Show the shared skip CSV, optionally filtered to a single labeler."""
    st.subheader("Skipped Audios")
    rows = _read_csv_rows(
        os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    )
    if not rows:
        st.info("No skipped audios recorded yet.")
        return

    # "All" disables filtering; otherwise keep only the chosen labeler's rows.
    choices = ["All", *config["labelers"].keys()]
    selected_labeler = st.selectbox(
        "Filter by labeler", options=choices, key="skip_filter"
    )
    if selected_labeler != "All":
        rows = [row for row in rows if row.get("labeler") == selected_labeler]

    st.dataframe(rows, use_container_width=True)
    st.caption(f"Total skipped: {len(rows)}")
def _render_activity_log(config: dict):
    """Show up to 50 actions gathered from the metadata CSVs and the skip CSV.

    Label rows are given an empty timestamp; skip rows use their ``timestamp``
    column. Entries are sorted by timestamp, newest first, so the
    timestamp-less label rows end up after the skips.
    """
    st.subheader("Activity Log")
    activities = []

    # Label actions: one entry per metadata row.
    for labeler_name, labeler_cfg in config["labelers"].items():
        metadata_csv = os.path.join(
            labeler_cfg["output_dir"], f"{labeler_name}_metadata.csv"
        )
        activities.extend(
            {
                "labeler": row.get("labeler", labeler_name),
                "action": "labeled",
                "source": row.get("source", ""),
                "details": f"gender={row.get('gender', '')}, pii={row.get('pii', '')}",
                "timestamp": "",
            }
            for row in _read_csv_rows(metadata_csv)
        )

    # Skip actions: one entry per row of the shared skip CSV.
    skip_csv = os.path.join(config["shared_output_dir"], "skipped_audios.csv")
    activities.extend(
        {
            "labeler": row.get("labeler", ""),
            "action": "skipped",
            "source": row.get("source", ""),
            "details": row.get("reason", ""),
            "timestamp": row.get("timestamp", ""),
        }
        for row in _read_csv_rows(skip_csv)
    )

    # Missing timestamps sort as "0", which places them last in descending order.
    activities.sort(key=lambda entry: entry["timestamp"] or "0", reverse=True)

    if not activities:
        st.info("No activity recorded yet.")
        return
    st.dataframe(activities[:50], use_container_width=True)
    st.caption(f"Showing latest {min(50, len(activities))} of {len(activities)} total actions.")
def _render_downloads(config: dict):
    """Offer the metadata CSVs, the skip CSV and the app log as downloads.

    A file that does not exist yet is listed with a "not yet created" caption
    instead of a download button.
    """
    st.subheader("Download Reports")

    def offer(file_path, file_name, mime, key):
        # One download button if the file exists, otherwise a placeholder caption.
        target = Path(file_path)
        if target.exists():
            st.download_button(
                label=f"📥 {file_name}",
                data=target.read_bytes(),
                file_name=file_name,
                mime=mime,
                key=key,
            )
        else:
            st.caption(f"{file_name} — not yet created")

    # Metadata CSVs, one per labeler.
    for labeler_name, labeler_cfg in config["labelers"].items():
        file_name = f"{labeler_name}_metadata.csv"
        offer(
            os.path.join(labeler_cfg["output_dir"], file_name),
            file_name,
            "text/csv",
            f"dl_metadata_{labeler_name}",
        )
    st.divider()

    # Shared skip CSV.
    offer(
        os.path.join(config["shared_output_dir"], "skipped_audios.csv"),
        "skipped_audios.csv",
        "text/csv",
        "dl_skipped",
    )
    st.divider()

    # Application log, located through the LOG_DIR environment variable.
    log_dir = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
    offer(Path(log_dir) / "app.log", "app.log", "text/plain", "dl_log")

app.py ADDED Viewed

	@@ -0,0 +1,330 @@

+"""Audio Labeling Tool — Streamlit Application Entry Point."""
+import base64
+import logging
+import os
+import streamlit as st
+from admin_panel import render_admin_panel
+from auth import authenticate, login, logout
+from audio_loader import copy_to_clean, load_audio_bytes
+from config import load_config
+from csv_persistence import save_label
+from models import LabelRecord
+from reference import load_reference
+from resume import build_file_list, compute_resume_index
+from skip_persistence import save_skip
# --- Logging Setup ---
# Records are written to <LOG_DIR>/app.log and echoed to the console stream.
LOG_DIR = os.environ.get("LOG_DIR", "/var/log/audio_labeling_tool")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        # Explicit UTF-8: log messages embed usernames and audio filenames, and
        # the platform default encoding may not be able to represent them.
        logging.FileHandler(os.path.join(LOG_DIR, "app.log"), encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# --- Page Config ---
st.set_page_config(page_title="Audio Labeling Tool", layout="wide")
def init_session_state():
    """Seed ``st.session_state`` with the auth-related keys when they are absent."""
    defaults = {"authenticated": False, "username": None, "role": None}
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
def render_login():
    """Render the login form and sign the user in on valid credentials.

    On success the session is marked as authenticated via ``login`` and the
    script is rerun. Empty fields, rejected credentials and unexpected errors
    all produce an error message. Unexpected errors are additionally written
    to the log so that, for example, a broken config file is not mistaken for
    a wrong password.
    """
    st.title("Audio Labeling Tool")
    st.subheader("Login")
    username = st.text_input("Username", key="login_username")
    password = st.text_input("Password", type="password", key="login_password")

    if not st.button("Login"):
        return
    if not username or not password:
        st.error("Please enter both username and password.")
        return

    try:
        role = authenticate(username, password)
        if role:
            login(username, role)
    except Exception:
        # Keep the user-facing message generic, but leave a trace for the admin.
        logger.exception("Unexpected error while authenticating user %r", username)
        st.error("Invalid username or password.")
        return

    if role:
        st.rerun()
    else:
        st.error("Invalid username or password.")
def get_labeler_config(username: str) -> dict:
    """Return the ``labelers`` entry for ``username`` from a freshly loaded config.

    The entry is looked up by key, so an unknown ``username`` raises ``KeyError``.
    """
    labelers = load_config()["labelers"]
    return labelers[username]
def initialize_labeling_session(username: str):
    """Populate session state for a labeling session, once per login.

    Stores the labeler's file list and paths, loads the reference
    transcriptions and sets ``current_index`` to the computed resume position.
    Does nothing when ``file_list`` is already present in the session state.
    """
    state = st.session_state
    if "file_list" in state:
        return  # Set up by an earlier run of the script.

    labeler_cfg = get_labeler_config(username)
    audio_folder = labeler_cfg["audio_folder"]
    reference_json = labeler_cfg["reference_json"]
    output_dir = labeler_cfg["output_dir"]
    csv_path = os.path.join(output_dir, f"{username}_metadata.csv")

    # File list and the paths the labeling UI reads on every rerun.
    file_list = build_file_list(audio_folder)
    state["file_list"] = file_list
    state["audio_folder"] = audio_folder
    state["csv_path"] = csv_path
    state["output_dir"] = output_dir
    state["clean_audios_dir"] = labeler_cfg["clean_audios_dir"]

    # Reference transcriptions; a load failure is recorded rather than raised.
    try:
        state["reference"] = load_reference(reference_json)
        state["reference_error"] = None
    except (FileNotFoundError, ValueError) as e:
        state["reference"] = {}
        state["reference_error"] = str(e)

    # Resume position computed from the file list and the existing CSV.
    state["current_index"] = compute_resume_index(file_list, csv_path, username)
def render_audio_player(audio_bytes: bytes, playback_rate: float | None = None):
    """Render an HTML5 audio player for ``audio_bytes`` at the requested speed.

    The player is rendered with ``st.components.v1.html`` because a
    ``<script>`` block passed to ``st.markdown`` is inserted into the page but
    not executed, so the playback rate could never be applied that way.

    Args:
        audio_bytes: Raw WAV data; embedded in the page as a base64 data URI.
        playback_rate: Speed multiplier. When omitted, the value of the
            ``speed_slider`` widget in the session state is used, falling back
            to 1.0 if the slider has not been rendered yet.
    """
    import streamlit.components.v1 as components

    if playback_rate is None:
        playback_rate = st.session_state.get("speed_slider", 1.0)
    rate = float(playback_rate)
    # Reject NaN, infinities and non-positive values before writing into JS.
    if not (0 < rate < float("inf")):
        rate = 1.0

    audio_b64 = base64.b64encode(audio_bytes).decode("ascii")
    audio_html = f"""
    <audio id="audio-player" controls style="width: 100%;">
        <source src="data:audio/wav;base64,{audio_b64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    <script>
        var audio = document.getElementById('audio-player');
        audio.defaultPlaybackRate = {rate};
        audio.playbackRate = {rate};
    </script>
    """
    # Height leaves room for the browser's native audio controls.
    components.html(audio_html, height=70)
def render_labeling_ui():
    """Render the labeling screen for the logged-in labeler.

    Flow of one script run:
      1. Header with a logout button.
      2. Session initialisation (file list, reference, resume index).
      3. Early exits: reference error, empty folder, or everything done.
      4. Audio player, speed slider, transcription box and metadata inputs.
      5. Previous / Next / Apply / Skip buttons, plus the skip-reason dialog.

    Apply saves the record to the labeler's CSV and copies the audio to the
    clean folder; the index only advances when both succeed. Skip records a
    reason in the shared skip CSV and then advances. Skipping is allowed on
    every item, including the last one, so a labeler is never stuck on a final
    file that cannot be labeled. Any navigation away from the current item
    also closes a pending skip dialog so it cannot reappear on another file.
    """
    username = st.session_state["username"]

    # Header with logout
    col_title, col_logout = st.columns([4, 1])
    with col_title:
        st.title("Audio Labeling Tool")
    with col_logout:
        if st.button("Logout"):
            logout()
            st.rerun()

    # Populate session state on the first run after login (no-op afterwards).
    initialize_labeling_session(username)

    # A reference that failed to load blocks the whole screen.
    if st.session_state.get("reference_error"):
        st.error(
            "Reference file is corrupted or missing. Please contact admin."
        )
        st.stop()

    file_list = st.session_state["file_list"]
    current_index = st.session_state["current_index"]
    audio_folder = st.session_state["audio_folder"]
    reference = st.session_state["reference"]

    # Handle completion
    if not file_list:
        st.warning("No audio files found in your assigned folder.")
        st.stop()
    if current_index >= len(file_list):
        st.success("All items have been labeled! You're done.")
        if st.button("← Go to last item"):
            st.session_state["current_index"] = len(file_list) - 1
            st.rerun()
        st.stop()

    # Current file info
    current_filename = file_list[current_index]

    # Position indicator (1-based for display)
    st.markdown(f"**{current_index + 1} / {len(file_list)}** — `{current_filename}`")

    # Audio player; a missing file is reported but the rest of the form still renders.
    try:
        audio_bytes = load_audio_bytes(audio_folder, current_filename)
        render_audio_player(audio_bytes)
    except FileNotFoundError:
        st.error(f"Audio file not found: {current_filename}. Please contact admin.")

    # Speed control
    speed = st.select_slider(
        "Playback Speed",
        options=[0.5, 0.75, 1.0, 1.25, 1.5, 2.0],
        value=1.0,
        key="speed_slider",
    )
    # NOTE(review): script tags injected through st.markdown are inserted into
    # the page but apparently never executed by the browser, so this snippet
    # likely has no effect on the player — confirm.
    st.markdown(
        f"""<script>
        var audio = document.getElementById('audio-player');
        if (audio) {{ audio.playbackRate = {speed}; }}
        </script>""",
        unsafe_allow_html=True,
    )
    st.divider()

    # Transcription, pre-filled from the reference when one exists.
    default_transcription = reference.get(current_filename, "")
    if not default_transcription and current_filename not in reference:
        st.warning(f"No transcription found for: {current_filename}")
    transcription = st.text_area(
        "Transcription",
        value=default_transcription,
        height=100,
        key=f"transcription_{current_index}",
    )

    # Metadata (widget keys include the index so each item keeps its own state)
    col_gender, col_pii = st.columns(2)
    with col_gender:
        gender = st.radio(
            "Gender",
            options=["male", "female"],
            key=f"gender_{current_index}",
        )
    with col_pii:
        pii = st.checkbox("Contains PII", key=f"pii_{current_index}")
    st.divider()

    # Navigation and action buttons
    col_prev, col_next, col_apply, col_skip = st.columns(4)
    with col_prev:
        prev_disabled = current_index <= 0
        if st.button("← Previous", disabled=prev_disabled):
            # Leaving the item cancels any pending skip dialog.
            st.session_state["show_skip_reason"] = False
            st.session_state["current_index"] = current_index - 1
            st.rerun()
    with col_next:
        next_disabled = current_index >= len(file_list) - 1
        if st.button("Next →", disabled=next_disabled):
            st.session_state["show_skip_reason"] = False
            st.session_state["current_index"] = current_index + 1
            st.rerun()
    with col_apply:
        if st.button("✓ Apply", type="primary"):
            # Build record
            record = LabelRecord(
                source=current_filename,
                transcription=transcription,
                gender=gender,
                pii=pii,
                labeler=username,
            )
            csv_path = st.session_state["csv_path"]
            clean_audios_dir = st.session_state["clean_audios_dir"]
            try:
                # Save to CSV
                save_label(record, csv_path)
                # Copy audio to clean folder
                copy_to_clean(audio_folder, current_filename, clean_audios_dir)
                # Advance pointer only on success
                st.session_state["show_skip_reason"] = False
                st.session_state["current_index"] = current_index + 1
                st.rerun()
            except IOError as e:
                st.error(str(e))
                # Pointer NOT advanced
    with col_skip:
        # Enabled on every item: skipping the last one moves to the
        # completion screen, exactly as applying the last one does.
        if st.button("Skip ✗"):
            st.session_state["show_skip_reason"] = True
            st.rerun()

    # Skip reason dialog
    if st.session_state.get("show_skip_reason", False):
        st.divider()
        st.markdown("**Why are you skipping this audio?**")
        config = load_config()
        skip_reasons = config.get("skip_reasons", ["Other"])
        reason_choice = st.selectbox(
            "Select reason",
            options=skip_reasons,
            key=f"skip_reason_select_{current_index}",
        )
        # "Other" requires a free-text explanation.
        custom_reason = ""
        if reason_choice == "Other":
            custom_reason = st.text_input(
                "Please specify:", key=f"skip_custom_reason_{current_index}"
            )
        col_confirm, col_cancel = st.columns(2)
        with col_confirm:
            if st.button("Confirm Skip"):
                final_reason = custom_reason if reason_choice == "Other" else reason_choice
                if reason_choice == "Other" and not custom_reason.strip():
                    st.error("Please provide a reason.")
                else:
                    shared_output_dir = config["shared_output_dir"]
                    skip_csv_path = os.path.join(shared_output_dir, "skipped_audios.csv")
                    try:
                        save_skip(username, current_filename, final_reason, skip_csv_path)
                        st.session_state["show_skip_reason"] = False
                        st.session_state["current_index"] = current_index + 1
                        st.rerun()
                    except IOError as e:
                        st.error(str(e))
        with col_cancel:
            if st.button("Cancel"):
                st.session_state["show_skip_reason"] = False
                st.rerun()
def main():
    """Route the current session to the login, admin or labeling view."""
    init_session_state()

    if not st.session_state["authenticated"]:
        render_login()
        return

    # Authenticated: admins get the dashboard, everyone else the labeling UI.
    if st.session_state.get("role") == "admin":
        render_admin_panel()
    else:
        render_labeling_ui()


if __name__ == "__main__":
    main()

audio_loader.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""Lazy per-item audio loading and clean audio copy."""
+import shutil
+import logging
+from pathlib import Path
+logger = logging.getLogger(__name__)
def load_audio_bytes(folder_path: str, filename: str) -> bytes:
    """Read one audio file from disk and return its raw bytes.

    Args:
        folder_path: Path to the audio folder.
        filename: Name of the WAV file inside that folder.

    Returns:
        The complete file content.

    Raises:
        FileNotFoundError: If ``folder_path/filename`` does not exist.
    """
    audio_path = Path(folder_path) / filename
    if audio_path.exists():
        return audio_path.read_bytes()
    raise FileNotFoundError(f"Audio file not found: {filename}")
def copy_to_clean(source_folder: str, filename: str, clean_audios_dir: str) -> None:
    """Copy an accepted audio file into the clean-audios folder.

    The destination folder (including missing parents) is created first.

    Args:
        source_folder: Folder that holds the original audio.
        filename: Name of the WAV file to copy.
        clean_audios_dir: Destination folder for accepted audio.

    Raises:
        IOError: If the copy fails; the underlying error is chained as the cause.
    """
    origin = Path(source_folder) / filename
    target_dir = Path(clean_audios_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename

    try:
        shutil.copy2(str(origin), str(target))
        logger.info(f"Copied '{filename}' to clean audios: {clean_audios_dir}")
    except Exception as e:
        logger.error(f"Failed to copy '{filename}' to clean audios: {e}")
        raise IOError(f"Failed to copy audio file '{filename}'.") from e

auth.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Authentication logic for the Audio Labeling Tool."""
+import logging
+import streamlit as st
+from config import load_config
+logger = logging.getLogger(__name__)
+def authenticate(username: str, password: str) -> str | None:
+    """Validate credentials against config.
+    Returns:
+        "admin" if admin credentials, "labeler" if labeler credentials, None if invalid.
+    """
+    config = load_config()
+    # Check admin
+    admin_cfg = config["admin"]
+    if username == admin_cfg["username"] and password == admin_cfg["password"]:
+        return "admin"
+    # Check labelers
+    labeler = config["labelers"].get(username)
+    if labeler is not None and labeler["password"] == password:
+        return "labeler"
+    return None
def login(username: str, role: str) -> None:
    """Mark the session as authenticated for ``username`` with ``role`` and log it."""
    session_values = (
        ("authenticated", True),
        ("username", username),
        ("role", role),
    )
    for key, value in session_values:
        st.session_state[key] = value
    logger.info(f"User '{username}' logged in as '{role}'.")
def logout() -> None:
    """Wipe every session-state key, mark the session unauthenticated and log it."""
    state = st.session_state
    username = state.get("username", "unknown")
    # Snapshot the keys first: entries are deleted while looping.
    for key in tuple(state.keys()):
        del state[key]
    state["authenticated"] = False
    logger.info(f"User '{username}' logged out.")

config.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""Configuration loading and validation for the Audio Labeling Tool."""
+import os
+import logging
+from pathlib import Path
+import yaml
+logger = logging.getLogger(__name__)
DEFAULT_CONFIG_PATH = "/app/config.yaml"


def load_config() -> dict:
    """Read, validate and return the application configuration.

    The path comes from the ``ALT_CONFIG_PATH`` environment variable and falls
    back to ``DEFAULT_CONFIG_PATH``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If validation of the parsed content fails.
    """
    config_path = os.environ.get("ALT_CONFIG_PATH", DEFAULT_CONFIG_PATH)
    config_file = Path(config_path)
    if not config_file.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    config = yaml.safe_load(config_file.read_text(encoding="utf-8"))
    _validate_config(config)
    return config
def _validate_config(config: dict) -> None:
    """Check that ``config`` has the structure the application relies on.

    Required: a ``labelers`` mapping with exactly two entries, each holding
    ``password``, ``audio_folder``, ``reference_json``, ``output_dir`` and
    ``clean_audios_dir``; an ``admin`` mapping with ``username`` and
    ``password``; and a top-level ``shared_output_dir``.

    Args:
        config: The parsed configuration. Parsing an empty YAML file yields
            ``None``, which is rejected here.

    Raises:
        ValueError: On the first structural problem found, including sections
            that are not mappings.
    """
    required_labeler_keys = (
        "password",
        "audio_folder",
        "reference_json",
        "output_dir",
        "clean_audios_dir",
    )

    if not isinstance(config, dict):
        raise ValueError("Config must be a mapping (is the config file empty?)")

    if "labelers" not in config:
        raise ValueError("Config must contain 'labelers' key")
    labelers = config["labelers"]
    if not isinstance(labelers, dict) or len(labelers) != 2:
        raise ValueError("Exactly 2 labelers must be configured")

    for name, labeler_cfg in labelers.items():
        # A plain string would pass the `in` checks below by substring match.
        if not isinstance(labeler_cfg, dict):
            raise ValueError(f"Labeler '{name}' must be a mapping of settings")
        for key in required_labeler_keys:
            if key not in labeler_cfg:
                raise ValueError(f"Labeler '{name}' missing '{key}'")

    if "admin" not in config:
        raise ValueError("Config must contain 'admin' key")
    admin_cfg = config["admin"]
    if (
        not isinstance(admin_cfg, dict)
        or "username" not in admin_cfg
        or "password" not in admin_cfg
    ):
        raise ValueError("Admin must have 'username' and 'password'")

    if "shared_output_dir" not in config:
        raise ValueError("Config must contain 'shared_output_dir'")

config.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+labelers:
+  labeler_a:
+    password: "password_a"
+    audio_folder: "/data/audio/folder_A"
+    reference_json: "/data/references/folder_A.json"
+    output_dir: "/data/output/labeler_A"
+    clean_audios_dir: "/data/output/labeler_A/audios_A"
+  labeler_b:
+    password: "password_b"
+    audio_folder: "/data/audio/folder_B"
+    reference_json: "/data/references/folder_B.json"
+    output_dir: "/data/output/labeler_B"
+    clean_audios_dir: "/data/output/labeler_B/audios_B"
+admin:
+  username: "admin"
+  password: "admin_password"
+skip_reasons:
+  - "Noisy audio"
+  - "Too short"
+  - "Unintelligible"
+  - "Wrong language"
+  - "Silence / no speech"
+  - "Corrupted file"
+  - "Other"
+shared_output_dir: "/data/output"

csv_persistence.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""CSV persistence layer with file locking and atomic writes."""
+import os
+import csv
+import tempfile
+import logging
+from pathlib import Path
+from filelock import FileLock, Timeout
+from models import LabelRecord
+logger = logging.getLogger(__name__)
CSV_COLUMNS = ["source", "transcription", "gender", "pii", "labeler"]


def save_label(record: LabelRecord, csv_path: str) -> None:
    """Upsert one label record into the CSV at ``csv_path``.

    The whole read-modify-write cycle runs under a file lock
    (``csv_path + ".lock"``, 10 second timeout). The new content is written to
    a temporary file in the same directory and then moved over the target with
    ``os.replace``. A row with the same source and labeler is overwritten;
    otherwise the record is appended.

    Args:
        record: The label record to save.
        csv_path: Path to the target CSV file.

    Raises:
        IOError: If the lock cannot be acquired in time or the write fails.
    """
    lock = FileLock(csv_path + ".lock", timeout=10)
    try:
        with lock:
            # Load whatever is already on disk.
            target = Path(csv_path)
            rows: list[dict] = []
            if target.exists():
                with target.open("r", encoding="utf-8", newline="") as src:
                    rows = list(csv.DictReader(src))

            new_row = {
                "source": record.source,
                "transcription": record.transcription,
                "gender": record.gender,
                "pii": str(record.pii),
                "labeler": record.labeler,
            }

            # Replace the first row with the same source+labeler, else append.
            match_index = next(
                (
                    i
                    for i, row in enumerate(rows)
                    if row["source"] == record.source
                    and row["labeler"] == record.labeler
                ),
                None,
            )
            if match_index is None:
                rows.append(new_row)
            else:
                rows[match_index] = new_row

            # Temp file in the destination directory, then replace the target.
            dir_name = os.path.dirname(csv_path) or "."
            os.makedirs(dir_name, exist_ok=True)
            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
            try:
                with os.fdopen(fd, "w", encoding="utf-8", newline="") as out:
                    writer = csv.DictWriter(out, fieldnames=CSV_COLUMNS)
                    writer.writeheader()
                    writer.writerows(rows)
                os.replace(tmp_path, csv_path)
                logger.info(f"Saved label for '{record.source}' by '{record.labeler}'")
            except Exception:
                # Do not leave a stray temp file behind when the write fails.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise
    except Timeout:
        logger.error(f"Lock timeout for CSV: {csv_path}")
        raise IOError("Failed to save annotation (file locked). Please try again.")
    except IOError:
        raise
    except Exception as e:
        logger.error(f"Failed to save label for {record.source}: {e}")
        raise IOError("Failed to save annotation. Please try again.") from e

data/audio/folder_A/10.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce48e25a4d9cbe9431b0b689b15ed16ea7c5e6e350a5b93fb89343b67f1dc5ac
+size 347244

data/audio/folder_A/11.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:306a6cc6e882a689a93d90ccb1f1e62f7ce07437ab127499518760b45890de53
+size 26284

data/audio/folder_B/12.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc6763766efe21c680ea048e7d1e0e6a51ec61d29fcac1b82c9d0bc18fdaa5d5
+size 45484

data/audio/folder_B/13.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fea62d7359eccc0ec083b7468bf52369a53ead8ffcd74c435915c50e772d2e7
+size 69484

data/audio/references/folder_A.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "10.wav": "some transcription",
+    "11.wav": "another transcription"
+}

data/audio/references/folder_B.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "clip12_001.wav": "12 transcription",
+    "clip13_002.wav": "13 transcription"
+}

docker-compose.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+version: "3.8"
+services:
+  audio-labeling-tool:
+    build: .
+    ports:
+      - "8501:8501"
+    volumes:
+      - /data/audio:/data/audio:ro
+      - /data/references:/data/references:ro
+      - /data/output:/data/output:rw
+      - ./config.yaml:/app/config.yaml:ro
+    environment:
+      - ALT_CONFIG_PATH=/app/config.yaml
+      - LOG_DIR=/app/logs
+    restart: unless-stopped

models.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""Data models for the Audio Labeling Tool."""
+from dataclasses import dataclass
+from typing import TypedDict
+class LabelRow(TypedDict):
+    """TypedDict representing a single row in the metadata CSV."""
+    source: str  # Audio filename only (e.g., "clip_001.wav")
+    transcription: str  # Corrected transcription text
+    gender: str  # "male" or "female"
+    pii: str  # "True" or "False" (string representation in CSV)
+    labeler: str  # Username of the labeler
+@dataclass
+class LabelRecord:
+    """Dataclass for constructing a label before saving."""
+    source: str
+    transcription: str
+    gender: str
+    pii: bool
+    labeler: str

reference.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Reference JSON loading and validation."""
+import json
+import logging
+from pathlib import Path
+logger = logging.getLogger(__name__)
+def load_reference(json_path: str) -> dict[str, str]:
+    """Load and validate a reference JSON file.
+    Args:
+        json_path: Path to the reference JSON file.
+    Returns:
+        Dictionary mapping audio filenames to transcription strings.
+    Raises:
+        FileNotFoundError: If the JSON file does not exist.
+        ValueError: If the JSON is malformed or not a flat dict[str, str].
+    """
+    path = Path(json_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Reference file not found: {json_path}")
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except json.JSONDecodeError:
+        raise ValueError(f"Malformed JSON in {json_path}: unable to parse")
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Reference JSON must be a flat dict, got {type(data).__name__}"
+        )
+    for key, value in data.items():
+        if not isinstance(key, str) or not isinstance(value, str):
+            raise ValueError("All keys and values in reference JSON must be strings")
+    logger.info(f"Loaded reference JSON: {json_path} ({len(data)} entries)")
+    return data

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+streamlit>=1.28
+filelock>=3.12
+pyyaml>=6.0

resume.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Checkpoint and resume logic."""
+import csv
+import logging
+from pathlib import Path
+logger = logging.getLogger(__name__)
+def build_file_list(audio_folder: str) -> list[str]:
+    """Build an ordered list of WAV filenames from the assigned audio folder.
+    Files are sorted alphabetically.
+    Args:
+        audio_folder: Path to the labeler's audio folder.
+    Returns:
+        Sorted list of WAV filenames.
+    """
+    folder_path = Path(audio_folder)
+    if not folder_path.exists():
+        logger.warning(f"Audio folder not found: {audio_folder}")
+        return []
+    wav_files = sorted(
+        f.name
+        for f in folder_path.iterdir()
+        if f.suffix.lower() == ".wav" and f.is_file()
+    )
+    logger.info(f"Found {len(wav_files)} WAV files in {audio_folder}")
+    return wav_files
+def compute_resume_index(
+    file_list: list[str],
+    csv_path: str,
+    labeler: str,
+) -> int:
+    """Compute the index of the first unlabeled item.
+    Args:
+        file_list: Ordered list of audio filenames.
+        csv_path: Path to the labeler's CSV file.
+        labeler: The labeler's username.
+    Returns:
+        Index of the first unlabeled file, or len(file_list) if all are labeled.
+    """
+    labeled_sources: set[str] = set()
+    path = Path(csv_path)
+    if path.exists():
+        try:
+            with open(path, "r", encoding="utf-8", newline="") as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    if row.get("labeler") == labeler:
+                        labeled_sources.add(row["source"])
+        except Exception as e:
+            logger.warning(f"Could not read CSV for resume: {e}")
+    for i, filename in enumerate(file_list):
+        if filename not in labeled_sources:
+            return i
+    return len(file_list)  # All items labeled

skip_persistence.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""Skip tracking persistence layer."""
+import os
+import csv
+import tempfile
+import logging
+from datetime import datetime
+from pathlib import Path
+from filelock import FileLock, Timeout
+logger = logging.getLogger(__name__)
+SKIP_CSV_COLUMNS = ["labeler", "source", "reason", "timestamp"]
+def save_skip(labeler: str, source: str, reason: str, csv_path: str) -> None:
+    """Save a skip record to the shared skipped_audios.csv.
+    Uses filelock + temp-file-then-rename for atomicity.
+    Args:
+        labeler: Username of the labeler who skipped.
+        source: Audio filename.
+        reason: Reason for skipping.
+        csv_path: Path to the shared skipped_audios.csv.
+    Raises:
+        IOError: If the write operation fails.
+    """
+    lock_path = csv_path + ".lock"
+    lock = FileLock(lock_path, timeout=10)
+    try:
+        with lock:
+            rows: list[dict] = []
+            path = Path(csv_path)
+            if path.exists():
+                with open(path, "r", encoding="utf-8", newline="") as f:
+                    reader = csv.DictReader(f)
+                    rows = [row for row in reader]
+            new_row = {
+                "labeler": labeler,
+                "source": source,
+                "reason": reason,
+                "timestamp": datetime.now().isoformat(),
+            }
+            # Upsert: overwrite if same labeler+source already skipped
+            updated = False
+            for i, row in enumerate(rows):
+                if row["labeler"] == labeler and row["source"] == source:
+                    rows[i] = new_row
+                    updated = True
+                    break
+            if not updated:
+                rows.append(new_row)
+            # Atomic write
+            dir_name = os.path.dirname(csv_path) or "."
+            os.makedirs(dir_name, exist_ok=True)
+            fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
+            try:
+                with os.fdopen(fd, "w", encoding="utf-8", newline="") as f:
+                    writer = csv.DictWriter(f, fieldnames=SKIP_CSV_COLUMNS)
+                    writer.writeheader()
+                    writer.writerows(rows)
+                os.replace(tmp_path, csv_path)
+                logger.info(f"Saved skip for '{source}' by '{labeler}': {reason}")
+            except Exception:
+                if os.path.exists(tmp_path):
+                    os.unlink(tmp_path)
+                raise
+    except Timeout:
+        logger.error(f"Lock timeout for skip CSV: {csv_path}")
+        raise IOError("Failed to save skip record. Please try again.")
+    except IOError:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to save skip for {source}: {e}")
+        raise IOError("Failed to save skip record. Please try again.") from e