Spaces:
Sleeping
Sleeping
import base64
import html
import io
import json
import os
import re
import time
import uuid
from datetime import datetime

import pandas as pd
import requests
import streamlit as st
# Browser-tab title/icon and page layout for the annotation UI.
st.set_page_config(
    page_title="Speech Hate Detection - Annotation Tool",
    page_icon="🎧",
    layout="centered",
    initial_sidebar_state="collapsed",
)

# Base URL that audio chunk file names are appended to.
HF_DATASET_URL = "https://huggingface.co/datasets/kcrl/Hs/resolve/main/"

# Local CSV file that annotation rows are stored in.
RESULTS_FILE = "annotation_results.csv"

# Debug flag: when True, debug_log() writes its messages into the page.
DEBUG_MODE = True
def debug_log(message):
    """Write ``message`` to the page with a "DEBUG: " prefix if DEBUG_MODE is on."""
    if not DEBUG_MODE:
        return
    st.write(f"DEBUG: {message}")
# First debug line of every script run.
debug_log("Application starting...")

# Hugging Face Spaces deployment: when a /data directory is present, keep the
# results CSV there (persistent storage) instead of in the working directory.
if os.path.exists('/data'):
    RESULTS_FILE = "/data/annotation_results.csv"
    debug_log(f"Using persistent storage at {RESULTS_FILE}")
def check_file_exists(file_url, max_retries=3):
    """
    Check whether a file exists at ``file_url`` without downloading its body.

    A HEAD request is sent and redirects are followed, so a file that the
    server answers with a redirect is still reported as present. Request
    errors (connection failures, timeouts) are retried with exponential
    backoff.

    Args:
        file_url: Full URL of the file to probe.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True if the final response status is 200. False for any other status,
        or when every attempt raised a request error.
    """
    for attempt in range(max_retries):
        try:
            # requests.head() does not follow redirects unless asked to;
            # without allow_redirects a 3xx answer would be misreported as
            # "file missing". The short timeout avoids long waits per probe.
            response = requests.head(file_url, timeout=3, allow_redirects=True)
        except requests.RequestException as e:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, etc.
                wait_time = 2 ** attempt
                debug_log(f"Request failed, retrying in {wait_time}s: {str(e)}")
                time.sleep(wait_time)
            else:
                debug_log(f"Request failed after {max_retries} attempts: {str(e)}")
            continue
        return response.status_code == 200
    return False
def check_chunk_exists(video_id, chunk_num):
    """Return whether the ``<video_id>_chunk_<NNNN>.wav`` file exists in the repository."""
    # Chunk numbers are zero-padded to four digits in the file name.
    chunk_url = f"{HF_DATASET_URL}{video_id}_chunk_{chunk_num:04d}.wav"
    return check_file_exists(chunk_url)
def find_all_chunks_for_video(video_id, max_possible_chunks=500):
    """
    Find the chunk numbers that exist for a video ID.

    A binary search over 1..max_possible_chunks first estimates the highest
    existing chunk number; every number from 1 up to that bound is then
    checked individually. Answers obtained during the binary search are
    remembered, so no chunk is requested twice.

    Args:
        video_id: The video ID to check.
        max_possible_chunks: Upper limit for the binary search.

    Returns:
        List of chunk numbers that exist, in ascending order.
    """
    debug_log(f"Finding chunks for {video_id}...")

    # chunk number -> existence answer, shared by both phases below.
    probed = {}

    def chunk_exists(chunk_num):
        if chunk_num not in probed:
            probed[chunk_num] = check_chunk_exists(video_id, chunk_num)
        return probed[chunk_num]

    # Phase 1: binary search for the point where files stop existing.
    low, high = 1, max_possible_chunks
    while low <= high:
        mid = (low + high) // 2
        if chunk_exists(mid):
            low = mid + 1
        else:
            high = mid - 1

    # 'high' ends on the highest chunk the search saw; always check chunk 1.
    highest_chunk = max(1, high)
    debug_log(f"Binary search found highest chunk: {highest_chunk}")

    # Phase 2: confirm each candidate from 1 to highest_chunk.
    existing_chunks = []
    for chunk_num in range(1, highest_chunk + 1):
        if chunk_num not in probed:
            # Throttle real requests only (0.1s apart) to avoid rate limits.
            time.sleep(0.1)
        if chunk_exists(chunk_num):
            existing_chunks.append(chunk_num)

    debug_log(f"Found {len(existing_chunks)} chunks for {video_id}")
    return existing_chunks
def build_file_list_from_video_ids(video_ids, check_existence=False):
    """
    Build the list of audio chunk files for the given video IDs.

    Args:
        video_ids: List of video IDs.
        check_existence: If True, look up which chunks actually exist for each
            video; if False, assume chunks 1-100 for every video.

    Returns:
        List of dictionaries with the keys "id", "name", "url", "video_id"
        and "chunk_num".
    """
    def describe_chunk(video_id, chunk_num):
        # One file record; chunk numbers are zero-padded to four digits.
        file_id = f"{video_id}_chunk_{chunk_num:04d}"
        file_name = f"{file_id}.wav"
        return {
            "id": file_id,
            "name": file_name,
            "url": f"{HF_DATASET_URL}{file_name}",
            "video_id": video_id,
            "chunk_num": chunk_num,
        }

    files = []
    total_videos = len(video_ids)
    debug_log(f"Building file list for {total_videos} videos (check_existence={check_existence})...")

    # Progress bar advances once per video.
    progress_bar = st.progress(0)
    for position, video_id in enumerate(video_ids, start=1):
        progress_bar.progress(position / total_videos)

        if not check_existence:
            # No verification requested: use the default range of chunks (1-100).
            files.extend(describe_chunk(video_id, n) for n in range(1, 101))
            continue

        st.write(f"Finding chunks for video {video_id} ({position}/{total_videos})...")
        chunks = find_all_chunks_for_video(video_id)
        if not chunks:
            st.warning(f"No chunks found for video {video_id}")
            continue

        st.write(f"Found {len(chunks)} chunks for video {video_id}")
        files.extend(describe_chunk(video_id, n) for n in chunks)

    debug_log(f"Built file list with {len(files)} total files")
    return files
def download_file_from_hf(file_url, max_retries=3):
    """
    Download a file and return its raw bytes.

    Request errors and transient HTTP statuses (408, 429, 5xx) are retried
    with exponential backoff. Any other non-200 status is reported
    immediately, since repeating the request would only delay the error.

    Args:
        file_url: Full URL of the file to download.
        max_retries: Maximum number of attempts.

    Returns:
        The response body as bytes on HTTP 200, otherwise None (an error
        message is shown on the page when the download is given up).
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            # Longer timeout than the existence probe: audio bodies are larger.
            response = requests.get(file_url, timeout=10)
        except requests.RequestException as e:
            if is_last_attempt:
                st.error(f"Error downloading file: {e}")
                return None
            wait_time = 2 ** attempt
            debug_log(f"Download error, retrying in {wait_time}s: {str(e)}")
            time.sleep(wait_time)
            continue

        status = response.status_code
        if status == 200:
            return response.content

        # A missing file (404) or other client error will not change on retry;
        # only timeouts, rate limiting and server errors are worth waiting for.
        retryable = status >= 500 or status in (408, 429)
        if is_last_attempt or not retryable:
            st.error(f"Failed to download file: HTTP {status}")
            return None

        wait_time = 2 ** attempt
        debug_log(f"Download failed (HTTP {status}), retrying in {wait_time}s")
        time.sleep(wait_time)
    return None
def get_annotator_id():
    """
    Return the annotator ID for this session, creating one if necessary.

    The ID is cached in ``st.session_state.annotator_id``. On first use it is
    read from an ID file ('/data/.annotator_id' when /data exists, otherwise
    '.annotator_id'); if that file is missing, unreadable or empty, a new
    UUID4 is generated and written to it on a best-effort basis.

    Returns:
        The annotator ID string.
    """
    debug_log("Getting annotator ID...")
    if 'annotator_id' in st.session_state:
        return st.session_state.annotator_id

    # NOTE(review): this file lives on the machine running the app, so every
    # session that reaches this code reads the same stored ID - confirm that
    # sharing one ID between annotators is intended.
    annotator_id_file = '/data/.annotator_id' if os.path.exists('/data') else '.annotator_id'

    stored_id = ""
    if os.path.exists(annotator_id_file):
        try:
            with open(annotator_id_file, 'r', encoding='utf-8') as f:
                stored_id = f.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            debug_log(f"Could not read annotator ID file: {e}")

    if stored_id:
        st.session_state.annotator_id = stored_id
        debug_log("Retrieved existing annotator ID")
    else:
        # Missing, unreadable or empty file: an empty ID must never be used,
        # so fall back to a freshly generated one.
        st.session_state.annotator_id = str(uuid.uuid4())
        try:
            with open(annotator_id_file, 'w', encoding='utf-8') as f:
                f.write(st.session_state.annotator_id)
        except OSError as e:
            # The ID still works for this session even if it cannot be stored.
            debug_log(f"Could not persist annotator ID: {e}")
        debug_log("Created new annotator ID")

    return st.session_state.annotator_id
def load_annotations():
    """
    Load the annotation table from RESULTS_FILE.

    If the file does not exist, an empty table with the expected columns is
    created, written to RESULTS_FILE and returned. If anything fails, the
    error is shown on the page and an empty table with the expected columns
    is returned.

    Returns:
        pandas DataFrame of annotation records.
    """
    columns = ['file_id', 'file_name', 'Label', 'annotator_id', 'timestamp', 'video_id']
    debug_log(f"Loading annotations from {RESULTS_FILE}")
    try:
        if not os.path.exists(RESULTS_FILE):
            # First run: create the CSV so later saves have a file to replace.
            debug_log("No existing annotations found, creating new file")
            empty_df = pd.DataFrame(columns=columns)
            empty_df.to_csv(RESULTS_FILE, index=False)
            return empty_df

        loaded_df = pd.read_csv(RESULTS_FILE)
        debug_log(f"Loaded {len(loaded_df)} annotation records")
        return loaded_df
    except Exception as e:
        st.error(f"Error loading annotations: {e}")
        debug_log(f"Error loading annotations: {str(e)}")
        return pd.DataFrame(columns=columns)
def save_annotation(df):
    """
    Write the annotation table to RESULTS_FILE.

    The table is first written to a temporary file next to RESULTS_FILE and
    then moved into place with os.replace, so an interrupted or failed write
    cannot leave a truncated results file behind.

    Args:
        df: pandas DataFrame holding all annotation records to store.

    Returns:
        True if the file was written, False if an error occurred (the error
        is also shown on the page).
    """
    debug_log(f"Saving annotations to {RESULTS_FILE}")
    # Unique temp name so two saves running at once never share a temp file.
    temp_path = f"{RESULTS_FILE}.{uuid.uuid4().hex}.tmp"
    try:
        df.to_csv(temp_path, index=False)
        os.replace(temp_path, RESULTS_FILE)
    except Exception as e:
        # Best-effort clean-up of the partial temp file.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        st.error(f"Error saving annotation: {e}")
        debug_log(f"Error saving annotations: {str(e)}")
        return False

    debug_log("Annotations saved successfully")
    return True
# Seed the session state the first time the script runs for a session.
if 'initialized' not in st.session_state:
    debug_log("Initializing application state")
    _initial_state = {
        'initialized': False,
        'current_file_index': 0,
        'current_file': None,
        'annotation_df': None,
        'all_files': [],
        'pending_files': [],
        'hate_count': 0,
        'non_hate_count': 0,
        'discard_count': 0,
        'page': 1,
        'files_per_page': 50,
        'lite_mode': False,
    }
    for _state_key, _state_value in _initial_state.items():
        st.session_state[_state_key] = _state_value
# Shared CSS classes used by the HTML fragments below, followed by the page title.
_PAGE_STYLE_AND_HEADER = """
<style>
.main-header {
font-size: 26px;
font-weight: bold;
color: #ff4b4b;
margin-bottom: 20px;
}
.sub-header {
font-size: 18px;
color: #555;
margin-bottom: 30px;
}
.progress-container {
margin: 20px 0;
padding: 15px;
background-color: #f9f9f9;
border-radius: 5px;
}
.stats-container {
display: flex;
justify-content: space-around;
margin-top: 20px;
text-align: center;
flex-wrap: wrap;
}
.stat-item {
padding: 10px;
min-width: 100px;
}
.stat-value {
font-size: 24px;
font-weight: bold;
color: #4CAF50;
}
.stat-label {
font-size: 14px;
color: #666;
}
.audio-container {
margin: 30px 0;
padding: 20px;
background-color: #f5f5f5;
border-radius: 10px;
text-align: center;
}
.file-info {
font-size: 14px;
color: #666;
margin-top: 5px;
}
</style>
<div class="main-header">Speech Hate Detection - Annotation Tool</div>
"""
st.markdown(_PAGE_STYLE_AND_HEADER, unsafe_allow_html=True)
# Quick start in lite mode: flag the session and pre-load saved annotations.
# The button is only offered until the application has been initialized.
if not st.session_state.initialized and st.button("⚡ Quick Start (Lite Mode)"):
    debug_log("Starting in lite mode")
    st.session_state.lite_mode = True
    st.session_state.annotation_df = load_annotations()
    # 'initialized' is deliberately left False: no file list exists yet, and
    # with an empty pending list an initialized session is treated as
    # "all files annotated" further down. No rerun either, so the message
    # below stays visible while the user fills in the configuration.
    st.success("Started in lite mode. Enter video IDs and click Initialize.")
# App configuration section (collapsible; open until the app is initialized)
with st.expander("Configuration", expanded=not st.session_state.initialized):
    st.markdown("""
### Configuration
This tool loads audio files from the Hugging Face dataset at:
https://huggingface.co/datasets/kcrl/Hs
You can provide a list of video IDs for annotation by adding them in the text area below.
""")

    # Default video IDs
    default_video_ids = "0hJ2JGhM7TY\n1PRABBSTpiE\n4ewRgBMP_AY"

    # Allow user to input video IDs
    user_video_ids = st.text_area(
        "Video IDs to annotate (one per line)",
        value=default_video_ids,
        height=150,
        help="Enter the YouTube video IDs, one per line. The app will look for chunks of these videos."
    )
    annotator_name = st.text_input("Your Name (Optional)",
                                   help="Your name for tracking purposes")
    # Off by default to keep initial loading fast
    check_files = st.checkbox("Check if files exist (slower but more accurate)", value=False,
                              help="Verifies each file exists before adding it to the list")
    only_new_files = st.checkbox("Only show new files (not previously annotated)", value=True,
                                 help="Skip files that have already been annotated")

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Initialize Application"):
            debug_log("Initialize button clicked")
            annotator_id = get_annotator_id()

            # One ID per non-blank line. dict.fromkeys drops repeated IDs while
            # keeping first-seen order, so no chunk is queued twice.
            video_ids = list(dict.fromkeys(
                vid.strip() for vid in user_video_ids.split('\n') if vid.strip()
            ))

            if not video_ids:
                st.error("Please enter at least one video ID to annotate")
            else:
                # Build the candidate file list for the requested videos
                with st.spinner(f"Building file list for {len(video_ids)} videos..."):
                    all_files = build_file_list_from_video_ids(
                        video_ids,
                        check_existence=check_files
                    )

                if not all_files:
                    st.error("No audio files found. Please check the video IDs and try again.")
                else:
                    st.session_state.all_files = all_files

                    # Load existing annotation CSV
                    annotation_df = load_annotations()
                    st.session_state.annotation_df = annotation_df

                    annotated_files = set()
                    hate_count = non_hate_count = discard_count = 0
                    if not annotation_df.empty:
                        own_rows = annotation_df[annotation_df['annotator_id'] == annotator_id]
                        if only_new_files:
                            # Skip anything annotated by any annotator
                            annotated_files = set(annotation_df['file_id'].tolist())
                        else:
                            # Skip only what this annotator has already done
                            annotated_files = set(own_rows['file_id'].tolist())

                        # Per-label totals of this annotator's existing work
                        own_labels = own_rows['Label']
                        hate_count = int((own_labels == 'Hate').sum())
                        non_hate_count = int((own_labels == 'Non-Hate').sum())
                        discard_count = int((own_labels == 'Discard').sum())

                    # Always written, so totals from an earlier run never linger
                    st.session_state.hate_count = hate_count
                    st.session_state.non_hate_count = non_hate_count
                    st.session_state.discard_count = discard_count

                    # Files still waiting for an annotation
                    pending_files = [f for f in all_files if f['id'] not in annotated_files]
                    st.session_state.pending_files = pending_files

                    if pending_files:
                        st.session_state.current_file = pending_files[0]
                        st.session_state.initialized = True
                        st.success(f"Application initialized successfully! Found {len(pending_files)} files to annotate.")
                        st.rerun()
                    else:
                        st.warning("All files have already been annotated. Try adding new video IDs or uncheck 'Only show new files'.")

    with col2:
        if st.button("Reset Application State"):
            # Clear the session state; the defaults are re-created on the next run
            for key in list(st.session_state.keys()):
                del st.session_state[key]
            st.success("Application state has been reset. You can start fresh.")
            st.rerun()
# Main annotation interface
def _advance_to_next_file(finished_message):
    """Drop the current file from the pending queue and move to the next one.

    Reruns the script when another file is waiting; otherwise shows
    ``finished_message`` in place.
    """
    st.session_state.pending_files.pop(0)
    if st.session_state.pending_files:
        st.session_state.current_file = st.session_state.pending_files[0]
        st.rerun()
    else:
        st.success(finished_message)


if st.session_state.initialized and st.session_state.pending_files:
    debug_log("Rendering main annotation interface")

    # Display current annotator. The name is free text typed by the user and is
    # rendered as raw HTML below, so it is escaped first.
    annotator_label = annotator_name if annotator_name else st.session_state.annotator_id
    st.markdown(f"""
<div class="sub-header">
Annotator: {html.escape(str(annotator_label))}
</div>
""", unsafe_allow_html=True)

    # Display progress (files removed from the pending queue count as done)
    total_files = len(st.session_state.all_files)
    annotated_files = total_files - len(st.session_state.pending_files)
    progress_percentage = int((annotated_files / total_files) * 100) if total_files > 0 else 0
    st.markdown(f"""
<div class="progress-container">
<div>Progress: {annotated_files}/{total_files} samples annotated ({progress_percentage}%)</div>
<div style="margin-top: 10px; height: 10px; background-color: #eee; border-radius: 5px;">
<div style="height: 100%; width: {progress_percentage}%; background-color: #4CAF50; border-radius: 5px;"></div>
</div>
</div>
""", unsafe_allow_html=True)

    # Display statistics
    st.markdown(f"""
<div class="stats-container">
<div class="stat-item">
<div class="stat-value">{len(st.session_state.all_files)}</div>
<div class="stat-label">Total Files</div>
</div>
<div class="stat-item">
<div class="stat-value">{annotated_files}</div>
<div class="stat-label">Completed</div>
</div>
<div class="stat-item">
<div class="stat-value">{len(st.session_state.pending_files)}</div>
<div class="stat-label">Remaining</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.hate_count}</div>
<div class="stat-label">Hate</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.non_hate_count}</div>
<div class="stat-label">Non-Hate</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.discard_count}</div>
<div class="stat-label">Discard</div>
</div>
</div>
""", unsafe_allow_html=True)

    # Audio player section
    current_file = st.session_state.current_file

    # Video ID from the file record, falling back to the file name prefix
    video_id = current_file.get('video_id', "Unknown")
    if video_id == "Unknown" and "_chunk_" in current_file['name']:
        video_id = current_file['name'].split("_chunk_")[0]

    # File name and video ID derive from user-entered IDs: escape before
    # embedding them in raw HTML.
    st.markdown(f"""
<div class="audio-container">
<div style="font-weight: bold; margin-bottom: 15px;">Currently Playing: {html.escape(str(current_file['name']))}</div>
<div class="file-info">Video ID: {html.escape(str(video_id))}</div>
</div>
""", unsafe_allow_html=True)

    # Get the audio file
    if 'url' in current_file:
        audio_url = current_file['url']
        debug_log(f"Attempting to download audio from {audio_url}")
    else:
        # Fallback for old-format records without a stored URL
        audio_url = f"{HF_DATASET_URL}{current_file['name']}"
        debug_log(f"Attempting to download audio from fallback URL {audio_url}")
    with st.spinner("Loading audio file..."):
        audio_bytes = download_file_from_hf(audio_url)

    if audio_bytes:
        debug_log("Audio file downloaded successfully")
        # Display audio player
        st.audio(audio_bytes, format='audio/wav')

        # Annotation controls
        col1, col2 = st.columns([3, 1])
        with col1:
            annotation = st.selectbox(
                "Select classification:",
                ["-- Select --", "Hate", "Non-Hate", "Discard"],
                index=0,
                help="Select 'Discard' for unclear audio, background noise, or non-relevant content",
                # Per-file key: each file gets a fresh widget that starts on
                # "-- Select --" instead of inheriting the previous label.
                key=f"classification_{current_file['id']}",
            )
        with col2:
            # Two blank writes push the button down next to the select box
            st.write("")
            st.write("")
            if st.button("Skip File"):
                debug_log("Skip file button clicked")
                _advance_to_next_file("All files have been processed!")

        if st.button("Submit & Load Next Sample", type="primary"):
            if annotation == "-- Select --":
                st.warning("Please select a classification before submitting.")
            else:
                debug_log(f"Submitting annotation: {annotation}")
                # Record the annotation
                new_row = {
                    'file_id': current_file['id'],
                    'file_name': current_file['name'],
                    'Label': annotation,
                    'annotator_id': st.session_state.annotator_id,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'video_id': video_id
                }
                updated_df = pd.concat([
                    st.session_state.annotation_df,
                    pd.DataFrame([new_row])
                ], ignore_index=True)

                # Commit to session state only after the save succeeded, so a
                # failed save followed by a retry cannot duplicate the row or
                # count it twice.
                if save_annotation(updated_df):
                    debug_log("Annotation saved successfully")
                    st.session_state.annotation_df = updated_df
                    if annotation == "Hate":
                        st.session_state.hate_count += 1
                    elif annotation == "Non-Hate":
                        st.session_state.non_hate_count += 1
                    else:  # Discard
                        st.session_state.discard_count += 1
                    _advance_to_next_file("All files have been annotated! Great job!")
                else:
                    st.error("Failed to save annotation. Please try again.")
    else:
        debug_log(f"Failed to load audio file: {current_file['name']}")
        st.error(f"Failed to load audio file: {current_file['name']}. The file may not exist in the repository.")
        # Skip button for files that can't be loaded
        if st.button("Skip This File", type="primary"):
            debug_log("Skipping unloadable file")
            _advance_to_next_file("All files have been processed!")

elif st.session_state.initialized and not st.session_state.pending_files:
    debug_log("All files annotated, showing summary")
    st.success("All files have been annotated! Thank you for your contribution!")

    # Show summary statistics
    st.markdown(f"""
<div class="stats-container">
<div class="stat-item">
<div class="stat-value">{len(st.session_state.all_files)}</div>
<div class="stat-label">Total Files</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.hate_count}</div>
<div class="stat-label">Hate</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.non_hate_count}</div>
<div class="stat-label">Non-Hate</div>
</div>
<div class="stat-item">
<div class="stat-value">{st.session_state.discard_count}</div>
<div class="stat-label">Discard</div>
</div>
</div>
""", unsafe_allow_html=True)

    # Option to download the results as a base64 data link
    results_df = st.session_state.annotation_df
    if results_df is not None and not results_df.empty:
        csv = results_df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="annotation_results.csv">Download Results CSV</a>'
        st.markdown(href, unsafe_allow_html=True)

    # Two columns for buttons
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Reset and Start Over"):
            debug_log("Reset and start over clicked")
            st.session_state.clear()
            st.rerun()
    with col2:
        if st.button("Add More Videos"):
            debug_log("Add more videos clicked")
            # Keep the annotation data but reset the initialization
            st.session_state.initialized = False
            st.rerun()

else:
    debug_log("Showing initial configuration screen")
    st.info("Please configure and initialize the application using the Configuration section above.")

    # Example video IDs
    st.markdown("""
### Example Video IDs
You can use the following format in the Video IDs text area:
```
0hJ2JGhM7TY
1PRABBSTpiE
4ewRgBMP_AY
```
The app will look for files like:
- 0hJ2JGhM7TY_chunk_0001.wav
- 0hJ2JGhM7TY_chunk_0002.wav
- 1PRABBSTpiE_chunk_0001.wav
- etc.
""")
# Footer: usage instructions and dataset information shown on every page state.
_FOOTER_MARKDOWN = """
---
### Instructions:
1. Enter video IDs in the configuration section
2. Set your name (optional) and click "Initialize Application" to start
3. Listen to each audio sample
4. Select the appropriate classification:
- **Hate**: Contains hate speech
- **Non-Hate**: Does not contain hate speech
- **Discard**: Poor audio quality, background noise, or irrelevant content
5. Click "Submit & Load Next Sample" to continue
6. Your progress is saved automatically
7. When all samples are annotated, you can download the results
### Adding New Data
When you add new data to the Hugging Face dataset:
1. Click "Add More Videos" after completing current annotations
2. Enter the new video IDs in the configuration
3. Make sure "Only show new files" is checked
4. Initialize the application again
This will only present files that haven't been annotated yet.
### Dataset Information
The audio files are sourced from the Hugging Face dataset:
[kcrl/Hs](https://huggingface.co/datasets/kcrl/Hs)
File naming follows the pattern: `[VIDEO_ID]_chunk_[CHUNK_NUMBER].wav`
Example: `0hJ2JGhM7TY_chunk_0001.wav`
"""
st.markdown(_FOOTER_MARKDOWN)