import json
import re
from typing import List, Dict, Optional, Tuple

# Matches a leading "[MM:SS - MM:SS]" marker. Compiled once at import time
# because it is applied to every line of every transcript.
_TIME_RANGE_RE = re.compile(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\]')


def parse_timestamp(timestamp: str) -> int:
    """Convert a timestamp string like '00:15' to a number of seconds.

    Args:
        timestamp: Text of the form 'MM:SS'.

    Returns:
        The total number of seconds (minutes * 60 + seconds).

    Raises:
        ValueError: If the text does not split into exactly two integers.
    """
    minutes, seconds = map(int, timestamp.split(':'))
    return minutes * 60 + seconds


def extract_time_range(entry: str) -> Optional[Tuple[int, int]]:
    """Extract start and end times from an entry like '[00:00 - 00:13]'.

    The marker must sit at the very beginning of ``entry``; anything after
    the closing bracket is ignored.

    Returns:
        A ``(start_seconds, end_seconds)`` tuple, or ``None`` when the entry
        does not start with a time-range marker.
    """
    match = _TIME_RANGE_RE.match(entry)
    if match is None:
        return None
    return (parse_timestamp(match.group(1)), parse_timestamp(match.group(2)))


def has_overlap(range1: Tuple[int, int], range2: Tuple[int, int]) -> bool:
    """Check if two time ranges overlap.

    Ranges that merely touch (one ends exactly where the other starts) are
    not considered overlapping.
    """
    start1, end1 = range1
    start2, end2 = range2
    return end1 > start2 and end2 > start1


def clean_transcript(transcript: str) -> str:
    """Clean a single transcript by removing overlapping segments.

    Lines are processed in order and the first segment to claim a stretch of
    time wins: a later line is dropped when its time range overlaps any range
    already kept. Blank lines and lines that do not start with a
    '[MM:SS - MM:SS]' marker are dropped as well.

    Args:
        transcript: Newline-separated transcript text.

    Returns:
        The kept lines joined with newlines (an empty string if none remain).
    """
    cleaned_lines = []
    time_ranges = []

    for line in transcript.split('\n'):
        if not line.strip():
            continue

        time_range = extract_time_range(line)
        if time_range is None:
            continue

        # Skip the line if it collides with any segment that was kept earlier.
        if any(has_overlap(time_range, kept) for kept in time_ranges):
            continue

        time_ranges.append(time_range)
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


def process_file(input_file: str, output_file: str) -> None:
    """Process the JSON file and clean overlapping transcriptions.

    The input may hold either a single object or a list of entries; a single
    object is wrapped in a list, so the output is always a JSON list. For
    every entry that is an object with a string 'model_output' value, that
    value is replaced by its cleaned transcript. All other entries are
    written back unchanged.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        data = [data]

    cleaned_data = []
    for entry in data:
        # Only touch well-formed entries: a non-dict entry or a non-string
        # 'model_output' (e.g. null) is passed through instead of crashing.
        if isinstance(entry, dict) and isinstance(entry.get('model_output'), str):
            entry['model_output'] = clean_transcript(entry['model_output'])
        cleaned_data.append(entry)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)


def main() -> None:
    """Clean the default transcription file and report where it was saved."""
    input_file = 'silence_overlaps/overlap5s_transcriptions.json'
    output_file = 'silence_overlaps/cleaned_transcriptions.json'
    process_file(input_file, output_file)
    print(f"Cleaned transcriptions have been saved to {output_file}")


if __name__ == '__main__':
    main()