# File size: 2,533 Bytes
# cb2428f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import json
import re
from typing import Dict, List, Optional, Tuple

def parse_timestamp(timestamp: str) -> int:
    """Convert a clock-style timestamp string to a total number of seconds.

    Accepts 'MM:SS' (e.g. '00:15' -> 15) and also 'HH:MM:SS'
    (e.g. '01:00:05' -> 3605).

    Args:
        timestamp: Colon-separated integer fields.

    Returns:
        The total number of seconds as an int.

    Raises:
        ValueError: If a field is not an integer, or if the string does not
            contain exactly two or three colon-separated fields.
    """
    fields = [int(part) for part in timestamp.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(
            f"expected 'MM:SS' or 'HH:MM:SS', got {timestamp!r}"
        )
    # Fold left-to-right in base 60 so the same loop serves both layouts.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total

def extract_time_range(entry: str) -> Optional[Tuple[int, int]]:
    """Extract the leading '[MM:SS - MM:SS]' time range from a transcript line.

    Args:
        entry: A transcript line such as '[00:00 - 00:13] some text'.

    Returns:
        ``(start_seconds, end_seconds)`` when the line begins with a
        bracketed range of two-digit minute/second fields, otherwise
        ``None``.
    """
    # re.match only matches at the very start of the string, so a line with
    # leading whitespace or any other prefix yields None.
    match = re.match(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\]', entry)
    if match is None:
        return None
    start_text, end_text = match.groups()
    return (parse_timestamp(start_text), parse_timestamp(end_text))

def has_overlap(range1: Tuple[int, int], range2: Tuple[int, int]) -> bool:
    """Return True when two (start, end) ranges share some span of time.

    Ranges that merely touch (one ends exactly where the other starts)
    are not considered overlapping.
    """
    first_start, first_end = range1
    second_start, second_end = range2
    # Each range must begin strictly before the other one finishes.
    return first_end > second_start and second_end > first_start

def clean_transcript(transcript: str) -> str:
    """Drop transcript lines whose time range collides with an earlier kept line.

    The transcript is processed line by line, in order. Blank lines and
    lines for which no time range can be extracted are discarded. A
    remaining line is kept only if its range does not overlap the range of
    any line already kept. The kept lines are returned joined by newlines.
    """
    kept_lines = []
    accepted_ranges = []

    for raw_line in transcript.split('\n'):
        # Whitespace-only lines carry no segment.
        if not raw_line.strip():
            continue

        span = extract_time_range(raw_line)
        if span is None:
            continue

        # First come, first served: earlier segments win over later ones.
        if any(has_overlap(span, earlier) for earlier in accepted_ranges):
            continue

        accepted_ranges.append(span)
        kept_lines.append(raw_line)

    return '\n'.join(kept_lines)

def process_file(input_file: str, output_file: str):
    """Read a JSON file, clean each record's transcript, and write the result.

    The input may hold a single JSON object or a list of them; a lone
    object is treated as a one-element list. Only records containing a
    'model_output' key are written out, with that field replaced by its
    cleaned version. Records without the key are left out of the output.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Normalise a single top-level object into a list of records.
    if isinstance(records, dict):
        records = [records]

    kept_records = []
    for record in records:
        if 'model_output' not in record:
            continue
        # The record is updated in place before being collected.
        record['model_output'] = clean_transcript(record['model_output'])
        kept_records.append(record)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(kept_records, sink, ensure_ascii=False, indent=2)

if __name__ == '__main__':
    # Script entry point: clean the fixed input file and report where the
    # result was written.
    input_file = 'silence_overlaps/overlap5s_transcriptions.json'
    output_file = 'silence_overlaps/cleaned_transcriptions.json'
    process_file(input_file=input_file, output_file=output_file)
    print(f"Cleaned transcriptions have been saved to {output_file}")