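# NOTE: the six lines below are Hugging Face file-viewer page residue (uploader,
# commit message, commit hash, UI links, file size), not part of the script.
# They are kept as comments so the module remains valid Python.
# Student0809's picture
# Add files using upload-large-folder tool
# cb2428f verified
# raw
# history blame
# 2.53 kB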
import json
import re
from typing import Dict, List, Optional, Tuple
def parse_timestamp(timestamp: str) -> int:
    """Convert a colon-separated timestamp to a total number of seconds.

    Accepts 'MM:SS' (e.g. '00:15' -> 15) and, as an extension,
    'HH:MM:SS' (e.g. '01:00:00' -> 3600).

    Args:
        timestamp: The timestamp text, with integer fields separated by ':'.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If the text does not have two or three fields, or a
            field is not an integer.
    """
    parts = timestamp.split(':')
    if len(parts) not in (2, 3):
        raise ValueError(
            f"Expected 'MM:SS' or 'HH:MM:SS' timestamp, got {timestamp!r}"
        )
    # Fold left to right: each field is worth 60x the one after it.
    total = 0
    for part in parts:
        total = total * 60 + int(part)
    return total
def extract_time_range(entry: str) -> Optional[Tuple[int, int]]:
    """Extract the leading '[MM:SS - MM:SS]' time range of a transcript entry.

    The range must appear at the very start of ``entry`` (no leading
    whitespace) and each field must be exactly two digits, e.g.
    '[00:00 - 00:13] some text'.

    Args:
        entry: One transcript line.

    Returns:
        A ``(start_seconds, end_seconds)`` tuple, or ``None`` when the entry
        does not begin with a time range in the expected format.
    """
    match = re.match(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\]', entry)
    if match is None:
        return None
    start_text, end_text = match.groups()
    return (parse_timestamp(start_text), parse_timestamp(end_text))
def has_overlap(range1: Tuple[int, int], range2: Tuple[int, int]) -> bool:
    """Report whether two (start, end) ranges share any span of time.

    Ranges that merely touch (one ends exactly where the other starts)
    are not considered overlapping.
    """
    first_start, first_end = range1
    second_start, second_end = range2
    # The first range finishes before (or exactly when) the second begins.
    if first_end <= second_start:
        return False
    # The second range finishes before (or exactly when) the first begins.
    if second_end <= first_start:
        return False
    return True
def clean_transcript(transcript: str) -> str:
    """Drop transcript lines whose time range collides with an earlier one.

    Lines are processed in order and the first line to claim a span wins;
    any later line overlapping an already accepted range is discarded.
    Blank lines, and lines for which ``extract_time_range`` returns
    ``None``, are discarded as well.

    Returns:
        The surviving lines joined with newline characters.
    """
    kept_lines = []
    accepted_ranges = []
    for raw_line in transcript.split('\n'):
        if not raw_line.strip():
            continue
        span = extract_time_range(raw_line)
        if span is None:
            continue
        # Skip the line as soon as one accepted range collides with it.
        if any(has_overlap(span, earlier) for earlier in accepted_ranges):
            continue
        accepted_ranges.append(span)
        kept_lines.append(raw_line)
    return '\n'.join(kept_lines)
def process_file(input_file: str, output_file: str):
    """Clean every transcript in a JSON file and write the result as JSON.

    The input is parsed as JSON; a single top-level object is treated as a
    one-element list. For each entry containing a 'model_output' key, that
    value is replaced (in place) by ``clean_transcript`` applied to it.
    Entries without the key pass through unchanged. The resulting list is
    written to ``output_file`` as UTF-8 with two-space indentation and
    non-ASCII characters left unescaped.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Normalise a lone object to the list form handled below.
    if isinstance(records, dict):
        records = [records]

    processed = []
    for record in records:
        if 'model_output' in record:
            record['model_output'] = clean_transcript(record['model_output'])
        processed.append(record)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(processed, sink, ensure_ascii=False, indent=2)
if __name__ == '__main__':
    # Script entry point: clean the hard-coded transcription file and
    # report where the cleaned copy was written.
    input_file, output_file = (
        'silence_overlaps/overlap5s_transcriptions.json',
        'silence_overlaps/cleaned_transcriptions.json',
    )
    process_file(input_file, output_file)
    print(f"Cleaned transcriptions have been saved to {output_file}")