diff --git a/ms-swift/.ipynb_checkpoints/clean_transcripts-checkpoint.py b/ms-swift/.ipynb_checkpoints/clean_transcripts-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f47d4272821583d85a7b49cb5dc04dcd8d2f42b
--- /dev/null
+++ b/ms-swift/.ipynb_checkpoints/clean_transcripts-checkpoint.py
@@ -0,0 +1,115 @@
+import json
+import re
+from typing import List, Dict, Optional, Tuple
+
+# A transcript line starts with "[MM:SS - MM:SS] Speaker X:".
+_LINE_PATTERN = re.compile(
+    r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\] (Speaker [A-Z]):')
+
+TimeRange = Tuple[int, int]
+
+
+def parse_timestamp(timestamp: str) -> int:
+    """Convert timestamp string like '00:15' to seconds."""
+    minutes, seconds = map(int, timestamp.split(':'))
+    return minutes * 60 + seconds
+
+
+def extract_time_and_speaker(
+        line: str) -> Tuple[Optional[TimeRange], Optional[str]]:
+    """Extract time range (in seconds) and speaker from a line.
+
+    Returns (None, None) when the line does not start with the
+    "[MM:SS - MM:SS] Speaker X:" prefix.
+    """
+    time_match = _LINE_PATTERN.match(line)
+    if not time_match:
+        return None, None
+
+    start_time = parse_timestamp(time_match.group(1))
+    end_time = parse_timestamp(time_match.group(2))
+    return (start_time, end_time), time_match.group(3)
+
+
+def has_overlap(range1: TimeRange, range2: TimeRange) -> bool:
+    """Check if two time ranges overlap.
+
+    Ranges that only touch at an endpoint do not count as overlapping.
+    """
+    start1, end1 = range1
+    start2, end2 = range2
+    return start1 < end2 and start2 < end1
+
+
+def has_same_speaker_overlap(transcript: str) -> bool:
+    """Check if a transcript contains overlapping timestamps for the same speaker.
+
+    Lines that do not match the expected prefix (including blank lines)
+    are ignored.
+    """
+    speaker_ranges: Dict[str, List[TimeRange]] = {}
+
+    for line in transcript.split('\n'):
+        time_range, speaker = extract_time_and_speaker(line)
+        if time_range is None or speaker is None:
+            continue
+
+        # Compare only against earlier ranges of this same speaker.
+        earlier_ranges = speaker_ranges.setdefault(speaker, [])
+        if any(has_overlap(time_range, r) for r in earlier_ranges):
+            return True
+        earlier_ranges.append(time_range)
+
+    return False
+
+
+def process_file(input_file: str, output_file: str, delete_file: str):
+    """Process the JSON file and separate entries with same-speaker overlapping timestamps.
+
+    Entries without overlap are written to output_file, entries with
+    overlap to delete_file. Entries lacking a 'model_output' key are
+    written to neither file; their count is printed in the summary.
+    """
+    with open(input_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # A single top-level object is handled as a one-entry list.
+    if isinstance(data, dict):
+        data = [data]
+
+    cleaned_data = []
+    deleted_data = []
+    skipped_count = 0
+
+    for entry in data:
+        if 'model_output' not in entry:
+            skipped_count += 1
+            continue
+        if has_same_speaker_overlap(entry['model_output']):
+            deleted_data.append(entry)
+            print(f"Removing entry with key: {entry.get('key', 'unknown')}")
+        else:
+            cleaned_data.append(entry)
+
+    # Save cleaned data
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)
+
+    # Save deleted data
+    with open(delete_file, 'w', encoding='utf-8') as f:
+        json.dump(deleted_data, f, ensure_ascii=False, indent=2)
+
+    print("\nProcessing Summary:")
+    print(f"Processed {len(data)} entries")
+    print(f"Removed {len(deleted_data)} entries with same-speaker overlapping timestamps")
+    print(f"Skipped {skipped_count} entries without 'model_output'")
+    print(f"Remaining entries: {len(cleaned_data)}")
+
+
+if __name__ == '__main__':
+    input_file = 'silence_overlaps/transcriptions.json'
+    output_file = 'silence_overlaps/cleaned_transcriptions2.json'
+    delete_file = 'silence_overlaps/delete_transcript2.json'
+    process_file(input_file, output_file, delete_file)
+    print(f"\nCleaned transcriptions have been saved to {output_file}")
+    print(f"Deleted entries have been saved to {delete_file}")
\ No newline at end of file
diff --git a/ms-swift/.ipynb_checkpoints/dataset_new-checkpoint.json b/ms-swift/.ipynb_checkpoints/dataset_new-checkpoint.json
new file mode 100644
index 0000000000000000000000000000000000000000..17ae60ae1e1f5c04f50748be439e8fe235cadd85
--- /dev/null
+++ b/ms-swift/.ipynb_checkpoints/dataset_new-checkpoint.json
@@ -0,0 +1,93 @@
+{"messages": [{"role": "user", "content": "