File size: 2,945 Bytes
b6a70f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import json
import random

def seconds_to_mmss(seconds):
    """Format a duration in seconds as a zero-padded ``MM:SS`` string.

    Accepts ints or floats; fractional seconds are truncated. Minutes are
    not wrapped at 60, so e.g. 3600 seconds formats as ``"60:00"``.
    """
    # divmod(x, 60) is exactly (x // 60, x % 60), so float and negative
    # inputs behave identically to the original floor/mod arithmetic.
    mins, secs = divmod(seconds, 60)
    return f"{int(mins):02d}:{int(secs):02d}"

# Templates for speaker segment descriptions.
# Each template is runtime output text written into the generated JSON's
# "model_output" field; one is chosen uniformly at random per speaker via
# random.choice, and .format() fills {speaker} and {times}.
SPEAKER_TEMPLATES = [
    "Speaker {speaker} speaks during the following periods: {times}",
    "Speaker {speaker}'s speaking segments occur at: {times}",
    "Speaker {speaker} is active in the conversation at: {times}",
    "The following time segments belong to Speaker {speaker}: {times}",
    "Speaker {speaker} participates in the dialogue at: {times}",
    "Speaker {speaker} contributes to the conversation during: {times}",
    "Speaking turns for Speaker {speaker} are at: {times}",
    "Speaker {speaker} takes the floor at: {times}",
    "The voice of Speaker {speaker} is heard at: {times}",
    "Speaker {speaker} engages in the discussion during: {times}"
]
# Base name (no extension) of the input/output JSON files: reads
# "{file}.json", writes "{file}_speaker.json". NOTE: the name shadows the
# old Python 2 `file` builtin; harmless in Python 3 but easy to misread.
file = "silence"
def process_speaker_segments(stem=None):
    """Summarize per-speaker speaking intervals for every conversation.

    Reads ``{stem}.json`` — a mapping of conversation id to a dict with a
    ``segments`` list (each segment has ``speaker``, ``start_time``,
    ``end_time`` in seconds) and a ``stereo_audio`` path — and writes
    ``{stem}_speaker.json``: a list of ``{key, audio_url, model_output}``
    entries where ``model_output`` describes each speaker's MM:SS time
    ranges using a randomly chosen sentence template.

    Args:
        stem: Base name (without ``.json``) of the input file. Defaults to
            the module-level ``file`` constant, preserving the original
            behavior for existing callers.

    Returns:
        The list of result dicts that was written to the output file.
    """
    if stem is None:
        stem = file  # fall back to the module-level default ("silence")

    with open(f'{stem}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # List to store results for all conversations
    results = []

    # Process each conversation
    for conversation_id, conversation in data.items():
        segments = conversation.get('segments', [])
        # NOTE(review): default of [] for a single audio path looks odd —
        # presumably the value is a path string; confirm against the data.
        audio_path = conversation.get('stereo_audio', [])

        # Group (start, end) intervals by speaker. Times stay as floats so
        # seconds_to_mmss truncates accurately at format time.
        speaker_times = {}
        for segment in segments:
            interval = (segment['start_time'], segment['end_time'])
            speaker_times.setdefault(segment['speaker'], []).append(interval)

        # One description line per speaker, in sorted speaker order.
        output_lines = []
        for speaker in sorted(speaker_times):
            time_ranges = [
                f"{seconds_to_mmss(start)}-{seconds_to_mmss(end)}"
                for start, end in speaker_times[speaker]
            ]
            # Randomly select a template for each speaker
            template = random.choice(SPEAKER_TEMPLATES)
            output_lines.append(
                template.format(speaker=speaker, times=', '.join(time_ranges))
            )

        results.append({
            "key": conversation_id,
            "audio_url": audio_path,
            "model_output": "\n".join(output_lines),
        })

    # Save the results to a JSON file
    output_file = f'{stem}_speaker.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results

# Script entry point: run the conversion when executed directly.
if __name__ == "__main__":
    process_speaker_segments()