Student0809 committed
Commit 3438cdb · verified · 1 parent: b6a70f8

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .ipynb_checkpoints/COT_TRAIN-checkpoint.jsonl +0 -0
  2. .ipynb_checkpoints/GRPO_TRAIN-checkpoint.jsonl +0 -0
  3. .ipynb_checkpoints/test-checkpoint.sh +6 -0
  4. 4JOB/.ipynb_checkpoints/filter-checkpoint.py +132 -0
  5. 4JOB/.ipynb_checkpoints/process_silence-checkpoint.py +84 -0
  6. 4JOB/.ipynb_checkpoints/process_speaker-checkpoint.py +74 -0
  7. 4JOB/.ipynb_checkpoints/process_transcription-checkpoint.py +80 -0
  8. 4JOB/filter_logs/.ipynb_checkpoints/removed_entries_20250618_162013-checkpoint.log +72 -0
  9. 4JOB/filter_logs/removed_entries_20250618_162013.log +72 -0
  10. 4JOB/filter_logs/removed_entries_20250618_162341.log +92 -0
  11. 4JOB/overlap/.ipynb_checkpoints/mergeAll-checkpoint.py +44 -0
  12. 4JOB/overlap/mergeAll.py +44 -0
  13. 4JOB/overlap/trimmed_dialogues_pause_0_200_output.json +0 -0
  14. 4JOB/overlap/trimmed_dialogues_pause_600_800_output.json +0 -0
  15. 4JOB/overlap_filtered_output/trimmed_dialogues_pause_0_200_output.json +0 -0
  16. 4JOB/overlap_filtered_output/trimmed_dialogues_pause_200_400_output.json +0 -0
  17. 4JOB/overlap_filtered_output/trimmed_dialogues_pause_600_800_output.json +0 -0
  18. 4JOB/silence/mergeAll.py +44 -0
  19. 4JOB/silence/trimmed_dialogues_pause_100_200_output.json +0 -0
  20. 4JOB/silenceOringal.json +0 -0
  21. 4JOB/train/overlap_speaker.json +0 -0
  22. GRPO/Reward.py +87 -0
  23. cotSFT/gemini-text/.ipynb_checkpoints/texterror_results-checkpoint.json +0 -0
  24. cotSFT/gemini-text/texterror_results.json +0 -0
  25. cotSFT/train/.ipynb_checkpoints/correctresults_with_audio-checkpoint.json +0 -0
  26. cotSFT/train/correctresults_with_audio.json +0 -0
  27. cotSFT_new/.ipynb_checkpoints/correct_output_transcription-checkpoint.json +0 -0
  28. cotSFT_new/.ipynb_checkpoints/delay_output-checkpoint.json +0 -0
  29. cotSFT_new/.ipynb_checkpoints/gemini2.5_metainfo-checkpoint.py +317 -0
  30. cotSFT_new/.ipynb_checkpoints/overlaps1_output-checkpoint.json +0 -0
  31. cotSFT_new/.ipynb_checkpoints/process_transcription-checkpoint.py +80 -0
  32. cotSFT_new/cotSFT_10data/.ipynb_checkpoints/dataset_real_sft-checkpoint.jsonl +0 -0
  33. cotSFT_new/cotSFT_10data/dataset_real_sft.jsonl +0 -0
  34. cotSFT_new/cotSFT_10data/gemini2.5_metainfo.py +329 -0
  35. cotSFT_new/cotSFT_gemini.json +0 -0
  36. cotSFT_new/delay_output.json +0 -0
  37. cotSFT_new/filtered_output/.ipynb_checkpoints/delay_output-checkpoint.json +0 -0
  38. cotSFT_new/filtered_output/.ipynb_checkpoints/process_transcription-checkpoint.py +80 -0
  39. cotSFT_new/filtered_output/.ipynb_checkpoints/texterror_output_transcription_gemini-checkpoint.json +0 -0
  40. cotSFT_new/filtered_output/alltrain/.ipynb_checkpoints/correct_output_transcription_merged_output_990-checkpoint.json +0 -0
  41. cotSFT_new/filtered_output/alltrain/correct_output_transcription_merged_output_990.json +0 -0
  42. cotSFT_new/filtered_output/alltrain/overlaps1_gemini_merged_output.json +0 -0
  43. cotSFT_new/filtered_output/alltrain/texterror_output_transcription_merged_output.json +0 -0
  44. cotSFT_new/filtered_output/correc/.ipynb_checkpoints/correct_output_transcription_gemini_error-checkpoint.json +1 -0
  45. cotSFT_new/filtered_output/correc/correct_output_transcription.json +0 -0
  46. cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk2.json +0 -0
  47. cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk3.json +0 -0
  48. cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk4.json +0 -0
  49. cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk6.json +0 -0
  50. cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk7.json +0 -0
.ipynb_checkpoints/COT_TRAIN-checkpoint.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/GRPO_TRAIN-checkpoint.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/test-checkpoint.sh ADDED
@@ -0,0 +1,6 @@
+ CUDA_VISIBLE_DEVICES=0 \
+ swift infer \
+     --adapters /root/autodl-tmp/output_7B_SFT/v0-20250605-155458/checkpoint-1095 \
+     --stream true \
+     --temperature 0 \
+     --max_new_tokens 2048
4JOB/.ipynb_checkpoints/filter-checkpoint.py ADDED
@@ -0,0 +1,132 @@
+ import json
+ import os
+ from datetime import datetime
+ 
+ def filter_by_duration(input_file, output_file, min_duration=30, max_duration=90):
+     """
+     Filter a JSON file, keeping only entries whose total_duration falls within
+     [min_duration, max_duration], and log the removed entries to a log file.
+ 
+     :param input_file: path to the input JSON file
+     :param output_file: path to the output JSON file
+     :param min_duration: minimum duration in seconds
+     :param max_duration: maximum duration in seconds
+     """
+     # Create the log directory
+     log_dir = os.path.join(os.path.dirname(output_file), "filter_logs")
+     if not os.path.exists(log_dir):
+         os.makedirs(log_dir)
+ 
+     # Create the log file (named after the current time)
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     log_file = os.path.join(log_dir, f"removed_entries_{timestamp}.log")
+ 
+     # Load the original JSON file
+     with open(input_file, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+ 
+     # Initialize the filtered result and the removal list
+     filtered_data = {}
+     removed_entries = []
+ 
+     # Filter the data and record removed entries
+     for key, value in data.items():
+         if 'total_duration' in value and min_duration <= value['total_duration'] <= max_duration:
+             filtered_data[key] = value
+         else:
+             duration = value.get('total_duration', 'N/A')
+             removed_entries.append({
+                 'key': key,
+                 'duration': duration,
+                 'original_dialog_id': value.get('original_dialog_id', 'N/A'),
+                 'reason': 'too_short' if isinstance(duration, (int, float)) and duration < min_duration
+                           else 'too_long' if isinstance(duration, (int, float)) and duration > max_duration
+                           else 'missing_or_invalid'
+             })
+ 
+     # Save the filtered result
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(filtered_data, f, indent=2, ensure_ascii=False)
+ 
+     # Save the removal log
+     with open(log_file, 'w', encoding='utf-8') as f:
+         f.write(f"Filtering log - {timestamp}\n")
+         f.write(f"Input file: {input_file}\n")
+         f.write(f"Output file: {output_file}\n")
+         f.write(f"Duration range: {min_duration}s to {max_duration}s\n\n")
+         f.write("Removed Entries:\n")
+         f.write("="*50 + "\n")
+         for entry in removed_entries:
+             f.write(f"Key: {entry['key']}\n")
+             f.write(f"Original Dialog ID: {entry['original_dialog_id']}\n")
+             f.write(f"Duration: {entry['duration']}s\n")
+             f.write(f"Reason: {entry['reason']}\n")
+             f.write("-"*50 + "\n")
+ 
+     print(f"\nProcessing result: {os.path.basename(input_file)}")
+     print(f"Original entry count: {len(data)}")
+     print(f"Entry count after filtering: {len(filtered_data)}")
+     print(f"Removed {len(removed_entries)} entries that did not meet the duration requirement")
+     print(f"Filtered data saved to: {output_file}")
+     print(f"Removed-entry log saved to: {log_file}")
+ 
+ def process_directory(input_dir, output_dir, min_duration=30, max_duration=90):
+     """
+     Process all JSON files in a directory.
+     """
+     if not os.path.exists(output_dir):
+         os.makedirs(output_dir)
+ 
+     # Create the summary log file
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     summary_log = os.path.join(output_dir, f"summary_removed_entries_{timestamp}.log")
+ 
+     total_removed = 0
+     total_processed = 0
+ 
+     with open(summary_log, 'w', encoding='utf-8') as summary_f:
+         summary_f.write(f"Summary Filtering Log - {timestamp}\n")
+         summary_f.write(f"Input directory: {input_dir}\n")
+         summary_f.write(f"Output directory: {output_dir}\n")
+         summary_f.write(f"Duration range: {min_duration}s to {max_duration}s\n\n")
+ 
+         for filename in os.listdir(input_dir):
+             if filename.endswith('.json'):
+                 input_path = os.path.join(input_dir, filename)
+                 output_path = os.path.join(output_dir, filename)
+ 
+                 print(f"\nProcessing file: {filename}")
+                 filter_by_duration(input_path, output_path, min_duration, max_duration)
+ 
+                 # Read the per-file log to collect statistics
+                 log_dir = os.path.join(output_dir, "filter_logs")
+                 latest_log = max(
+                     [f for f in os.listdir(log_dir) if f.startswith('removed_entries')],
+                     key=lambda f: os.path.getmtime(os.path.join(log_dir, f)))
+ 
+                 with open(os.path.join(log_dir, latest_log), 'r', encoding='utf-8') as log_f:
+                     log_content = log_f.read()
+                     removed_count = log_content.count("Key: ")
+ 
+                 summary_f.write(f"\nFile: {filename}\n")
+                 summary_f.write(f"Removed entries: {removed_count}\n")
+                 summary_f.write("-"*40 + "\n")
+ 
+                 total_removed += removed_count
+                 total_processed += 1
+ 
+         summary_f.write(f"\nTotal files processed: {total_processed}\n")
+         summary_f.write(f"Total entries removed: {total_removed}\n")
+ 
+     print(f"\nDone! Summary log for all files saved to: {summary_log}")
+ 
+ if __name__ == "__main__":
+     # Example usage - process a single file
+     input_json = "silence.json"  # replace with your input file path
+     output_json = "silence_filtered_output.json"  # output file path
+     filter_by_duration(input_json, output_json)
+ 
+     # Example usage - process an entire directory
+     # input_directory = "./input_4JOB_overlap"  # replace with your input directory
+     # output_directory = "./filtered_output"  # replace with your output directory
+     # process_directory(input_directory, output_directory)
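
For context, a minimal sketch of the input shape filter_by_duration expects (field names are taken from the code above; the keys, durations and dialog IDs below are illustrative only):

    # Hypothetical two-entry input: entry "0" is kept (60s), entry "1" is logged as too_long (95s).
    sample = {
        "0": {"original_dialog_id": "DialogSum--train--1", "total_duration": 60.0, "segments": []},
        "1": {"original_dialog_id": "DialogSum--train--2", "total_duration": 95.0, "segments": []},
    }
    # filter_by_duration("silence.json", "silence_filtered_output.json") keeps "0" and writes "1"
    # to filter_logs/removed_entries_<timestamp>.log with reason "too_long".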
4JOB/.ipynb_checkpoints/process_silence-checkpoint.py ADDED
@@ -0,0 +1,84 @@
+ import json
+ import os
+ import random
+ 
+ def seconds_to_mmss(seconds):
+     minutes = int(seconds // 60)
+     seconds = int(seconds % 60)
+     return f"{minutes:02d}:{seconds:02d}"
+ 
+ # Templates for silence gap descriptions
+ SILENCE_TEMPLATES = [
+     "Silence gaps longer than 3 seconds occur at: {gaps}",
+     "The conversation contains significant pauses at: {gaps}",
+     "There are silent periods of more than 3 seconds at: {gaps}",
+     "The dialogue features extended pauses at: {gaps}",
+     "Silent intervals exceeding 3 seconds are found at: {gaps}",
+     "The conversation includes notable gaps at: {gaps}",
+     "Extended periods of silence occur at: {gaps}",
+     "The dialogue has significant breaks at: {gaps}",
+     "Silent segments longer than 3 seconds appear at: {gaps}",
+     "The conversation shows substantial pauses at: {gaps}"
+ ]
+ 
+ # Templates for no silence case
+ NO_SILENCE_TEMPLATES = [
+     "No silence gaps longer than 3 seconds were found in this conversation.",
+     "The conversation flows continuously without significant pauses.",
+     "No extended periods of silence were detected in this dialogue.",
+     "The conversation maintains a steady pace without notable gaps.",
+     "No silent intervals exceeding 3 seconds were identified.",
+     "The dialogue proceeds without substantial pauses.",
+     "No significant breaks in conversation were observed.",
+     "The conversation shows no extended silent periods.",
+     "No notable gaps in speech were detected.",
+     "The dialogue continues without significant silent intervals."
+ ]
+ 
+ file = "silence"
+ 
+ def process_silence_gaps():
+     # Read the input JSON file (e.g. silence.json)
+     with open(f'{file}.json', 'r', encoding='utf-8') as f:
+         silence_data = json.load(f)
+ 
+     # List to store results for all conversations
+     results = []
+ 
+     # Process each conversation
+     for conversation_id, conversation in silence_data.items():
+         segments = conversation.get('segments', [])
+         audio_path = conversation.get('stereo_audio', [])
+         silence_gaps = []
+ 
+         # Find silence gaps > 3s between segments
+         for i in range(len(segments) - 1):
+             current_end = segments[i]['end_time']
+             next_start = segments[i + 1]['start_time']
+             gap_duration = next_start - current_end
+ 
+             if gap_duration > 3:
+                 silence_gaps.append(f"{seconds_to_mmss(current_end)}-{seconds_to_mmss(next_start)}")
+ 
+         # Create result entry with random template
+         if silence_gaps:
+             template = random.choice(SILENCE_TEMPLATES)
+             model_output = template.format(gaps=', '.join(silence_gaps))
+         else:
+             model_output = random.choice(NO_SILENCE_TEMPLATES)
+ 
+         result = {
+             "key": conversation_id,
+             "audio_url": audio_path,
+             "model_output": model_output
+         }
+         results.append(result)
+ 
+     # Save the results to a JSON file
+     output_file = f'{file}_silencegap.json'
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+ 
+     print(f"Processed {len(results)} conversations")
+     print(f"Results written to {output_file}")
+ 
+ if __name__ == "__main__":
+     process_silence_gaps()
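
As a quick illustration of the gap rule above (segment times are invented; seconds_to_mmss is the helper defined in the script):

    segments = [
        {"start_time": 0.0, "end_time": 12.0},
        {"start_time": 16.5, "end_time": 30.0},
    ]
    gap = segments[1]["start_time"] - segments[0]["end_time"]  # 4.5s > 3s, so it is reported
    print(f"{seconds_to_mmss(12.0)}-{seconds_to_mmss(16.5)}")  # -> 00:12-00:16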
4JOB/.ipynb_checkpoints/process_speaker-checkpoint.py ADDED
@@ -0,0 +1,74 @@
+ import json
+ import random
+ 
+ def seconds_to_mmss(seconds):
+     minutes = int(seconds // 60)
+     seconds = int(seconds % 60)
+     return f"{minutes:02d}:{seconds:02d}"
+ 
+ # Templates for speaker segment descriptions
+ SPEAKER_TEMPLATES = [
+     "Speaker {speaker} speaks during the following periods: {times}",
+     "Speaker {speaker}'s speaking segments occur at: {times}",
+     "Speaker {speaker} is active in the conversation at: {times}",
+     "The following time segments belong to Speaker {speaker}: {times}",
+     "Speaker {speaker} participates in the dialogue at: {times}",
+     "Speaker {speaker} contributes to the conversation during: {times}",
+     "Speaking turns for Speaker {speaker} are at: {times}",
+     "Speaker {speaker} takes the floor at: {times}",
+     "The voice of Speaker {speaker} is heard at: {times}",
+     "Speaker {speaker} engages in the discussion during: {times}"
+ ]
+ 
+ file = "silence"
+ 
+ def process_speaker_segments():
+     # Read the input JSON file (e.g. silence.json)
+     with open(f'{file}.json', 'r', encoding='utf-8') as f:
+         data = json.load(f)
+ 
+     # List to store results for all conversations
+     results = []
+ 
+     # Process each conversation
+     for conversation_id, conversation in data.items():
+         segments = conversation.get('segments', [])
+         audio_path = conversation.get('stereo_audio', [])
+         # Dictionary to store speaking times for each speaker
+         speaker_times = {}
+ 
+         # Process each segment
+         for segment in segments:
+             speaker = segment['speaker']
+             start_time = segment['start_time']  # Keep as float for accurate conversion
+             end_time = segment['end_time']  # Keep as float for accurate conversion
+ 
+             # Initialize list for this speaker if not exists
+             if speaker not in speaker_times:
+                 speaker_times[speaker] = []
+ 
+             # Add this speaking interval
+             speaker_times[speaker].append((start_time, end_time))
+ 
+         # Format the output string
+         output_lines = []
+         for speaker in sorted(speaker_times.keys()):
+             times = speaker_times[speaker]
+             time_ranges = [f"{seconds_to_mmss(start)}-{seconds_to_mmss(end)}" for start, end in times]
+             # Randomly select a template for each speaker
+             template = random.choice(SPEAKER_TEMPLATES)
+             output_lines.append(template.format(speaker=speaker, times=', '.join(time_ranges)))
+ 
+         # Create result entry
+         result = {
+             "key": conversation_id,
+             "audio_url": audio_path,
+             "model_output": "\n".join(output_lines)
+         }
+         results.append(result)
+ 
+     # Save the results to a JSON file
+     output_file = f'{file}_speaker.json'
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+ 
+ if __name__ == "__main__":
+     process_speaker_segments()
4JOB/.ipynb_checkpoints/process_transcription-checkpoint.py ADDED
@@ -0,0 +1,80 @@
+ import json
+ 
+ def seconds_to_mmss(seconds):
+     minutes = int(seconds // 60)
+     seconds = int(seconds % 60)
+     return f"{minutes:02d}:{seconds:02d}"
+ 
+ filename = "silence"
+ 
+ def is_overlapping(current_segment, other_segments):
+     """Check if the current segment overlaps with any other segment."""
+     current_start = current_segment['start_time']
+     current_end = current_segment['end_time']
+ 
+     for segment in other_segments:
+         if segment == current_segment:
+             continue
+ 
+         other_start = segment['start_time']
+         other_end = segment['end_time']
+ 
+         # Check if there's an overlap
+         if (current_start < other_end and current_end > other_start):
+             return True
+ 
+     return False
+ 
+ def process_transcriptions():
+     # Read the input JSON file (e.g. silence.json)
+     with open(f'./{filename}.json', 'r', encoding='utf-8') as f:
+         data = json.load(f)
+ 
+     # List to store results for all conversations
+     results = []
+ 
+     # Process each conversation
+     for conversation_id, conversation in data.items():
+         segments = conversation.get('segments', [])
+         audio_path = conversation.get('stereo_audio', [])
+         # Sort segments by start time
+         segments.sort(key=lambda x: x['start_time'])
+ 
+         # Process each segment and create transcription lines
+         transcription_lines = []
+ 
+         for segment in segments:
+             speaker = segment['speaker']
+             start_time = segment['start_time']
+             end_time = segment['end_time']
+             text = segment['text']
+             original_text = segment['original_text']
+             original_text = original_text.replace("[interrupt] ", "").strip()
+             # Format timestamp
+             timestamp = f"[{seconds_to_mmss(start_time)} - {seconds_to_mmss(end_time)}]"
+ 
+             # Check if this segment overlaps with any other segment
+             has_overlap = is_overlapping(segment, segments)
+ 
+             # Format the line
+             if has_overlap:
+                 line = f"{timestamp} Speaker {speaker}: {original_text}"
+             else:
+                 line = f"{timestamp} Speaker {speaker}: {text}"
+ 
+             transcription_lines.append(line)
+ 
+         # Create result entry
+         result = {
+             "key": conversation_id,
+             "audio_url": audio_path,
+             "model_output": "\n".join(transcription_lines)
+         }
+         results.append(result)
+ 
+     # Save the results to a JSON file
+     output_file = f'./{filename}_transcription.json'
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+ 
+ if __name__ == "__main__":
+     process_transcriptions()
4JOB/filter_logs/.ipynb_checkpoints/removed_entries_20250618_162013-checkpoint.log ADDED
@@ -0,0 +1,72 @@
1
+ Filtering log - 20250618_162013
2
+ Input file: overlap.json
3
+ Output file: overlap_filtered_output.json
4
+ Duration range: 30s to 90s
5
+
6
+ Removed Entries:
7
+ ==================================================
8
+ Key: 165
9
+ Original Dialog ID: DialogSum--train--713
10
+ Duration: 94.65241666666667s
11
+ Reason: too_long
12
+ --------------------------------------------------
13
+ Key: 131
14
+ Original Dialog ID: DialogSum--train--674
15
+ Duration: 91.20929166666667s
16
+ Reason: too_long
17
+ --------------------------------------------------
18
+ Key: 185_1
19
+ Original Dialog ID: DialogSum--train--988
20
+ Duration: 90.738625s
21
+ Reason: too_long
22
+ --------------------------------------------------
23
+ Key: 63_1
24
+ Original Dialog ID: DialogSum--train--837
25
+ Duration: 93.28445833333333s
26
+ Reason: too_long
27
+ --------------------------------------------------
28
+ Key: 74
29
+ Original Dialog ID: DialogSum--train--850
30
+ Duration: 90.21241666666667s
31
+ Reason: too_long
32
+ --------------------------------------------------
33
+ Key: 129_2
34
+ Original Dialog ID: DialogSum--train--153
35
+ Duration: 29.000666666666667s
36
+ Reason: too_short
37
+ --------------------------------------------------
38
+ Key: 174_1
39
+ Original Dialog ID: DialogSum--train--215
40
+ Duration: 92.936125s
41
+ Reason: too_long
42
+ --------------------------------------------------
43
+ Key: 119_2
44
+ Original Dialog ID: DialogSum--train--142
45
+ Duration: 91.69558333333333s
46
+ Reason: too_long
47
+ --------------------------------------------------
48
+ Key: 25_2
49
+ Original Dialog ID: DialogSum--train--29
50
+ Duration: 93.67675s
51
+ Reason: too_long
52
+ --------------------------------------------------
53
+ Key: 34_2
54
+ Original Dialog ID: DialogSum--train--40
55
+ Duration: 91.93370833333333s
56
+ Reason: too_long
57
+ --------------------------------------------------
58
+ Key: 21_3
59
+ Original Dialog ID: DialogSum--train--278
60
+ Duration: 29.887916666666666s
61
+ Reason: too_short
62
+ --------------------------------------------------
63
+ Key: 39_2
64
+ Original Dialog ID: DialogSum--train--300
65
+ Duration: 27.758333333333333s
66
+ Reason: too_short
67
+ --------------------------------------------------
68
+ Key: 146_3
69
+ Original Dialog ID: DialogSum--train--439
70
+ Duration: 97.25554166666667s
71
+ Reason: too_long
72
+ --------------------------------------------------
4JOB/filter_logs/removed_entries_20250618_162013.log ADDED
@@ -0,0 +1,72 @@
1
+ Filtering log - 20250618_162013
2
+ Input file: overlap.json
3
+ Output file: overlap_filtered_output.json
4
+ Duration range: 30s to 90s
5
+
6
+ Removed Entries:
7
+ ==================================================
8
+ Key: 165
9
+ Original Dialog ID: DialogSum--train--713
10
+ Duration: 94.65241666666667s
11
+ Reason: too_long
12
+ --------------------------------------------------
13
+ Key: 131
14
+ Original Dialog ID: DialogSum--train--674
15
+ Duration: 91.20929166666667s
16
+ Reason: too_long
17
+ --------------------------------------------------
18
+ Key: 185_1
19
+ Original Dialog ID: DialogSum--train--988
20
+ Duration: 90.738625s
21
+ Reason: too_long
22
+ --------------------------------------------------
23
+ Key: 63_1
24
+ Original Dialog ID: DialogSum--train--837
25
+ Duration: 93.28445833333333s
26
+ Reason: too_long
27
+ --------------------------------------------------
28
+ Key: 74
29
+ Original Dialog ID: DialogSum--train--850
30
+ Duration: 90.21241666666667s
31
+ Reason: too_long
32
+ --------------------------------------------------
33
+ Key: 129_2
34
+ Original Dialog ID: DialogSum--train--153
35
+ Duration: 29.000666666666667s
36
+ Reason: too_short
37
+ --------------------------------------------------
38
+ Key: 174_1
39
+ Original Dialog ID: DialogSum--train--215
40
+ Duration: 92.936125s
41
+ Reason: too_long
42
+ --------------------------------------------------
43
+ Key: 119_2
44
+ Original Dialog ID: DialogSum--train--142
45
+ Duration: 91.69558333333333s
46
+ Reason: too_long
47
+ --------------------------------------------------
48
+ Key: 25_2
49
+ Original Dialog ID: DialogSum--train--29
50
+ Duration: 93.67675s
51
+ Reason: too_long
52
+ --------------------------------------------------
53
+ Key: 34_2
54
+ Original Dialog ID: DialogSum--train--40
55
+ Duration: 91.93370833333333s
56
+ Reason: too_long
57
+ --------------------------------------------------
58
+ Key: 21_3
59
+ Original Dialog ID: DialogSum--train--278
60
+ Duration: 29.887916666666666s
61
+ Reason: too_short
62
+ --------------------------------------------------
63
+ Key: 39_2
64
+ Original Dialog ID: DialogSum--train--300
65
+ Duration: 27.758333333333333s
66
+ Reason: too_short
67
+ --------------------------------------------------
68
+ Key: 146_3
69
+ Original Dialog ID: DialogSum--train--439
70
+ Duration: 97.25554166666667s
71
+ Reason: too_long
72
+ --------------------------------------------------
4JOB/filter_logs/removed_entries_20250618_162341.log ADDED
@@ -0,0 +1,92 @@
1
+ Filtering log - 20250618_162341
2
+ Input file: silence.json
3
+ Output file: silence_filtered_output.json
4
+ Duration range: 30s to 90s
5
+
6
+ Removed Entries:
7
+ ==================================================
8
+ Key: 83
9
+ Original Dialog ID: SODA_PROCESSED--train--214477
10
+ Duration: 99.83525s
11
+ Reason: too_long
12
+ --------------------------------------------------
13
+ Key: 15
14
+ Original Dialog ID: SODA_PROCESSED--train--972977
15
+ Duration: 94.94791666666667s
16
+ Reason: too_long
17
+ --------------------------------------------------
18
+ Key: 18
19
+ Original Dialog ID: SODA_PROCESSED--train--795181
20
+ Duration: 92.62829166666667s
21
+ Reason: too_long
22
+ --------------------------------------------------
23
+ Key: 31_1
24
+ Original Dialog ID: SODA_PROCESSED--train--1113674
25
+ Duration: 95.01416666666667s
26
+ Reason: too_long
27
+ --------------------------------------------------
28
+ Key: 53_1
29
+ Original Dialog ID: SODA_PROCESSED--train--484021
30
+ Duration: 98.07645833333333s
31
+ Reason: too_long
32
+ --------------------------------------------------
33
+ Key: 74_1
34
+ Original Dialog ID: SODA_PROCESSED--train--1047480
35
+ Duration: 91.74375s
36
+ Reason: too_long
37
+ --------------------------------------------------
38
+ Key: 17_1
39
+ Original Dialog ID: SODA_PROCESSED--train--166191
40
+ Duration: 97.76666666666667s
41
+ Reason: too_long
42
+ --------------------------------------------------
43
+ Key: 46_2
44
+ Original Dialog ID: SODA_PROCESSED--train--727552
45
+ Duration: 91.58875s
46
+ Reason: too_long
47
+ --------------------------------------------------
48
+ Key: 84_2
49
+ Original Dialog ID: SODA_PROCESSED--train--286623
50
+ Duration: 94.22970833333333s
51
+ Reason: too_long
52
+ --------------------------------------------------
53
+ Key: 55_2
54
+ Original Dialog ID: SODA_PROCESSED--train--317784
55
+ Duration: 96.18079166666666s
56
+ Reason: too_long
57
+ --------------------------------------------------
58
+ Key: 35_2
59
+ Original Dialog ID: SODA_PROCESSED--train--1190867
60
+ Duration: 99.861s
61
+ Reason: too_long
62
+ --------------------------------------------------
63
+ Key: 99_2
64
+ Original Dialog ID: SODA_PROCESSED--train--304811
65
+ Duration: 91.12975s
66
+ Reason: too_long
67
+ --------------------------------------------------
68
+ Key: 44_3
69
+ Original Dialog ID: SODA_PROCESSED--train--1084179
70
+ Duration: 90.02725s
71
+ Reason: too_long
72
+ --------------------------------------------------
73
+ Key: 24_3
74
+ Original Dialog ID: SODA_PROCESSED--train--209436
75
+ Duration: 94.00129166666666s
76
+ Reason: too_long
77
+ --------------------------------------------------
78
+ Key: 10_3
79
+ Original Dialog ID: SODA_PROCESSED--train--606362
80
+ Duration: 95.01458333333333s
81
+ Reason: too_long
82
+ --------------------------------------------------
83
+ Key: 11_3
84
+ Original Dialog ID: SODA_PROCESSED--train--33760
85
+ Duration: 91.81675s
86
+ Reason: too_long
87
+ --------------------------------------------------
88
+ Key: 73_4
89
+ Original Dialog ID: SODA_PROCESSED--train--873625
90
+ Duration: 92.01975s
91
+ Reason: too_long
92
+ --------------------------------------------------
4JOB/overlap/.ipynb_checkpoints/mergeAll-checkpoint.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import json
+ 
+ def load_json(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+ 
+ def get_unique_key(base_key, existing_keys):
+     """Find a unique key among the existing keys, e.g. key, key_1, key_2..."""
+     if base_key not in existing_keys:
+         return base_key
+     i = 1
+     while f"{base_key}_{i}" in existing_keys:
+         i += 1
+     return f"{base_key}_{i}"
+ 
+ def merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json"):
+     merged_data = {}
+ 
+     for filename in os.listdir(folder_path):
+         if filename.endswith(".json"):
+             file_path = os.path.join(folder_path, filename)
+             try:
+                 data = load_json(file_path)
+                 if not isinstance(data, dict):
+                     print(f"{filename} is not a dict, skipping.")
+                     continue
+ 
+                 for key, value in data.items():
+                     unique_key = get_unique_key(key, merged_data)
+                     if unique_key != key:
+                         print(f"Key '{key}' is a duplicate; merged as '{unique_key}' instead.")
+                     merged_data[unique_key] = value
+ 
+             except Exception as e:
+                 print(f"Failed to load {filename}: {e}")
+ 
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(merged_data, f, ensure_ascii=False, indent=2)
+ 
+     print(f"\nMerge complete: {len(merged_data)} records written to {output_path}")
+ 
+ if __name__ == "__main__":
+     merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json")
4JOB/overlap/mergeAll.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import json
+ 
+ def load_json(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+ 
+ def get_unique_key(base_key, existing_keys):
+     """Find a unique key among the existing keys, e.g. key, key_1, key_2..."""
+     if base_key not in existing_keys:
+         return base_key
+     i = 1
+     while f"{base_key}_{i}" in existing_keys:
+         i += 1
+     return f"{base_key}_{i}"
+ 
+ def merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json"):
+     merged_data = {}
+ 
+     for filename in os.listdir(folder_path):
+         if filename.endswith(".json"):
+             file_path = os.path.join(folder_path, filename)
+             try:
+                 data = load_json(file_path)
+                 if not isinstance(data, dict):
+                     print(f"{filename} is not a dict, skipping.")
+                     continue
+ 
+                 for key, value in data.items():
+                     unique_key = get_unique_key(key, merged_data)
+                     if unique_key != key:
+                         print(f"Key '{key}' is a duplicate; merged as '{unique_key}' instead.")
+                     merged_data[unique_key] = value
+ 
+             except Exception as e:
+                 print(f"Failed to load {filename}: {e}")
+ 
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(merged_data, f, ensure_ascii=False, indent=2)
+ 
+     print(f"\nMerge complete: {len(merged_data)} records written to {output_path}")
+ 
+ if __name__ == "__main__":
+     merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json")
4JOB/overlap/trimmed_dialogues_pause_0_200_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/overlap/trimmed_dialogues_pause_600_800_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/overlap_filtered_output/trimmed_dialogues_pause_0_200_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/overlap_filtered_output/trimmed_dialogues_pause_200_400_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/overlap_filtered_output/trimmed_dialogues_pause_600_800_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/silence/mergeAll.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import json
+ 
+ def load_json(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+ 
+ def get_unique_key(base_key, existing_keys):
+     """Find a unique key among the existing keys, e.g. key, key_1, key_2..."""
+     if base_key not in existing_keys:
+         return base_key
+     i = 1
+     while f"{base_key}_{i}" in existing_keys:
+         i += 1
+     return f"{base_key}_{i}"
+ 
+ def merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json"):
+     merged_data = {}
+ 
+     for filename in os.listdir(folder_path):
+         if filename.endswith(".json"):
+             file_path = os.path.join(folder_path, filename)
+             try:
+                 data = load_json(file_path)
+                 if not isinstance(data, dict):
+                     print(f"{filename} is not a dict, skipping.")
+                     continue
+ 
+                 for key, value in data.items():
+                     unique_key = get_unique_key(key, merged_data)
+                     if unique_key != key:
+                         print(f"Key '{key}' is a duplicate; merged as '{unique_key}' instead.")
+                     merged_data[unique_key] = value
+ 
+             except Exception as e:
+                 print(f"Failed to load {filename}: {e}")
+ 
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(merged_data, f, ensure_ascii=False, indent=2)
+ 
+     print(f"\nMerge complete: {len(merged_data)} records written to {output_path}")
+ 
+ if __name__ == "__main__":
+     merge_all_jsons_in_folder(folder_path='.', output_path="merged_all_unique.json")
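
A small usage sketch of the key de-duplication above (assuming the script is importable as mergeAll; the keys are invented):

    from mergeAll import get_unique_key

    existing = {"42": {}, "42_1": {}}
    print(get_unique_key("42", existing))  # -> "42_2" (first free suffix)
    print(get_unique_key("7", existing))   # -> "7" (no clash, returned unchanged)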
4JOB/silence/trimmed_dialogues_pause_100_200_output.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/silenceOringal.json ADDED
The diff for this file is too large to render. See raw diff
 
4JOB/train/overlap_speaker.json ADDED
The diff for this file is too large to render. See raw diff
 
GRPO/Reward.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ import re
+ import math
+ import json
+ from datetime import datetime
+ from swift.plugin import ORM, orms
+ from typing import Dict, List, Union
+ 
+ 
+ class MultiModalAccuracyORM(ORM):
+     def __call__(self, completions, solution, **kwargs) -> List[float]:
+         """
+         Reward function that checks if the completion is correct.
+         Args:
+             completions (list[str]): Generated outputs
+             solution (list[str]): Ground truths.
+ 
+         Returns:
+             list[float]: Reward scores
+         """
+         rewards = []
+         # completion_contents = [completion[0]["content"] for completion in completions]
+         for content, gt_score_orig in zip(completions, solution):
+             score_match = re.search(r"<overall score>(\d+)</overall score>", content)
+             # score_match = re.search(r"<score>(\d+)</score>", content)
+             pred_score = None
+             gt_score = None
+             if score_match:
+                 try:
+                     pred_score = int(score_match.group(1))
+                     if not (1 <= pred_score <= 2):
+                         pred_score = None
+                 except ValueError:
+                     pass
+ 
+             try:
+                 gt_score = int(gt_score_orig[0])
+                 if not (1 <= gt_score <= 2):
+                     gt_score = None
+             except (ValueError, TypeError, IndexError):
+                 pass
+ 
+             # Tiered reward logic
+             if pred_score is not None and gt_score is not None:
+                 if pred_score == gt_score:
+                     reward = 5.0
+                 elif abs(pred_score - gt_score) <= 1:
+                     reward = 1.0
+                 else:
+                     reward = 0.0
+             else:
+                 reward = 0.0
+ 
+             rewards.append(reward)
+         return rewards
+ 
+ 
+ class MultiModalFormatAccuracyORM(ORM):
+     def __call__(self, completions, **kwargs) -> List[float]:
+         """Reward function that checks if the completion has a specific format."""
+         rewards = []
+         response_pattern = r"<response think>.*?</response think>"
+         react_pattern = r"<fluency think>.*?</fluency think>"
+         score_pattern = r"[*\s]*<overall score>(\d+)</overall score>[\s*]*"
+         # completion_contents = [completion[0]["content"] for completion in completions]
+         for content in completions:
+             has_response = bool(re.search(response_pattern, content, re.DOTALL))
+             has_react = bool(re.search(react_pattern, content, re.DOTALL))
+             has_score = bool(re.search(score_pattern, content, re.DOTALL))
+             if has_response and has_react and has_score:
+                 rewards.append(5.0)
+             # elif has_score and (has_response or has_react):
+             #     rewards.append(3.0)
+             # elif has_response or has_react:
+             #     rewards.append(1.0)
+             else:
+                 rewards.append(0)
+         return rewards
+ 
+ 
+ orms['external_r1v_format_acc'] = MultiModalFormatAccuracyORM
+ orms['external_r1v_acc'] = MultiModalAccuracyORM
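
A minimal sketch of how these reward functions score a completion, assuming ms-swift is installed and the file is importable as GRPO.Reward (the sample completion text is invented):

    from GRPO.Reward import MultiModalAccuracyORM, MultiModalFormatAccuracyORM

    completion = (
        "<response think>Turns stay on topic and answer each other.</response think>\n"
        "<fluency think>No long pauses or extended overlaps.</fluency think>\n"
        "<overall score>2</overall score>"
    )
    print(MultiModalAccuracyORM()([completion], ["2"]))  # -> [5.0], prediction matches ground truth
    print(MultiModalFormatAccuracyORM()([completion]))   # -> [5.0], all three required tags present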
cotSFT/gemini-text/.ipynb_checkpoints/texterror_results-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT/gemini-text/texterror_results.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT/train/.ipynb_checkpoints/correctresults_with_audio-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT/train/correctresults_with_audio.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/.ipynb_checkpoints/correct_output_transcription-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/.ipynb_checkpoints/delay_output-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/.ipynb_checkpoints/gemini2.5_metainfo-checkpoint.py ADDED
@@ -0,0 +1,317 @@
1
+ import os
2
+ import json
3
+ import re
4
+ import requests
5
+ from tqdm import tqdm
6
+ from datetime import datetime
7
+ import glob
8
+ from requests.exceptions import Timeout
9
+ import argparse
10
+
11
+ prompt_template = (
12
+ "# Interactional Dialogue Evaluation\n\n"
13
+ "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
14
+ "Evaluate the quality of the interaction in the given dialogue transcript, focusing on:\n"
15
+ "**Response Relevance:** \n"
16
+ "**logical consistency, topic coherence**\n"
17
+ "**Interactional Fluency:**\n"
18
+ "**Detect and evaluate extended overlaps in conversation.**\n"
19
+ "**Detect and evaluate long pauses between speaker turns.\n\n**"
20
+ "**Note**: Small pauses and brief overlaps in conversation are acceptable, while prolonged pauses and overlapping turns are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
21
+ "## Scoring Criteria\n"
22
+ "Assign a single holistic score based on the combined evaluation:\n"
23
+ "`1` (Poor): Significant issues in either **Response Relevance ** or **Interactional Fluency. **\n"
24
+ "`2` (Excellent): Both **Response Relevance ** and **Interactional Fluency ** are consistently appropriate and natural.\n"
25
+ "## Evaluation Output Format:\n"
26
+ "Strictly follow this template:\n"
27
+ "<response think>\n"
28
+ "[Analysing Response Relevance and giving reasons for scoring...]\n"
29
+ "</response think>\n"
30
+ "<fluency think>\n"
31
+ "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
32
+ "</fluency think>\n"
33
+ "<overall score>X</overall score>\n"
34
+ )
35
+
36
+ # API configuration
37
+ url = "https://api2.aigcbest.top/v1/chat/completions"
38
+ headers = {
39
+ "Authorization": "Bearer sk-yAIqUaGzzVNSesHq4mRPaCbt53MMFRJIMB97cS4FkRy6idwN",
40
+ "Content-Type": "application/json",
41
+ "Accept": "application/json"
42
+ }
43
+
44
+ def parse_args():
45
+ parser = argparse.ArgumentParser(description='Process text evaluation with Gemini model')
46
+ parser.add_argument('--input_file', type=str, required=True,
47
+ help='Input JSON file containing text data')
48
+ parser.add_argument('--output_file', type=str, default='texterror_gemini.json',
49
+ help='Output JSON file for results')
50
+ parser.add_argument('--error_file', type=str, default='texterror_gemini_error.json',
51
+ help='Output JSON file for errors')
52
+ parser.add_argument('--checkpoint_dir', type=str, default='checkpoints_test_text',
53
+ help='Directory for storing checkpoints')
54
+ parser.add_argument('--max_retries', type=int, default=3,
55
+ help='Maximum number of retries for failed predictions')
56
+ parser.add_argument('--checkpoint_interval', type=int, default=20,
57
+ help='Number of items to process before saving checkpoint')
58
+ return parser.parse_args()
59
+
60
+ def extract_overall_score(output_str):
61
+ """Extract <overall score>X</overall score> from model output."""
62
+ score_pattern = r"<overall score>(\d+)</overall score>"
63
+ match = re.search(score_pattern, output_str)
64
+ if match:
65
+ try:
66
+ return int(match.group(1))
67
+ except ValueError:
68
+ pass
69
+ return None
70
+
71
+ def validate_model_output(output_str):
72
+ """Validate that the model output contains all required tags"""
73
+ required_tags = [
74
+ "<response think>",
75
+ "</response think>",
76
+ "<fluency think>",
77
+ "</fluency think>",
78
+ "<overall score>",
79
+ "</overall score>"
80
+ ]
81
+
82
+ for tag in required_tags:
83
+ if tag not in output_str:
84
+ return False
85
+ return True
86
+
87
+ def extract_tag_content(output_str, tag_name):
88
+ """Extract content between opening and closing tags"""
89
+ start_tag = f"<{tag_name}>"
90
+ end_tag = f"</{tag_name}>"
91
+ try:
92
+ start_idx = output_str.find(start_tag) + len(start_tag)
93
+ end_idx = output_str.find(end_tag)
94
+ if start_idx == -1 or end_idx == -1:
95
+ return None
96
+ return output_str[start_idx:end_idx].strip()
97
+ except:
98
+ return None
99
+
100
+ def format_model_output(output_str):
101
+ """Extract and format content from all required tags"""
102
+ response_content = extract_tag_content(output_str, "response think")
103
+ fluency_content = extract_tag_content(output_str, "fluency think")
104
+ score_content = extract_tag_content(output_str, "overall score")
105
+
106
+ if not all([response_content, fluency_content, score_content]):
107
+ return None
108
+
109
+ formatted_output = (
110
+ f"<response think>\n{response_content}\n</response think>\n\n"
111
+ f"<fluency think>\n{fluency_content}\n</fluency think>\n\n"
112
+ f"<overall score>{score_content}</overall score>"
113
+ )
114
+ return formatted_output
115
+
116
+ def make_api_call(text_input, retry_count=0, max_retries=5):
117
+ """Make API call with retry logic for API errors"""
118
+ try:
119
+ print(f"Attempting API call (attempt {retry_count + 1}/{max_retries + 1})")
120
+ data_req = {
121
+ "model": "gemini-2.5-flash-preview-05-20-thinking",
122
+ "messages": [
123
+ {
124
+ "role": "user",
125
+ "content": [
126
+ {
127
+ "type": "text",
128
+ "text": prompt_template
129
+ },
130
+ {
131
+ "type": "text",
132
+ "text": text_input
133
+ },
134
+ ]
135
+ }
136
+ ],
137
+ "temperature": 1,
138
+ }
139
+
140
+ response = requests.post(url, headers=headers, json=data_req, timeout=(200, 200))
141
+ print(f"API response received with status code: {response.status_code}")
142
+
143
+ if response.status_code == 200:
144
+ model_output = response.json()['choices'][0]['message']['content']
145
+ if not validate_model_output(model_output):
146
+ print("Model output missing required tags, retrying...")
147
+ return None, None
148
+
149
+ formatted_output = format_model_output(model_output)
150
+ if formatted_output is None:
151
+ print("Failed to extract content from tags, retrying...")
152
+ return None, None
153
+
154
+ pred_score = extract_overall_score(model_output)
155
+ return formatted_output, pred_score
156
+ else:
157
+ print(f"API returned error status {response.status_code}: {response.text}")
158
+ if retry_count >= max_retries:
159
+ raise Exception(f"POST error {response.status_code}: {response.text}")
160
+ return None, None
161
+ except requests.exceptions.ConnectTimeout:
162
+ print(f"Connection timeout (>10s)")
163
+ if retry_count >= max_retries:
164
+ raise Exception("Connection timeout")
165
+ return None, None
166
+ except requests.exceptions.ReadTimeout:
167
+ print(f"Read timeout (>30s)")
168
+ if retry_count >= max_retries:
169
+ raise Exception("Read timeout")
170
+ return None, None
171
+ except Exception as e:
172
+ print(f"Unexpected error during API call: {str(e)}")
173
+ if retry_count >= max_retries:
174
+ raise e
175
+ return None, None
176
+
177
+ def get_latest_checkpoint(checkpoint_dir):
178
+ """Get the latest checkpoint file and its processed count"""
179
+ checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "checkpoint_*.json"))
180
+ if not checkpoint_files:
181
+ return None, 0
182
+
183
+ latest_checkpoint = None
184
+ max_count = 0
185
+ for checkpoint in checkpoint_files:
186
+ try:
187
+ count = int(os.path.basename(checkpoint).split('_')[1])
188
+ if count > max_count:
189
+ max_count = count
190
+ latest_checkpoint = checkpoint
191
+ except (ValueError, IndexError):
192
+ continue
193
+
194
+ return latest_checkpoint, max_count
195
+
196
+ def save_checkpoint(results, processed_count, checkpoint_dir):
197
+ """Save results to a checkpoint file"""
198
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
199
+ checkpoint_file = os.path.join(checkpoint_dir, f"checkpoint_{processed_count}_{timestamp}.json")
200
+ with open(checkpoint_file, "w", encoding="utf-8") as f:
201
+ json.dump(results, f, indent=2, ensure_ascii=False)
202
+ print(f"Checkpoint saved: {checkpoint_file}")
203
+
204
+ def main():
205
+ args = parse_args()
206
+
207
+ # Initialize results storage
208
+ results = []
209
+ save_file_name = args.output_file
210
+ error_file_name = args.error_file
211
+
212
+ # Create checkpoints directory
213
+ checkpoint_dir = args.checkpoint_dir
214
+ if not os.path.exists(checkpoint_dir):
215
+ os.makedirs(checkpoint_dir)
216
+
217
+ # Load test data
218
+ all_data_file = args.input_file
219
+ with open(all_data_file, 'r', encoding='utf-8') as f:
220
+ all_data = json.load(f)
221
+
222
+ # Initialize error tracking
223
+ error_results = []
224
+
225
+ # Load checkpoint if exists
226
+ latest_checkpoint, checkpoint_count = get_latest_checkpoint(checkpoint_dir)
227
+ if latest_checkpoint:
228
+ print(f"Found latest checkpoint with {checkpoint_count} processed items: {latest_checkpoint}")
229
+ try:
230
+ with open(latest_checkpoint, 'r', encoding='utf-8') as f:
231
+ results = json.load(f)
232
+ print(f"Resumed from checkpoint: processed {len(results)} items")
233
+ except Exception as e:
234
+ print(f"Warning: Failed to load checkpoint {latest_checkpoint}: {e}")
235
+ results = []
236
+ else:
237
+ print("No checkpoint found, starting from scratch")
238
+ results = []
239
+
240
+ max_prediction_retries = args.max_retries
241
+ total_count = 0
242
+
243
+ for item in tqdm(all_data, desc="Processing texts"):
244
+ key = item.get('key')
245
+ text_input = item.get('model_output')
246
+
247
+ if not text_input:
248
+ print(f"No text input found for key {key}, skipping...")
249
+ continue
250
+
251
+ print(f"Processing text for key={key}")
252
+
253
+ prediction_retry_count = 0
254
+ success = False
255
+
256
+ while prediction_retry_count < max_prediction_retries and not success:
257
+ try:
258
+ print(f"\nProcessing attempt {prediction_retry_count + 1}")
259
+ model_output, pred_score = make_api_call(text_input)
260
+
261
+ if model_output is None or pred_score is None:
262
+ print("API call failed, retrying...")
263
+ prediction_retry_count += 1
264
+ continue
265
+
266
+ print(f"Received prediction: {pred_score}")
267
+
268
+ if pred_score == 1:
269
+ success = True
270
+ print("Prediction score is 1, accepting result")
271
+ else:
272
+ prediction_retry_count += 1
273
+ print(f"Prediction score is not 1 (attempt {prediction_retry_count}/{max_prediction_retries})")
274
+ if prediction_retry_count >= max_prediction_retries:
275
+ print("Max retries reached, accepting last prediction")
276
+ success = True
277
+ else:
278
+ continue
279
+
280
+ results.append({
281
+ "key": key,
282
+ "text_input": text_input,
283
+ "model_output": model_output,
284
+ "predicted_score": pred_score,
285
+ "prediction_attempts": prediction_retry_count + 1
286
+ })
287
+
288
+ with open(save_file_name, "w", encoding="utf-8") as f:
289
+ json.dump(results, f, indent=2, ensure_ascii=False)
290
+
291
+ total_count += 1
292
+
293
+ if total_count % args.checkpoint_interval == 0:
294
+ save_checkpoint(results, total_count, checkpoint_dir)
295
+
296
+ except Exception as e:
297
+ error_msg = str(e)
298
+ print(f"Failed to process text for key {key}: {error_msg}")
299
+ error_results.append({
300
+ "key": key,
301
+ "text_input": text_input,
302
+ "error": f"Exception: {error_msg}"
303
+ })
304
+ break
305
+
306
+ with open(error_file_name, "w", encoding="utf-8") as f:
307
+ json.dump(error_results, f, indent=2, ensure_ascii=False)
308
+
309
+ # Save final results
310
+ with open(save_file_name, "w", encoding="utf-8") as f:
311
+ json.dump(results, f, indent=2, ensure_ascii=False)
312
+
313
+ print(f"Results saved to {save_file_name}")
314
+ print(f"Total processed items: {total_count}")
315
+
316
+ if __name__ == "__main__":
317
+ main()
cotSFT_new/.ipynb_checkpoints/overlaps1_output-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/.ipynb_checkpoints/process_transcription-checkpoint.py ADDED
@@ -0,0 +1,80 @@
1
+ import json
2
+
3
+ def seconds_to_mmss(seconds):
4
+ minutes = int(seconds // 60)
5
+ seconds = int(seconds % 60)
6
+ return f"{minutes:02d}:{seconds:02d}"
7
+
8
+ filename = "correct_output"
9
+ def is_overlapping(current_segment, other_segments):
10
+ """Check if the current segment overlaps with any other segment."""
11
+ current_start = current_segment['start_time']
12
+ current_end = current_segment['end_time']
13
+
14
+ for segment in other_segments:
15
+ if segment == current_segment:
16
+ continue
17
+
18
+ other_start = segment['start_time']
19
+ other_end = segment['end_time']
20
+
21
+ # Check if there's an overlap
22
+ if (current_start < other_end and current_end > other_start):
23
+ return True
24
+
25
+ return False
26
+
27
+ def process_transcriptions():
28
+ # Read the overlap_5s_716.json file
29
+ with open(f'./{filename}.json', 'r', encoding='utf-8') as f:
30
+ data = json.load(f)
31
+
32
+ # List to store results for all conversations
33
+ results = []
34
+
35
+ # Process each conversation
36
+ for conversation_id, conversation in data.items():
37
+ segments = conversation.get('segments', [])
38
+ audio_path = conversation.get('stereo_audio', [])
39
+ # Sort segments by start time
40
+ segments.sort(key=lambda x: x['start_time'])
41
+
42
+ # Process each segment and create transcription lines
43
+ transcription_lines = []
44
+
45
+ for segment in segments:
46
+ speaker = segment['speaker']
47
+ start_time = segment['start_time']
48
+ end_time = segment['end_time']
49
+ text = segment['text']
50
+ original_text = segment['original_text']
51
+ original_text = original_text.replace("[interrupt] ", "").strip()
52
+ # Format timestamp
53
+ timestamp = f"[{seconds_to_mmss(start_time)} - {seconds_to_mmss(end_time)}]"
54
+
55
+ # Check if this segment overlaps with any other segment
56
+ has_overlap = is_overlapping(segment, segments)
57
+
58
+ # Format the line
59
+ if has_overlap:
60
+ line = f"{timestamp} Speaker {speaker}: {original_text}"
61
+ else:
62
+ line = f"{timestamp} Speaker {speaker}: {text}"
63
+
64
+ transcription_lines.append(line)
65
+
66
+ # Create result entry
67
+ result = {
68
+ "key": conversation_id,
69
+ "audio_url": audio_path,
70
+ "model_output": "\n".join(transcription_lines)
71
+ }
72
+ results.append(result)
73
+
74
+ # Save the results to a JSON file
75
+ output_file = f'./{filename}_transcription.json'
76
+ with open(output_file, 'w', encoding='utf-8') as f:
77
+ json.dump(results, f, indent=2, ensure_ascii=False)
78
+
79
+ if __name__ == "__main__":
80
+ process_transcriptions()
cotSFT_new/cotSFT_10data/.ipynb_checkpoints/dataset_real_sft-checkpoint.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/cotSFT_10data/dataset_real_sft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/cotSFT_10data/gemini2.5_metainfo.py ADDED
@@ -0,0 +1,329 @@
1
+ import os
2
+ import json
3
+ import re
4
+ import requests
5
+ from tqdm import tqdm
6
+ from datetime import datetime
7
+ import glob
8
+ from requests.exceptions import Timeout
9
+ import argparse
10
+ import multiprocessing
11
+
12
+ prompt_template = (
13
+ "# Interactional Dialogue Evaluation\n\n"
14
+ "**IMPORTANT**: Evaluation must include `<response think>` and `<fluency think>` analysis and `<overall score>` rating.\n"
15
+ "Evaluate the quality of the interaction in the given dialogue transcript, focusing on:\n"
16
+ "**Response Relevance:** \n"
17
+ "**logical consistency, topic coherence**\n"
18
+ "**Interactional Fluency:**\n"
19
+ "**Detect and evaluate extended overlaps in conversation.**\n"
20
+ "**Detect and evaluate long pauses between speaker turns.\n\n**"
21
+ "**Note**: Small pauses and brief overlaps in conversation are acceptable, while prolonged pauses and overlapping turns are harmful. You should consider Response Relevance and Interactional Fluency separately, and provide the corresponding thinking process.\n\n"
22
+ "## Scoring Criteria\n"
23
+ "Assign a single holistic score based on the combined evaluation:\n"
24
+ "`1` (Poor): Significant issues in either **Response Relevance ** or **Interactional Fluency. **\n"
25
+ "`2` (Excellent): Both **Response Relevance ** and **Interactional Fluency ** are consistently appropriate and natural.\n"
26
+ "## Evaluation Output Format:\n"
27
+ "Strictly follow this template:\n"
28
+ "<response think>\n"
29
+ "[Analysing Response Relevance and giving reasons for scoring...]\n"
30
+ "</response think>\n"
31
+ "<fluency think>\n"
32
+ "[Analysing Interactional Fluency and giving reasons for scoring.]\n"
33
+ "</fluency think>\n"
34
+ "<overall score>X</overall score>\n"
35
+ )
36
+
37
+ # API configuration
38
+ url = "https://api2.aigcbest.top/v1/chat/completions"
39
+ headers = {
40
+ "Authorization": "Bearer sk-yAIqUaGzzVNSesHq4mRPaCbt53MMFRJIMB97cS4FkRy6idwN",
41
+ "Content-Type": "application/json",
42
+ "Accept": "application/json"
43
+ }
44
+
45
+ def parse_args():
46
+ parser = argparse.ArgumentParser(description='Process text evaluation with Gemini model')
47
+ parser.add_argument('--input_file', type=str, default='all_dialogues_processed.json',
48
+ help='Input JSON file containing text data')
49
+ parser.add_argument('--output_file', type=str, default='cotSFT_gemini.json',
50
+ help='Output JSON file for results')
51
+ parser.add_argument('--error_file', type=str, default='cotSFT_gemini_error.json',
52
+ help='Output JSON file for errors')
53
+ parser.add_argument('--checkpoint_dir', type=str, default='checkpoints_test_text',
54
+ help='Directory for storing checkpoints')
55
+ parser.add_argument('--max_retries', type=int, default=6,
56
+ help='Maximum number of retries for failed predictions')
57
+ parser.add_argument('--checkpoint_interval', type=int, default=100,
58
+ help='Number of items to process before saving checkpoint')
59
+ parser.add_argument('--num_processes', type=int, default=5,
60
+ help='Number of parallel processes to use')
61
+ return parser.parse_args()

def extract_overall_score(output_str):
    """Extract <overall score>X</overall score> from model output."""
    score_pattern = r"<overall score>(\d+)</overall score>"
    match = re.search(score_pattern, output_str)
    if match:
        try:
            return int(match.group(1))
        except ValueError:
            pass
    return None

def validate_model_output(output_str):
    """Validate that the model output contains all required tags"""
    required_tags = [
        "<response think>",
        "</response think>",
        "<fluency think>",
        "</fluency think>",
        "<overall score>",
        "</overall score>"
    ]

    for tag in required_tags:
        if tag not in output_str:
            return False
    return True

def extract_tag_content(output_str, tag_name):
    """Extract content between opening and closing tags"""
    start_tag = f"<{tag_name}>"
    end_tag = f"</{tag_name}>"
    try:
        start_idx = output_str.find(start_tag)
        end_idx = output_str.find(end_tag)
        # find() returns -1 when a tag is missing; check before offsetting past the opening tag
        if start_idx == -1 or end_idx == -1:
            return None
        return output_str[start_idx + len(start_tag):end_idx].strip()
    except Exception:
        return None

def format_model_output(output_str):
    """Extract and format content from all required tags"""
    response_content = extract_tag_content(output_str, "response think")
    fluency_content = extract_tag_content(output_str, "fluency think")
    score_content = extract_tag_content(output_str, "overall score")

    if not all([response_content, fluency_content, score_content]):
        return None

    formatted_output = (
        f"<response think>\n{response_content}\n</response think>\n\n"
        f"<fluency think>\n{fluency_content}\n</fluency think>\n\n"
        f"<overall score>{score_content}</overall score>"
    )
    return formatted_output
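
As a quick illustration of how these helpers fit together, here is a minimal sketch with a made-up model reply (sample_output is purely illustrative, not real model output):

# Hypothetical, well-formed reply used only to exercise the parsing helpers above.
sample_output = (
    "<response think>\nTurns stay on topic and answers follow logically.\n</response think>\n"
    "<fluency think>\nOnly brief pauses; no extended overlaps.\n</fluency think>\n"
    "<overall score>2</overall score>"
)

assert validate_model_output(sample_output)                  # all six tags present
print(extract_overall_score(sample_output))                  # -> 2
print(format_model_output(sample_output).splitlines()[0])    # -> "<response think>"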

def make_api_call(text_input, retry_count=0, max_retries=5):
    """Make API call with retry logic for API errors"""
    try:
        print(f"Attempting API call (attempt {retry_count + 1}/{max_retries + 1})")
        data_req = {
            "model": "gemini-2.5-pro-preview-06-05-thinking",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_template
                        },
                        {
                            "type": "text",
                            "text": "The correct overall score is: 2\n"
                        },
                        {
                            "type": "text",
                            "text": text_input
                        },
                    ]
                }
            ],
            "temperature": 1,
        }

        response = requests.post(url, headers=headers, json=data_req, timeout=(200, 200))
        print(f"API response received with status code: {response.status_code}")

        if response.status_code == 200:
            model_output = response.json()['choices'][0]['message']['content']
            if not validate_model_output(model_output):
                print("Model output missing required tags, retrying...")
                return None, None

            formatted_output = format_model_output(model_output)
            if formatted_output is None:
                print("Failed to extract content from tags, retrying...")
                return None, None

            pred_score = extract_overall_score(model_output)
            return formatted_output, pred_score
        else:
            print(f"API returned error status {response.status_code}: {response.text}")
            if retry_count >= max_retries:
                raise Exception(f"POST error {response.status_code}: {response.text}")
            return None, None
    except requests.exceptions.ConnectTimeout:
        print("Connection timeout (>200s)")
        if retry_count >= max_retries:
            raise Exception("Connection timeout")
        return None, None
    except requests.exceptions.ReadTimeout:
        print("Read timeout (>200s)")
        if retry_count >= max_retries:
            raise Exception("Read timeout")
        return None, None
    except Exception as e:
        print(f"Unexpected error during API call: {str(e)}")
        if retry_count >= max_retries:
            raise e
        return None, None

def get_latest_checkpoint(checkpoint_dir):
    """Get the latest checkpoint file and its processed count"""
    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "checkpoint_*.json"))
    if not checkpoint_files:
        return None, 0

    latest_checkpoint = None
    max_count = 0
    for checkpoint in checkpoint_files:
        try:
            count = int(os.path.basename(checkpoint).split('_')[1])
            if count > max_count:
                max_count = count
                latest_checkpoint = checkpoint
        except (ValueError, IndexError):
            continue

    return latest_checkpoint, max_count

def save_checkpoint(results, processed_count, checkpoint_dir):
    """Save results to a checkpoint file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_file = os.path.join(checkpoint_dir, f"checkpoint_{processed_count}_{timestamp}.json")
    with open(checkpoint_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"Checkpoint saved: {checkpoint_file}")

def split_data(data, num_chunks):
    # Split data into num_chunks as evenly as possible
    chunk_size = len(data) // num_chunks
    remainder = len(data) % num_chunks
    chunks = []
    start = 0
    for i in range(num_chunks):
        end = start + chunk_size + (1 if i < remainder else 0)
        chunks.append(data[start:end])
        start = end
    return chunks
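
A quick sanity check of split_data with illustrative values: the remainder is spread over the first chunks, so 11 items across 3 workers yield sizes 4, 4 and 3:

# Illustrative only; any list works.
chunks = split_data(list(range(11)), 3)
print([len(c) for c in chunks])   # -> [4, 4, 3]
print(chunks[0])                  # -> [0, 1, 2, 3]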

def process_chunk(args_tuple):
    chunk_data, chunk_idx, args = args_tuple
    results = []
    error_results = []
    save_file_name = f"{os.path.splitext(args.output_file)[0]}_chunk{chunk_idx}.json"
    error_file_name = f"{os.path.splitext(args.error_file)[0]}_chunk{chunk_idx}.json"
    checkpoint_dir = f"{args.checkpoint_dir}_chunk{chunk_idx}"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    max_prediction_retries = args.max_retries
    total_count = 0
    for item in tqdm(chunk_data, desc=f"Processing chunk {chunk_idx}"):
        key = item.get('key')
        text_input = item.get('process_dialogue')  # use the process_dialogue field
        if not text_input:
            print(f"No text input found for key {key}, skipping...")
            continue
        prediction_retry_count = 0
        success = False
        while prediction_retry_count < max_prediction_retries and not success:
            try:
                model_output, pred_score = make_api_call(text_input)
                if model_output is None or pred_score is None:
                    prediction_retry_count += 1
                    print(f"API call failed for key {key}, retry {prediction_retry_count}/{max_prediction_retries}")
                    continue

                # Only keep the result when the predicted score is 2
                if pred_score == 2:
                    success = True
                    results.append({
                        "key": key,
                        "text_input": text_input,
                        "model_output": model_output,
                        "predicted_score": pred_score,
                        "prediction_attempts": prediction_retry_count + 1
                    })
                    print(f"Success! Predicted score 2 for key {key} after {prediction_retry_count + 1} attempts")
                else:
                    prediction_retry_count += 1
                    print(f"Predicted score {pred_score} for key {key}, retry {prediction_retry_count}/{max_prediction_retries}")
                    if prediction_retry_count >= max_prediction_retries:
                        print(f"Max retries reached for key {key}, saving with score {pred_score}")
                        results.append({
                            "key": key,
                            "text_input": text_input,
                            "model_output": model_output,
                            "predicted_score": pred_score,
                            "prediction_attempts": prediction_retry_count
                        })
                        success = True
                    continue

                # Persist the results collected so far
                with open(save_file_name, "w", encoding="utf-8") as f:
                    json.dump(results, f, indent=2, ensure_ascii=False)
                total_count += 1
                if total_count % args.checkpoint_interval == 0:
                    save_checkpoint(results, total_count, checkpoint_dir)
            except Exception as e:
                error_msg = str(e)
                print(f"Exception for key {key}: {error_msg}")
                error_results.append({
                    "key": key,
                    "text_input": text_input,
                    "error": f"Exception: {error_msg}"
                })
                break
    # Save error results
    with open(error_file_name, "w", encoding="utf-8") as f:
        json.dump(error_results, f, indent=2, ensure_ascii=False)
    # Final save of the results
    with open(save_file_name, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    return save_file_name, error_file_name

def merge_json_files(file_list, output_file):
    merged = []
    for fname in file_list:
        if os.path.exists(fname):
            with open(fname, 'r', encoding='utf-8') as f:
                merged.extend(json.load(f))
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

def main():
    args = parse_args()
    with open(args.input_file, 'r', encoding='utf-8') as f:
        all_data = json.load(f)
    num_chunks = args.num_processes
    chunks = split_data(all_data, num_chunks)
    pool = multiprocessing.Pool(num_chunks)
    chunk_args = [(chunks[i], i, args) for i in range(num_chunks)]
    results = pool.map(process_chunk, chunk_args)
    pool.close()
    pool.join()
    # Merge all per-chunk output files
    output_files = [r[0] for r in results]
    error_files = [r[1] for r in results]
    merge_json_files(output_files, args.output_file)
    merge_json_files(error_files, args.error_file)
    print(f"Results saved to {args.output_file}")
    print(f"Errors saved to {args.error_file}")

if __name__ == "__main__":
    main()
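
Each worker writes per-chunk files that main() merges at the end. A minimal sketch of the naming scheme, assuming the default --output_file and --num_processes values from parse_args:

# With --output_file cotSFT_gemini.json and --num_processes 5, process_chunk()
# writes cotSFT_gemini_chunk0.json ... cotSFT_gemini_chunk4.json, and main()
# merges them back into cotSFT_gemini.json. The same merge can be done manually:
chunk_files = [f"cotSFT_gemini_chunk{i}.json" for i in range(5)]
merge_json_files(chunk_files, "cotSFT_gemini.json")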
cotSFT_new/cotSFT_gemini.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/delay_output.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/.ipynb_checkpoints/delay_output-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/.ipynb_checkpoints/process_transcription-checkpoint.py ADDED
@@ -0,0 +1,80 @@
import json

def seconds_to_mmss(seconds):
    minutes = int(seconds // 60)
    seconds = int(seconds % 60)
    return f"{minutes:02d}:{seconds:02d}"

filename = "texterror_output"

def is_overlapping(current_segment, other_segments):
    """Check if the current segment overlaps with any other segment."""
    current_start = current_segment['start_time']
    current_end = current_segment['end_time']

    for segment in other_segments:
        if segment == current_segment:
            continue

        other_start = segment['start_time']
        other_end = segment['end_time']

        # Check if there's an overlap
        if (current_start < other_end and current_end > other_start):
            return True

    return False
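
The overlap test uses strict inequalities, so segments that merely touch at an endpoint do not count as overlapping. A small illustration with invented segment values:

seg_a = {"speaker": "A", "start_time": 3.0, "end_time": 7.5}
seg_b = {"speaker": "B", "start_time": 6.8, "end_time": 9.0}
seg_c = {"speaker": "A", "start_time": 9.5, "end_time": 11.0}

print(is_overlapping(seg_a, [seg_a, seg_b, seg_c]))  # True  (A and B share 6.8-7.5)
print(is_overlapping(seg_c, [seg_a, seg_b, seg_c]))  # False (C starts after B ends)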

def process_transcriptions():
    # Read the input file named by the `filename` variable above (./texterror_output.json)
    with open(f'./{filename}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # List to store results for all conversations
    results = []

    # Process each conversation
    for conversation_id, conversation in data.items():
        segments = conversation.get('segments', [])
        audio_path = conversation.get('stereo_audio', [])
        # Sort segments by start time
        segments.sort(key=lambda x: x['start_time'])

        # Process each segment and create transcription lines
        transcription_lines = []

        for segment in segments:
            speaker = segment['speaker']
            start_time = segment['start_time']
            end_time = segment['end_time']
            text = segment['text']
            original_text = segment['original_text']
            original_text = original_text.replace("[interrupt] ", "").strip()
            # Format timestamp
            timestamp = f"[{seconds_to_mmss(start_time)} - {seconds_to_mmss(end_time)}]"

            # Check if this segment overlaps with any other segment
            has_overlap = is_overlapping(segment, segments)

            # Format the line
            if has_overlap:
                line = f"{timestamp} Speaker {speaker}: {original_text}"
            else:
                line = f"{timestamp} Speaker {speaker}: {text}"

            transcription_lines.append(line)

        # Create result entry
        result = {
            "key": conversation_id,
            "audio_url": audio_path,
            "model_output": "\n".join(transcription_lines)
        }
        results.append(result)

    # Save the results to a JSON file
    output_file = f'./{filename}_transcription.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    process_transcriptions()
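
For orientation, a hypothetical single segment (values invented) is rendered into a transcript line as follows:

segment = {"speaker": "A", "start_time": 65.2, "end_time": 68.9,
           "text": "Sure, that works for me."}
timestamp = f"[{seconds_to_mmss(segment['start_time'])} - {seconds_to_mmss(segment['end_time'])}]"
print(f"{timestamp} Speaker {segment['speaker']}: {segment['text']}")
# -> [01:05 - 01:08] Speaker A: Sure, that works for me.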
cotSFT_new/filtered_output/.ipynb_checkpoints/texterror_output_transcription_gemini-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/alltrain/.ipynb_checkpoints/correct_output_transcription_merged_output_990-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/alltrain/correct_output_transcription_merged_output_990.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/alltrain/overlaps1_gemini_merged_output.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/alltrain/texterror_output_transcription_merged_output.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/.ipynb_checkpoints/correct_output_transcription_gemini_error-checkpoint.json ADDED
@@ -0,0 +1 @@
[]
cotSFT_new/filtered_output/correc/correct_output_transcription.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk2.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk3.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk4.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk6.json ADDED
The diff for this file is too large to render. See raw diff
 
cotSFT_new/filtered_output/correc/correct_output_transcription_gemini_chunk7.json ADDED
The diff for this file is too large to render. See raw diff