File size: 3,116 Bytes
2cbfbf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/**
 * Language instruction extraction utilities
 * Consolidates duplicated logic from fetch-data.ts
 */

/**
 * Extract language instructions from episode data rows.
 * Consolidates logic that was previously duplicated in fetch-data.ts.
 *
 * Rows are probed in the order given by `sampleIndices`. The first row whose
 * `language_instruction` field is a non-empty string wins: its primary
 * instruction is collected, followed by any consecutively numbered fields
 * (`language_instruction_2`, `language_instruction_3`, ...) for as long as
 * those fields hold strings. Once a row matches, later indices are not
 * consulted.
 *
 * Indices that do not address an existing row (negative, fractional, NaN or
 * out of range) and rows that are null/undefined are skipped rather than
 * causing a runtime error.
 *
 * @param episodeData - Array of episode data rows
 * @param sampleIndices - Indices of rows to check (default: [0] for first row only)
 * @returns Newline-joined language instructions, or undefined if none found
 */
export function extractLanguageInstructions(
  episodeData: Record<string, unknown>[],
  sampleIndices: number[] = [0],
): string | undefined {
  if (episodeData.length === 0) return undefined;

  for (const idx of sampleIndices) {
    // Only whole, in-range indices can address a row; anything else would
    // yield `undefined` and must be skipped instead of dereferenced.
    if (!Number.isInteger(idx) || idx < 0 || idx >= episodeData.length) {
      continue;
    }

    // Sparse arrays or loosely-typed callers may still hand us a missing row.
    const row: Record<string, unknown> | undefined = episodeData[idx];
    if (row == null) continue;

    // The primary field must be a non-empty string for this row to count.
    const primary = row.language_instruction;
    if (typeof primary !== "string" || primary === "") continue;

    const instructions = [primary];

    // Collect numbered fields (language_instruction_2, _3, ...) until the
    // first one that is missing or not a string.
    for (let instructionNum = 2; ; instructionNum++) {
      const extra = row[`language_instruction_${instructionNum}`];
      if (typeof extra !== "string") break;
      instructions.push(extra);
    }

    // The first matching row decides the result; stop searching.
    return instructions.join("\n");
  }

  return undefined;
}

/**
 * Resolve a task description by using `taskIndex` as a position in `tasksData`.
 *
 * Only `number` and `bigint` indices are accepted (a bigint is converted with
 * `Number()`); any other type yields undefined, as does an index that is
 * negative or not below `tasksData.length`. On the selected entry, a string
 * `__index_level_0__` field takes priority over a string `task` field.
 *
 * @param taskIndex - Task index (can be BigInt or number)
 * @param tasksData - Array of task metadata objects
 * @returns Task string or undefined if not found
 */
export function extractTaskFromMetadata(
  taskIndex: unknown,
  tasksData: Record<string, unknown>[],
): string | undefined {
  // Normalize the index to a plain number, rejecting unsupported types.
  let position: number;
  if (typeof taskIndex === "number") {
    position = taskIndex;
  } else if (typeof taskIndex === "bigint") {
    position = Number(taskIndex);
  } else {
    return undefined;
  }

  // Bounds check: the index must address an element of tasksData.
  if (position < 0 || position >= tasksData.length) return undefined;

  const entry = tasksData[position];
  if (!entry) return undefined;

  // Candidate fields in priority order; the first one holding a string wins.
  const candidateFields = ["__index_level_0__", "task"];
  for (const field of candidateFields) {
    if (field in entry) {
      const value = entry[field];
      if (typeof value === "string") return value;
    }
  }

  return undefined;
}