hetchyy Claude Opus 4.6 committed on
Commit
632df05
·
1 Parent(s): e3c24fc

Widen DP lookback window and fix null-typed parquet columns

Browse files

Increase LOOKBACK_WORDS from 15 to 20 for better alignment coverage
on segments that start further back from the expected position. Cast
null-typed Arrow columns to string in usage logger so all parquet
shards share a consistent schema, preventing HF dataset viewer errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. config.py +1 -1
  2. src/core/usage_logger.py +10 -0
config.py CHANGED
@@ -128,7 +128,7 @@ COST_INSERTION = 1.0 # Insert phoneme from reference (R)
128
  COST_DELETION = 0.8 # Delete phoneme from ASR (P)
129
 
130
  # Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
131
- LOOKBACK_WORDS = 15 # Window words to look back from pointer for starting positions
132
  LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
133
  MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
134
  MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
 
128
  COST_DELETION = 0.8 # Delete phoneme from ASR (P)
129
 
130
  # Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
131
+ LOOKBACK_WORDS = 20 # Window words to look back from pointer for starting positions
132
  LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
133
  MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
134
  MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
src/core/usage_logger.py CHANGED
@@ -165,6 +165,16 @@ if _HAS_DEPS:
165
  row[feature] = None
166
 
167
  table = pa.Table.from_pylist(rows)
 
 
 
 
 
 
 
 
 
 
168
  table = table.replace_schema_metadata(
169
  {"huggingface": json.dumps({"info": {"features": schema}})}
170
  )
 
165
  row[feature] = None
166
 
167
  table = pa.Table.from_pylist(rows)
168
+
169
+ # Cast null-typed columns to string so all parquet shards share
170
+ # the same Arrow schema (prevents HF viewer concat errors).
171
+ for i, field in enumerate(table.schema):
172
+ if pa.types.is_null(field.type):
173
+ table = table.set_column(
174
+ i, field.name,
175
+ pa.array([None] * len(table), type=pa.string()),
176
+ )
177
+
178
  table = table.replace_schema_metadata(
179
  {"huggingface": json.dumps({"info": {"features": schema}})}
180
  )