Spaces:
Running on Zero
Running on Zero
Widen DP lookback window and fix null-typed parquet columns
Browse files
Increase LOOKBACK_WORDS from 15 to 20 for better alignment coverage
on segments that start further back from the expected position. Cast
null-typed Arrow columns to string in usage logger so all parquet
shards share a consistent schema, preventing HF dataset viewer errors.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- config.py +1 -1
- src/core/usage_logger.py +10 -0
config.py
CHANGED
|
@@ -128,7 +128,7 @@ COST_INSERTION = 1.0 # Insert phoneme from reference (R)
|
|
| 128 |
COST_DELETION = 0.8 # Delete phoneme from ASR (P)
|
| 129 |
|
| 130 |
# Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
|
| 131 |
-
LOOKBACK_WORDS = 15 # Window words to look back from pointer for starting positions
|
| 132 |
LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
|
| 133 |
MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
|
| 134 |
MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
|
|
|
|
| 128 |
COST_DELETION = 0.8 # Delete phoneme from ASR (P)
|
| 129 |
|
| 130 |
# Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
|
| 131 |
+
LOOKBACK_WORDS = 20 # Window words to look back from pointer for starting positions
|
| 132 |
LOOKAHEAD_WORDS = 10 # Window words to look ahead after expected end position
|
| 133 |
MAX_EDIT_DISTANCE = 0.25 # Max normalized edit distance for valid ayah match
|
| 134 |
MAX_SPECIAL_EDIT_DISTANCE = 0.35 # Max normalized edit distance for Basmala/Isti'adha detection
|
src/core/usage_logger.py
CHANGED
|
@@ -165,6 +165,16 @@ if _HAS_DEPS:
|
|
| 165 |
row[feature] = None
|
| 166 |
|
| 167 |
table = pa.Table.from_pylist(rows)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
table = table.replace_schema_metadata(
|
| 169 |
{"huggingface": json.dumps({"info": {"features": schema}})}
|
| 170 |
)
|
|
|
|
| 165 |
row[feature] = None
|
| 166 |
|
| 167 |
table = pa.Table.from_pylist(rows)
|
| 168 |
+
|
| 169 |
+
# Cast null-typed columns to string so all parquet shards share
|
| 170 |
+
# the same Arrow schema (prevents HF viewer concat errors).
|
| 171 |
+
for i, field in enumerate(table.schema):
|
| 172 |
+
if pa.types.is_null(field.type):
|
| 173 |
+
table = table.set_column(
|
| 174 |
+
i, field.name,
|
| 175 |
+
pa.array([None] * len(table), type=pa.string()),
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
table = table.replace_schema_metadata(
|
| 179 |
{"huggingface": json.dumps({"info": {"features": schema}})}
|
| 180 |
)
|