maiChartGen / mai_parser /parser.py
Goldgom's picture
Upload MaiGenerator model (epoch 10) and inference code
8061544
Raw
History Blame Contribute Delete
16.5 kB
"""
Parser for maidata.txt files (MaichartConverter format).
Parses:
1. Header key-value pairs (&key=value)
2. Note sequence data (&inote_N= blocks)
The note format is measure-based:
{beat_div}pos1,pos2,pos3,...
Where each position can be:
- A number 1-8 (button position)
- Empty (rest)
- Number/Number (simultaneous tap)
- Number + modifiers (h=hold, b=break, x=firework, >/</-/^/V/v = slide patterns)
- Touch region (C, B1-B8, E1-E8, A1-A8, D1-D8)
- End marker: "E"
"""
from __future__ import annotations
import re
from pathlib import Path
from typing import Optional
from .models import Cabinet, Chart, Difficulty, Song, TouchNote
# ─── Regex patterns for note parsing ────────────────────────────────────────
# Beat-division marker such as "{8}"; group 1 is the divisor. The pattern is
# not anchored: callers use .match() for a line prefix and .search()/.sub()
# for markers embedded in a token.
_RE_BEAT_DIV = re.compile(r"\{(\d+)\}")
# Inline BPM change such as "(200)" or "(150.5)"; group 1 is the BPM value.
_RE_BPM_CHANGE = re.compile(r"\((\d+(?:\.\d+)?)\)")
# Hold duration "h[a:b]"; groups 1 and 2 are the two integers in the brackets.
_RE_HOLD = re.compile(r"h\[(\d+):(\d+)\]")
# Slide segment that carries a duration: one of > < V v - ^, a target number,
# then "[a:b]". Groups: (1) slide character, (2) target, (3)/(4) duration.
# Examples: 3>6[4:1], 6<3[8:11], 3-6[4:1], 3v6[16:3], 5>2[8:11], 3>8-6v3[8:9]
_RE_SLIDE_DUR = re.compile(r"([><Vv\-\^])(\d+)\[(\d+):(\d+)\]")
# Simple slide without a duration (3>6, 4-1): number, slide character, number.
_RE_SIMPLE_SLIDE = re.compile(r"(\d+)[><Vv\-\^](\d+)")
# Break suffix at the end of the token: "b" or "$b". The first alternative
# already matches the "b" of "$b"; only the matched part is removed by .sub().
_RE_BREAK = re.compile(r"b$|\$b$")
# Firework/EX suffix: trailing "x".
# NOTE(review): not referenced anywhere in the visible part of this file.
_RE_FIREWORK = re.compile(r"x$")
# Touch note identifiers (whole-string match): C, Cf, Ch, B1-B8, E1-E8, A1-A8, D1-D8
_RE_TOUCH = re.compile(r"^(C[fh]?|[ABDE][1-8])$")
# Simultaneous tap: two to four numbers separated by "/", e.g. 1/8, 2/5/7
_RE_SIMUL = re.compile(r"^(\d+)/(\d+)(?:/(\d+))?(?:/(\d+))?$")
# A token consisting only of digits (the 1-8 button range is not enforced here)
_RE_POS = re.compile(r"^\d+$")
def parse_header_line(line: str) -> tuple[str, str]:
    """Split one ``&key=value`` header line into a ``(key, value)`` pair.

    Surrounding whitespace and a single leading ``&`` are discarded. Only the
    first ``=`` separates key from value, so the value may itself contain
    ``=``. A line without ``=`` yields the whole line as the key and ``""``
    as the value.
    """
    body = line.strip()
    if body[:1] == "&":
        body = body[1:]  # drop the header marker
    name, _sep, rest = body.partition("=")
    return name.strip(), rest.strip()
def _extract_bpm_changes(text: str) -> list[tuple[int, float]]:
    """Collect every inline BPM marker found in *text*.

    Returns one ``(offset, bpm)`` pair per marker, in order of appearance,
    where ``offset`` is the character index at which the marker starts in
    *text* and ``bpm`` is the marker's value as a float.
    """
    return [
        (found.start(), float(found.group(1)))
        for found in _RE_BPM_CHANGE.finditer(text)
    ]
def _parse_single_note(token: str, current_beat_div: int) -> TouchNote:
    """
    Parse a single note token (one comma-separated segment).

    Args:
        token: Raw text of one slot. Inline BPM markers such as ``(200)``
            are ignored for note parsing.
        current_beat_div: Beat division in effect for this slot; stored on
            the returned note as ``beat_div``.

    Returns:
        A TouchNote whose ``raw`` is the unmodified token. Flags,
        ``positions``, ``touch_regions``, ``slide_path`` and
        ``hold_duration`` are filled in by the first rule that matches.
        Numbers inside ``[...]`` are durations and are never reported as
        positions.

    Examples:
        ""        → rest
        "E"       → end marker
        "1"       → tap at position 1
        "1/8"     → simultaneous tap at 1 and 8
        "1b"      → break at position 1
        "1x"      → firework at position 1
        "1h[4:1]" → hold at position 1, duration 4:1
        "3>6[4:1]"→ slide from 3 to 6, duration 4:1
        "3-6"     → simple slide 3→6
        "C"       → touch center
        "B7/B6"   → touch simultaneous
        "Ch[2:1]" → touch hold
    """
    note = TouchNote(beat_div=current_beat_div, raw=token)

    # Empty = rest
    if not token:
        note.is_rest = True
        return note

    # End marker
    if token.strip() == "E":
        note.is_end = True
        return note

    # Remove inline BPM changes for parsing: (200), (150.5)
    t = _RE_BPM_CHANGE.sub("", token.strip()).strip()
    if not t:
        note.is_rest = True
        return note

    # --- Check for touch notes ---
    # Touch notes can have modifiers like Ch[2:1], C, B7/B6.
    # Every "/"-separated part must reduce to a touch identifier once its
    # bracket group and surrounding modifier characters are removed.
    modifier_chars = "hbfxqpb$Vv*^-><"
    touch_parts = re.split(r"[/]", t)
    all_touch = all(
        _RE_TOUCH.match(re.sub(r"\[.*?\]", "", p).strip(modifier_chars))
        for p in touch_parts
    )
    if all_touch and any(
        re.match(r"^[A-E]", p.strip(modifier_chars)) for p in touch_parts
    ):
        note.is_touch = True
        # Extract touch regions (strip modifiers)
        for p in touch_parts:
            clean = re.sub(r"\[.*?\]", "", p).strip(modifier_chars)
            if clean:
                note.touch_regions.append(clean)
        # Parse hold/touch-hold
        hold_m = _RE_HOLD.search(t)
        if hold_m:
            note.is_hold = True
            note.hold_duration = (int(hold_m.group(1)), int(hold_m.group(2)))
        return note

    # --- Parse button note ---
    # Suffixes are peeled off in a fixed order: break, then firework, then
    # hold duration, then star.
    if _RE_BREAK.search(t):
        note.is_break = True
        t = _RE_BREAK.sub("", t).rstrip("$")

    if t.endswith("x"):
        note.is_firework = True
        t = t[:-1]

    hold_m = _RE_HOLD.search(t)
    if hold_m:
        note.is_hold = True
        note.hold_duration = (int(hold_m.group(1)), int(hold_m.group(2)))
        t = _RE_HOLD.sub("", t)

    # Star/EX note (* suffix)
    if t.endswith("*"):
        note.is_star = True
        t = t[:-1]

    # Position numbers are collected from the token with bracketed groups
    # removed; otherwise the "[a:b]" duration digits would be reported as
    # button positions (e.g. "3>6[4:1]" would yield [3, 6, 4, 1]).
    bare = re.sub(r"\[.*?\]", "", t)

    # Slide with duration: 3>6[4:1], 3-6[8:11], 3v6[16:3], 3>8-6v3[8:9]
    slide_m = _RE_SLIDE_DUR.search(t)
    if slide_m:
        note.is_slide = True
        note.slide_path = _extract_slide_path(t)
        # Hold markers were already stripped from t above, so searching t
        # again can never find one; decide from the earlier match instead.
        # Only when the token carried no hold does the slide's own
        # [beat:subdiv] become the note duration.
        if hold_m is None:
            note.hold_duration = (int(slide_m.group(3)), int(slide_m.group(4)))
        note.positions = [int(p) for p in re.findall(r"\d+", bare)[:4]]
        return note

    # Simple slide: 3>6, 4-1, 8<5
    if _RE_SIMPLE_SLIDE.search(t):
        note.is_slide = True
        note.positions = [int(p) for p in re.findall(r"\d+", bare)[:4]]
        return note

    # Single V (slide continuation / endpoint)
    if re.match(r"^V\d*$", t):
        note.is_slide = True
        return note

    # Simultaneous tap: 1/8, 2/6/7
    simul_m = _RE_SIMUL.match(t)
    if simul_m:
        note.is_simultaneous = True
        note.positions = [int(g) for g in simul_m.groups() if g is not None]
        return note

    # Simple tap: digits only
    if _RE_POS.match(t):
        note.positions = [int(t)]
        return note

    # Special modifiers: 'q', 'p', 'w', 'z', '$' etc. — treat as tap if has number
    num_match = re.findall(r"\d+", bare)
    if num_match:
        note.positions = [int(p) for p in num_match[:4]]
        return note

    # Unknown / rest
    note.is_rest = True
    return note
def _extract_slide_path(token: str) -> list[int]:
"""Extract all position numbers from a slide pattern like '3>8-6v3[8:9]' → [3,8,6,3]."""
nums = re.findall(r"(?<![a-zA-Z\[:])(\d+)(?![a-zA-Z\]])", token)
return [int(n) for n in nums]
def _parse_note_sequence(raw_text: str) -> list[TouchNote]:
    """
    Parse the full note sequence from an &inote_N block.

    Handles:
    - Beat division changes: {4}, {8}, {16}, etc. A change stays in effect
      for every following slot until the next change, whether it appears at
      the start of a line or in the middle of one.
    - Inline BPM changes: (200)
    - Comma-separated slots; a comma terminates a slot, so a line ending in
      "," does not contribute an extra empty slot.
    - End marker: E (parsing stops after it)

    Args:
        raw_text: Text of one note block, lines separated by "\\n".

    Returns:
        One TouchNote per slot, in order, up to and including the end marker
        if one is present.
    """
    notes: list[TouchNote] = []
    current_div = 4  # default beat division

    for raw_line in raw_text.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Beat division change at the start of the line
        bd_match = _RE_BEAT_DIV.match(line)
        if bd_match:
            current_div = int(bd_match.group(1))
            line = line[bd_match.end():]

        # The line may start with an inline BPM change like (173){1},
        bpm_match = _RE_BPM_CHANGE.match(line)
        if bpm_match:
            line = line[bpm_match.end():]
            bd_after_bpm = _RE_BEAT_DIV.match(line)
            if bd_after_bpm:
                current_div = int(bd_after_bpm.group(1))
                line = line[bd_after_bpm.end():]

        # Nothing left: the line only announced a division and/or BPM.
        if not line:
            continue

        tokens = line.split(",")
        # "1,2," splits into ["1", "2", ""]; the final empty piece is what
        # follows the last comma, not a slot of its own. Keeping it would
        # insert a phantom rest at every line break.
        if tokens[-1] == "":
            tokens.pop()

        for token in tokens:
            bd_inline = _RE_BEAT_DIV.search(token)
            if bd_inline:
                # Persist the change so later slots use it as well.
                current_div = int(bd_inline.group(1))
                token = _RE_BEAT_DIV.sub("", token).strip()
            note = _parse_single_note(token, current_div)
            notes.append(note)
            if note.is_end:
                return notes  # stop at E marker
    return notes
def parse_level_value(level_str: str) -> tuple[float, bool, bool]:
    """
    Interpret a chart level string.

    Returns ``(numeric_value, is_plus, is_ura)`` where ``is_plus`` /
    ``is_ura`` report whether the string contains ``+`` / ``?``. Those two
    characters are removed and the remainder is read as a float: an empty
    remainder gives ``0.0`` and anything ``float()`` rejects gives ``-1.0``.

    Examples:
        "12.4"  → (12.4, False, False)
        "13+"   → (13.0, True, False)
        "14.6?" → (14.6, False, True)
        "7+"    → (7.0, True, False)
        "耐"    → (-1.0, False, False)  # non-numeric
        ""      → (0.0, False, False)
    """
    plus_flag = level_str.find("+") != -1
    ura_flag = level_str.find("?") != -1

    # Drop only the two marker characters, then trim whitespace.
    remainder = level_str.translate({ord("+"): None, ord("?"): None}).strip()
    if not remainder:
        return 0.0, plus_flag, ura_flag

    try:
        number = float(remainder)
    except ValueError:
        number = -1.0
    return number, plus_flag, ura_flag
def _build_chart(idx: int, level: str) -> Chart:
    """Create a Chart (without notes) for difficulty *idx* from its raw level string."""
    level_val, is_plus, is_ura = parse_level_value(level)
    diff = Difficulty.from_index(idx)
    return Chart(
        difficulty_index=idx,
        difficulty=diff,
        level=level,
        level_value=level_val,
        is_plus=is_plus,
        is_ura=is_ura,
    )


def parse_maidata(content: str, song_id: str = "",
                  maidata_path: str = "", audio_path: str = "") -> Song:
    """
    Parse a complete maidata.txt content string into a Song object.

    Lines starting with "&" are headers (``&key=value``). Lines that follow
    an ``&inote_N`` header are accumulated as the note text of difficulty N
    until the next header. After all lines are read, each note block is
    parsed and attached to its chart. A chart is created when its level
    parses to a positive number, or later when a non-empty level that did
    not (e.g. "耐") turns out to have note data.

    Args:
        content: The full text content of maidata.txt
        song_id: Folder name / song identifier
        maidata_path: Relative path to maidata.txt
        audio_path: Relative path to track.mp3

    Returns:
        Song object with all parsed data
    """
    song = Song(song_id=song_id, maidata_path=maidata_path, audio_path=audio_path)

    # Strip BOM if present
    if content.startswith("\ufeff"):
        content = content[1:]

    raw_notes: dict[int, str] = {}  # difficulty_index → raw note text
    charters: dict[int, str] = {}  # difficulty_index → charter from &des_N
    current_note_idx: Optional[int] = None
    current_note_lines: list[str] = []

    for line in content.split("\n"):
        line_stripped = line.strip()
        if not line_stripped:
            continue

        if not line_stripped.startswith("&"):
            # Continuation of the note block currently being collected.
            if current_note_idx is not None:
                current_note_lines.append(line_stripped)
            continue

        key, value = parse_header_line(line_stripped)

        # A new header ends any note block in progress. The buffer is reset
        # unconditionally so nothing can carry over into a later block.
        if current_note_idx is not None:
            raw_notes[current_note_idx] = "\n".join(current_note_lines)
        current_note_idx = None
        current_note_lines = []

        if key.startswith("inote_"):
            # Note data block
            try:
                current_note_idx = int(key.replace("inote_", ""))
            except ValueError:
                current_note_idx = None
            # Keep the inline value only when the block has a valid index;
            # otherwise it would be prepended to the next valid block.
            if current_note_idx is not None and value:
                current_note_lines.append(value)
        elif key.startswith("lv_"):
            idx_str = key.replace("lv_", "")
            # isdecimal() rather than isdigit(): the latter accepts
            # characters such as "²" that int() cannot convert.
            if not idx_str.isdecimal():
                continue
            idx = int(idx_str)
            if not value:
                continue  # skip empty levels (no chart)
            song.levels[idx] = value
            if parse_level_value(value)[0] > 0:
                # Create chart placeholder (notes parsed later)
                song.charts[idx] = _build_chart(idx, value)
        elif key.startswith("des_"):
            try:
                idx = int(key.replace("des_", ""))
            except ValueError:
                continue
            song.charters[idx] = value
            charters[idx] = value
        elif key == "title":
            song.title = value
            song.title_clean = _clean_title(value)
            song.tags = _extract_tags(value)
            song.is_full = "[FULL]" in value or "_FULLTOUCH" in value
            song.is_fulltouch = "_FULLTOUCH" in value
        elif key == "artist":
            song.artist = value
        elif key == "artistid":
            try:
                song.artist_id = int(value)
            except ValueError:
                pass  # leave the Song default when the id is not an integer
        elif key == "wholebpm":
            try:
                song.bpm = float(value)
            except ValueError:
                pass  # leave the Song default when the BPM is not numeric
        elif key == "first":
            try:
                song.first = float(value)
            except ValueError:
                pass  # leave the Song default when the value is not numeric
        elif key == "genre":
            song.genre = value
            song.is_utage = "宴会場" in value
        elif key == "genreid":
            try:
                song.genre_id = int(value)
            except ValueError:
                pass  # leave the Song default when the id is not an integer
        elif key == "cabinet":
            try:
                song.cabinet = Cabinet(value.upper())
            except ValueError:
                song.cabinet = Cabinet.UNKNOWN
        elif key == "version":
            song.version = value
        elif key == "shortid":
            try:
                song.short_id = int(value)
            except ValueError:
                pass  # leave the Song default when the id is not an integer
        elif key == "des":
            song.description = value
        elif key == "ChartConverter":
            song.converter = value
        elif key == "ChartConvertTool":
            song.converter_tool = value
        elif key == "ChartConvertToolVersion":
            song.converter_version = value

    # Flush last note block
    if current_note_idx is not None:
        raw_notes[current_note_idx] = "\n".join(current_note_lines)

    # ── Parse all collected note sequences ──
    for idx, raw in raw_notes.items():
        notes = _parse_note_sequence(raw)
        if idx in song.charts:
            song.charts[idx].notes = notes
            song.charts[idx].compute_stats()
        elif idx in song.levels and song.levels[idx]:
            # Chart has notes + level but was skipped because the level did
            # not parse to a positive number (e.g. "耐")
            chart = _build_chart(idx, song.levels[idx])
            chart.notes = notes
            chart.compute_stats()
            song.charts[idx] = chart

    # Attach charters last so the result does not depend on whether &des_N
    # came before or after &lv_N, and so charts created in the loop above
    # receive theirs too.
    for idx, charter in charters.items():
        if idx in song.charts:
            song.charts[idx].charter = charter

    return song
def _clean_title(title: str) -> str:
"""Remove bracket tags like [SD], [DX], [宴] from title."""
return re.sub(r"\[.*?\]", "", title).strip()
def _extract_tags(title: str) -> list[str]:
"""Extract bracket tags from title, e.g. [SD], [DX], [宴]."""
return re.findall(r"\[(.*?)\]", title)
def parse_maidata_file(filepath: str | Path) -> Song:
    """
    Read a maidata.txt file from disk and parse it.

    The file is decoded as UTF-8. The song id is the name of the file's
    parent directory, the maidata path is the bare file name, and the audio
    path is always "track.mp3".

    Args:
        filepath: Path to the maidata.txt file
    Returns:
        Parsed Song object
    """
    source = Path(filepath)
    return parse_maidata(
        source.read_text(encoding="utf-8"),
        song_id=source.parent.name,
        maidata_path=source.name,
        audio_path="track.mp3",
    )