one-piece-analysis / utils /data_loader.py
Fluoron's picture
Upload folder using huggingface_hub
97859e4 verified
Raw
History Blame Contribute Delete
1.98 kB
import re
from pathlib import Path
import pandas as pd
_SUBTITLE_SUFFIXES = frozenset({'.ass', '.srt', '.txt'})


def _extract_text_lines(suffix, raw_lines):
    """Return the non-empty spoken-text lines of one subtitle file.

    Args:
        suffix: Lower-cased file extension ('.ass', '.srt' or anything else,
            which is treated as plain text).
        raw_lines: Lines as returned by ``file.readlines()`` (line
            terminators still attached).

    Returns:
        A list of stripped, non-empty text fragments in file order.
    """
    if suffix == '.ass':
        # Filter on the 'Dialogue:' prefix directly instead of skipping a
        # fixed number of header lines, so headers of any length work.
        # The text is the 10th comma-separated field; it may itself contain
        # commas, hence the re-join of everything from index 9 onwards.
        texts = (
            ",".join(line.split(',')[9:])
            .replace('\\N', ' ')
            .replace('\\n', ' ')
            # readlines() keeps the trailing newline; strip it so the joined
            # script does not contain embedded line breaks (the .srt and
            # .txt branches already strip their lines).
            .strip()
            for line in raw_lines
            if line.startswith('Dialogue:')
        )
        return [text for text in texts if text]

    stripped = [line.strip() for line in raw_lines]
    if suffix == '.srt':
        # SRT format: drop sequence numbers, timestamp rows and blank lines.
        return [
            line for line in stripped
            if line and not line.isdigit() and '-->' not in line
        ]

    # Plain text: keep every non-blank line.
    return [line for line in stripped if line]


def load_subtitles_dataset(dataset_path) -> pd.DataFrame:
    """Load every subtitle file in a directory into a DataFrame.

    Files directly inside ``dataset_path`` with an ``.ass``, ``.srt`` or
    ``.txt`` extension (case-insensitive) are read, reduced to their dialogue
    text and joined into one space-separated script per file. Files whose
    script is empty are skipped.

    Args:
        dataset_path: Directory containing the subtitle files; ``~`` is
            expanded.

    Returns:
        A DataFrame with columns ``episode`` (last integer found in the file
        name, or 0 when there is none), ``script`` and ``source_file``,
        sorted by ``episode`` then ``source_file`` with a fresh index.

    Note:
        Errors raised by ``Path.iterdir`` (for example when the directory
        does not exist) propagate to the caller.
    """
    dataset_dir = Path(dataset_path).expanduser()
    subtitles_paths = [
        path
        for path in dataset_dir.iterdir()
        if path.is_file() and path.suffix.lower() in _SUBTITLE_SUFFIXES
    ]

    scripts = []
    episode_num = []
    source_files = []

    for path in subtitles_paths:
        # utf-8-sig transparently drops a BOM; undecodable bytes are skipped
        # on purpose so one malformed file does not abort the whole load.
        with open(path, 'r', encoding='utf-8-sig', errors='ignore') as file:
            raw_lines = file.readlines()

        script = " ".join(_extract_text_lines(path.suffix.lower(), raw_lines))
        if not script.strip():
            continue

        # Episode number: the last run of digits in the file name.
        numbers = re.findall(r'\d+', path.name)
        episode = int(numbers[-1]) if numbers else 0

        scripts.append(script)
        episode_num.append(episode)
        source_files.append(path.name)

    df = pd.DataFrame.from_dict({
        "episode": episode_num,
        "script": scripts,
        "source_file": source_files,
    })
    # iterdir() order is arbitrary, so sort for a deterministic result.
    df = df.sort_values(["episode", "source_file"]).reset_index(drop=True)
    return df