humigencev2 / utils /dataset_loader.py
lilbablo's picture
chore: initial public release of Humigence (CLI wizard + dual-GPU fine-tuning)
7275aef
# utils/dataset_loader.py
import os
from pathlib import Path
from typing import List, Tuple, Union
def list_local_datasets(base_dir: str = "~/humigence_data") -> List[Tuple[str, str, Union[int, str]]]:
    """
    List the JSONL datasets found directly inside a directory.

    Only entries whose file name ends with ".jsonl" are reported; the
    directory is not searched recursively. Results are sorted by file name
    so the listing is stable across runs and filesystems.

    Args:
        base_dir: Directory to scan; a leading "~" is expanded to the
            user's home directory (default: ~/humigence_data).

    Returns:
        List of (name, path, count) tuples, one per ".jsonl" entry, where
        name is the file name without its extension, path is base_dir
        joined with the file name, and count is the number of lines in the
        file, or "?" if the file could not be read or decoded as UTF-8.
        An empty list is returned when the directory is missing, is not a
        directory, or cannot be listed.
    """
    base = os.path.expanduser(base_dir)

    try:
        # os.listdir yields entries in arbitrary order; sort for a
        # deterministic listing.
        entries = sorted(os.listdir(base))
    except OSError:
        # Missing path, not a directory, or permission denied: nothing to list.
        return []

    datasets = []
    for filename in entries:
        if not filename.endswith(".jsonl"):
            continue

        path = os.path.join(base, filename)
        try:
            # Stream the file so large datasets are never loaded in memory.
            with open(path, "r", encoding="utf-8") as infile:
                count = sum(1 for _ in infile)
        except (OSError, UnicodeError):
            # Unreadable or non-UTF-8 file: still list it, with unknown size.
            count = "?"

        name = os.path.splitext(filename)[0]
        datasets.append((name, path, count))

    return datasets