"""
AIFinder Data Loader

Downloads and parses HuggingFace datasets, extracts assistant responses,
and labels them with is_ai, provider, and model.
"""
import json
import random
import time
from collections import Counter

from datasets import load_dataset
from tqdm import tqdm

from config import (
    DATASET_REGISTRY,
    DEEPSEEK_AM_DATASETS,
)


def _parse_msg(msg):
    """Parse a message that may be a dict or a JSON string."""
    if isinstance(msg, dict):
        return msg
    if isinstance(msg, str):
        try:
            parsed = json.loads(msg)
            if isinstance(parsed, dict):
                return parsed
        except (json.JSONDecodeError, ValueError):
            pass
    return {}
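
# Behavior sketch (the JSON-string form assumes some datasets serialize
# messages as JSON-encoded dicts, which is what this helper normalizes):
#   _parse_msg({"role": "user", "content": "hi"})   -> same dict back
#   _parse_msg('{"role": "gpt", "content": "yo"}')  -> {"role": "gpt", "content": "yo"}
#   _parse_msg("not json")                          -> {}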


def _extract_assistant_texts_from_conversations(rows):
    """Extract assistant message content from conversation datasets.

    These have a 'conversations' or 'messages' column with a list of
    {role, content} dicts (or JSON strings encoding such dicts).
    """
    texts = []
    for row in rows:
        convos = row.get("conversations")
        if convos is None or (hasattr(convos, "__len__") and len(convos) == 0):
            convos = row.get("messages")
        if convos is None or (hasattr(convos, "__len__") and len(convos) == 0):
            convos = []
        parts = []
        for msg in convos:
            msg = _parse_msg(msg)
            role = msg.get("role", "")
            content = msg.get("content", "")
            if role in ("assistant", "gpt", "model") and content:
                parts.append(content)
        if parts:
            texts.append("\n\n".join(parts))
    return texts
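
# Shape sketch (the concrete row below is hypothetical, not from any real
# dataset; field names follow the docstring above):
#   rows = [{"conversations": [
#       {"role": "user", "content": "What is 2+2?"},
#       {"role": "assistant", "content": "4."},
#   ]}]
#   _extract_assistant_texts_from_conversations(rows)  -> ["4."]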


def _extract_from_am_dataset(row):
    """Extract assistant text from a-m-team format (messages list with role/content)."""
    messages = row.get("messages") or row.get("conversations") or []
    parts = []
    for msg in messages:
        role = msg.get("role", "") if isinstance(msg, dict) else ""
        content = msg.get("content", "") if isinstance(msg, dict) else ""
        if role == "assistant" and content:
            parts.append(content)
    return "\n\n".join(parts) if parts else ""


def load_teichai_dataset(dataset_id, provider, model_name, kwargs):
    """Load a single conversation-format dataset and return (texts, providers, models)."""
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]
    try:
        ds = load_dataset(dataset_id, split="train", **load_kwargs)
        rows = list(ds)
    except Exception as e:
        # Fallback: load from the auto-converted parquet export via the HF API
        try:
            import pandas as pd  # only needed for the fallback path

            url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
            df = pd.read_parquet(url)
            rows = df.to_dict(orient="records")
        except Exception as e2:
            print(f" [SKIP] {dataset_id}: {e} / parquet fallback: {e2}")
            return [], [], []
    if max_samples and len(rows) > max_samples:
        random.seed(42)
        rows = random.sample(rows, max_samples)
    texts = _extract_assistant_texts_from_conversations(rows)
    # Filter out empty/too-short texts
    filtered = [(t, provider, model_name) for t in texts if len(t) > 50]
    if not filtered:
        print(f" [SKIP] {dataset_id}: no valid texts extracted")
        return [], [], []
    t, p, m = zip(*filtered)
    return list(t), list(p), list(m)
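
# Usage sketch (the dataset id and kwargs below are illustrative placeholders,
# not entries from DATASET_REGISTRY):
#   texts, provs, models = load_teichai_dataset(
#       "org/some-conversations-dataset", "openai", "gpt-4o",
#       {"max_samples": 1000},
#   )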


def load_am_deepseek_dataset(dataset_id, provider, model_name, kwargs):
    """Load an a-m-team DeepSeek dataset."""
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]
    try:
        ds = load_dataset(dataset_id, split="train", **load_kwargs)
    except Exception as e1:
        # Fallback: stream without the name kwarg so we can stop early once
        # max_samples rows have been collected
        try:
            ds = load_dataset(dataset_id, split="train", streaming=True)
            rows = []
            for row in ds:
                rows.append(row)
                if max_samples and len(rows) >= max_samples:
                    break
        except Exception as e2:
            print(f" [SKIP] {dataset_id}: {e1} / streaming fallback: {e2}")
            return [], [], []
    else:
        rows = list(ds)
        if max_samples and len(rows) > max_samples:
            rows = rows[:max_samples]
    texts = []
    for row in rows:
        text = _extract_from_am_dataset(row)
        if len(text) > 50:
            texts.append(text)
    providers = [provider] * len(texts)
    models = [model_name] * len(texts)
    return texts, providers, models
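
# Note: unlike load_teichai_dataset (which random-samples with a fixed seed),
# this loader truncates to the first max_samples rows, which keeps the
# streaming fallback cheap at the cost of a potentially less uniform sample.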


def load_all_data():
    """Load all datasets and return combined lists.

    Returns:
        texts: list of str
        providers: list of str
        models: list of str
        is_ai: list of int (1=AI, 0=Human)
    """
    all_texts = []
    all_providers = []
    all_models = []

    # TeichAI datasets
    print("Loading TeichAI datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DATASET_REGISTRY, desc="TeichAI"
    ):
        t0 = time.time()
        texts, providers, models = load_teichai_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # DeepSeek a-m-team datasets
    print("\nLoading DeepSeek (a-m-team) datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DEEPSEEK_AM_DATASETS, desc="DeepSeek-AM"
    ):
        t0 = time.time()
        texts, providers, models = load_am_deepseek_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # Build is_ai labels (all AI)
    is_ai = [1] * len(all_texts)

    print(f"\n=== Total: {len(all_texts)} samples ===")
    # Print per-provider counts
    prov_counts = Counter(all_providers)
    for p, c in sorted(prov_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {c}")
    return all_texts, all_providers, all_models, is_ai


if __name__ == "__main__":
    texts, providers, models, is_ai = load_all_data()
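    # Downstream sketch (the output filename is an assumption, not part of
    # this module's contract):
    # import pandas as pd
    # pd.DataFrame({"text": texts, "provider": providers,
    #               "model": models, "is_ai": is_ai}).to_parquet("samples.parquet")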