Spaces:
Sleeping
Sleeping
| from torch.utils.data import Dataset | |
| import os | |
| import json | |
| import random | |
| import hashlib | |
def stable_long_hash(input_string):
    """Return a deterministic, non-negative 63-bit integer hash of a string.

    The string is encoded with the default ``str.encode()`` codec, hashed
    with SHA-256, and the digest (read as a big-endian integer) is reduced
    to its low 63 bits. Unlike the builtin ``hash``, the result does not
    vary between interpreter runs.

    Args:
        input_string: Text to hash.

    Returns:
        An ``int`` in the range ``[0, 2**63)``.
    """
    digest = hashlib.sha256(input_string.encode()).digest()
    # Keep only the low 63 bits so the value stays non-negative and fits
    # in a signed 64-bit integer.
    return int.from_bytes(digest, "big") & 0x7FFF_FFFF_FFFF_FFFF
# Writer-model ids selected by ``load_dataset`` for the "falconset" dataset.
# Ids are 1-based and follow the order of the names below, so the order is
# significant; ``load_dataset`` stores 0 in that slot for human-written text.
model_map_authscan = {
    name: model_id
    for model_id, name in enumerate(
        (
            "gpt-4o-mini-text",
            "gemini-2.0-text",
            "deepseek-text",
            "llama-text",
        ),
        start=1,
    )
}
# Writer-model ids selected by ``load_dataset`` for the "llmdetectaive"
# dataset. Ids are 1-based and follow the order of the names below, so the
# order is significant.
model_map_llmdetectaive = {
    name: model_id
    for model_id, name in enumerate(
        (
            "gemma-text",
            "mixtral-text",
            "llama3-text",
        ),
        start=1,
    )
}
# Writer-model ids selected by ``load_dataset`` for the "hart" dataset.
# Ids are 1-based and follow the order of the names below, so the order is
# significant.
model_map_hart = {
    name: model_id
    for model_id, name in enumerate(
        (
            "claude-text",
            "gemini-text",
            "gpt-text",
        ),
        start=1,
    )
}
def load_dataset(dataset_name, path=None):
    """Load an in-domain corpus laid out as ``<path>/<source>/<split>.jsonl``.

    Every immediate sub-directory of ``path`` names the source of its texts:

    * ``human-text``       -- human-written text,
    * ``human---<model>``  -- human text mixed with output of ``<model>``,
    * anything else        -- the directory name is taken as the writer model.

    Inside a source directory, each ``train.jsonl`` / ``valid.jsonl`` /
    ``test.jsonl`` file holds one JSON object per line with a ``text`` key.
    Files without the ``.jsonl`` suffix, non-directory entries under ``path``
    and blank lines are ignored.

    Args:
        dataset_name: ``"falconset"``, ``"llmdetectaive"`` or ``"hart"``;
            selects the module-level table that maps model names to ids.
        path: Root directory of the corpus. Required in practice: the
            ``None`` default is kept only for signature compatibility.

    Returns:
        A dict with the keys ``"train"``, ``"valid"`` and ``"test"``. Each
        value is a list of dicts with the keys ``text``, ``label``,
        ``label_detailed`` and ``index`` (inserted in that order). ``index``
        is ``(1, 0, 0)`` for human text, ``(1, 1, model_id)`` for mixed text
        and ``(0, 1000, model_id)`` for AI text.

    Raises:
        ValueError: A model id is needed but ``dataset_name`` is not one of
            the supported names.
        KeyError: A model directory is not listed in the selected table.
        AssertionError: A ``.jsonl`` file is not named after a known split.
    """
    dataset = {
        "train": [],
        "valid": [],
        "test": [],
    }

    if dataset_name == "falconset":
        model_map = model_map_authscan
    elif dataset_name == "llmdetectaive":
        model_map = model_map_llmdetectaive
    elif dataset_name == "hart":
        model_map = model_map_hart
    else:
        # Resolved lazily: a corpus that only contains ``human-text`` never
        # needs the table, so an unknown name is only an error on lookup.
        model_map = None

    def model_id(model):
        if model_map is None:
            raise ValueError(
                f"unknown dataset_name {dataset_name!r}; expected "
                "'falconset', 'llmdetectaive' or 'hart'"
            )
        return model_map[model]

    for sub in os.listdir(path):
        sub_path = os.path.join(path, sub)
        if not os.path.isdir(sub_path):
            # Stray files next to the source folders are not data.
            continue
        for file in os.listdir(sub_path):
            if not file.endswith('.jsonl'):
                continue
            key_name = file.split('.')[0]
            assert key_name in dataset.keys(), f'{key_name} is not in dataset.keys()'
            file_path = os.path.join(sub_path, file)
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(file_path, 'r', encoding='utf-8') as f:
                records = [json.loads(line) for line in f if line.strip()]
            for record in records:
                if sub == "human-text":
                    label = detailed = "human"
                    index = (1, 0, 0)
                elif sub.startswith("human---"):
                    detailed = sub.split("---")[1]
                    label = "human+AI"
                    index = (1, 1, model_id(detailed))
                else:
                    label = "AI"
                    detailed = sub
                    # ``10 ** 3`` (1000), not ``10 ^ 3``: the latter is a
                    # bitwise XOR that evaluates to 9.
                    index = (0, 10 ** 3, model_id(sub))
                dataset[key_name].append({
                    'text': record['text'],
                    'label': label,
                    'label_detailed': detailed,
                    'index': index,
                })
    return dataset
def load_outdomain_dataset(path):
    """Load an out-of-domain corpus laid out as ``<path>/<source>/<split>.jsonl``.

    Every immediate sub-directory of ``path`` names the source of its texts:

    * ``human-text``       -- human-written text,
    * ``human---<model>``  -- human text mixed with output of ``<model>``,
    * anything else        -- the directory name is taken as the writer model.

    Inside a source directory, each ``valid.jsonl`` / ``test.jsonl`` file
    holds one JSON object per line with a ``text`` key. Files without the
    ``.jsonl`` suffix, non-directory entries under ``path`` and blank lines
    are ignored. No model-id table is consulted here, so ``index`` has only
    two fields.

    Args:
        path: Root directory of the corpus.

    Returns:
        A dict with the keys ``"valid"`` and ``"test"``. Each value is a list
        of dicts with the keys ``text``, ``label``, ``label_detailed`` and
        ``index`` (inserted in that order). ``index`` is ``(1, 0)`` for human
        text, ``(1, 1)`` for mixed text and ``(0, 1000)`` for AI text.

    Raises:
        AssertionError: A ``.jsonl`` file is not named after a known split.
    """
    dataset = {
        "valid": [],
        "test": [],
    }
    for sub in os.listdir(path):
        sub_path = os.path.join(path, sub)
        if not os.path.isdir(sub_path):
            # Stray files next to the source folders are not data.
            continue

        # The labelling depends only on the directory name, so work it out
        # once per source instead of once per record.
        if sub == "human-text":
            label = detailed = "human"
            index = (1, 0)
        elif sub.startswith("human---"):
            label = "human+AI"
            detailed = sub.split("---")[1]
            index = (1, 1)
        else:
            label = "AI"
            detailed = sub
            # ``10 ** 3`` (1000), not ``10 ^ 3``: the latter is a bitwise
            # XOR that evaluates to 9.
            index = (0, 10 ** 3)

        for file in os.listdir(sub_path):
            if not file.endswith('.jsonl'):
                continue
            key_name = file.split('.')[0]
            assert key_name in dataset.keys(), f'{key_name} is not in dataset.keys()'
            file_path = os.path.join(sub_path, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                records = [json.loads(line) for line in f if line.strip()]
            dataset[key_name].extend(
                {
                    'text': record['text'],
                    'label': label,
                    'label_detailed': detailed,
                    'index': index,
                }
                for record in records
            )
    return dataset
def load_dataset_conditional_lang(path=None, language='vi', seed=42, model_map=None):
    """Load one language of a corpus and split it 90/5/5 into train/val/test.

    ``path`` is expected to contain source sub-directories. A directory named
    ``human`` holds human-written text, ``human+AI`` holds mixed text, and any
    other directory is treated as AI-written text. Within each directory only
    ``.jsonl`` files whose name contains ``language`` are read. Each line is a
    JSON object; objects without a ``content`` key are reported and skipped.
    For non-human sources the writer model comes from the record's
    ``label_detailed`` field (the part after ``+`` for mixed text).

    Args:
        path: Root directory of the corpus. Required in practice: the
            ``None`` default is kept only for signature compatibility.
        language: Substring a file name must contain to be loaded.
        seed: Seed passed to ``random.seed`` before shuffling. Note that this
            reseeds the global ``random`` generator.
        model_map: Mapping from model name to integer id. When omitted, a
            module-level ``model_map`` is used if one exists.

    Returns:
        A dict with the keys ``"train"``, ``"val"`` and ``"test"``. Each value
        is a list of dicts with the keys ``text``, ``label``,
        ``label_detailed`` and ``index`` (inserted in that order). ``index``
        is ``(1, 0, 0)`` for human text, ``(1, 1, model_id)`` for mixed text
        and ``(0, 1000, model_id)`` for AI text.

    Raises:
        ValueError: A model id is needed but no ``model_map`` is available.
        KeyError: A model name is missing from ``model_map``, or a non-human
            record has no ``label_detailed`` field.
    """
    if model_map is None:
        # The original body read a bare ``model_map`` name that this function
        # never defined; keep honouring a module-level table if one exists.
        model_map = globals().get("model_map")

    def model_id(model):
        if model_map is None:
            raise ValueError(
                "a model id is required but no model_map was given; pass "
                "model_map={model_name: id, ...}"
            )
        return model_map[model]

    dataset = {
        "train": [],
        "val": [],
        "test": [],
    }
    combined_data = []
    random.seed(seed)  # for reproducibility

    # ``os.listdir`` returns entries in arbitrary order; sorting fixes the
    # pre-shuffle order so the same seed always yields the same split.
    folder = sorted(os.listdir(path))
    print("Subfolders:", folder)
    for sub in folder:
        sub_path = os.path.join(path, sub)
        if not os.path.isdir(sub_path):
            continue
        for file in sorted(os.listdir(sub_path)):
            if not file.endswith('.jsonl') or language not in file:
                continue
            file_path = os.path.join(sub_path, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = [json.loads(line) for line in f if line.strip()]
            for entry in data:
                if 'content' not in entry:
                    print("Key does not exist!")
                    continue
                if sub == "human":
                    label = detailed = "human"
                    index = (1, 0, 0)
                elif sub == "human+AI":
                    detailed = entry['label_detailed'].split("+")[1]
                    label = "human+AI"
                    index = (1, 1, model_id(detailed))
                else:
                    label = "AI"
                    detailed = entry['label_detailed']
                    index = (0, 10 ** 3, model_id(detailed))
                combined_data.append({
                    'text': entry['content'],
                    'label': label,
                    'label_detailed': detailed,
                    'index': index,
                })

    random.shuffle(combined_data)
    total = len(combined_data)
    train_end = int(total * 0.9)
    val_end = train_end + int(total * 0.05)
    dataset['train'] = combined_data[:train_end]
    dataset['val'] = combined_data[train_end:val_end]
    dataset['test'] = combined_data[val_end:]
    print(f"Total: {total} | Train: {len(dataset['train'])} | Val: {len(dataset['val'])} | Test: {len(dataset['test'])}")
    return dataset
class TextDataset(Dataset):
    """Map-style dataset over a list of record dicts.

    Each record must provide a ``text`` entry and an ``index`` tuple, which is
    ``(label, is_mixed)`` when ``out_domain`` is truthy and
    ``(label, is_mixed, write_model)`` otherwise. Records are read by key, so
    extra keys and key order do not matter.

    Args:
        dataset: Indexable collection of record dicts (one split of a loader
            result, for example).
        need_ids: When true, each item is prefixed with a stable integer id
            derived from the text via ``stable_long_hash``.
        out_domain: Truthy when records carry the two-field ``index``.
    """

    def __init__(self, dataset, need_ids=True, out_domain=0):
        self.dataset = dataset
        self.need_ids = need_ids
        self.out_domain = out_domain

    def get_class(self):
        """Return the classes of this dataset.

        An externally assigned ``classes`` attribute is returned as-is.
        Nothing in this class assigns it, so the fallback is the sorted
        distinct ``label`` values of the records.
        NOTE(review): the fallback replaces an unconditional AttributeError;
        confirm that it matches what callers expect.
        """
        assigned = getattr(self, "classes", None)
        if assigned is not None:
            return assigned
        return sorted({record["label"] for record in self.dataset})

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``([id,] text, label, is_mixed[, write_model])`` for one record."""
        record = self.dataset[idx]
        text = record["text"]
        index = record["index"]

        if self.out_domain:
            label, is_mixed = index
            fields = (text, int(label), int(is_mixed))
        else:
            label, is_mixed, write_model = index
            fields = (text, int(label), int(is_mixed), int(write_model))

        if self.need_ids:
            # Hash only when the id is actually returned; SHA-256 over every
            # text is wasted work otherwise.
            return (int(stable_long_hash(text)),) + fields
        return fields