File size: 4,791 Bytes
a938ec7
 
 
 
 
 
 
 
 
 
 
 
 
4466506
 
 
a938ec7
 
 
 
 
 
4dcf807
a938ec7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dcf807
a938ec7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4466506
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# DEPENDENCIES
import json
from pathlib import Path


# Count actual samples
def count_samples(dir_path: Path) -> int:
    """Count the ``.txt`` sample files stored under *dir_path*.

    Samples may live either directly in *dir_path* or inside per-domain
    subdirectories; ``rglob`` covers both layouts in one pass.

    Args:
        dir_path: Root directory of one evaluation split
                  (e.g. ``evaluation/human``).

    Returns:
        Number of ``.txt`` files found at any depth under *dir_path*,
        or 0 when the directory does not exist.
    """
    # BUG FIX: the previous implementation did `len(list("*.txt"))` for
    # non-directory entries, which iterates the *string* "*.txt" and adds
    # 5 per loose file, inflating the totals.
    if not dir_path.exists():
        return 0

    return sum(1 for _ in dir_path.rglob("*.txt"))


# Per-split sample counts, computed from the files actually on disk so the
# metadata never drifts from the dataset contents.
human_count       = count_samples(Path("evaluation/human"))
ai_count          = count_samples(Path("evaluation/ai_generated"))
paraphrased_count = count_samples(Path("evaluation/adversarial/paraphrased"))
cross_model_count = count_samples(Path("evaluation/adversarial/cross_model"))

# Dataset card serialized to evaluation/metadata.json below.  String values
# here are emitted verbatim into the JSON file — do not edit casually.
metadata          = {"dataset_name"      : "TEXT-AUTH-Eval",
                     "version"           : "1.0",
                     "total_samples"     : human_count + ai_count + paraphrased_count + cross_model_count,
                     "human_samples"     : human_count,
                     "ai_samples"        : ai_count,
                     # Adversarial ("challenge") splits, counted separately.
                     "challenge_samples" : {"paraphrased" : paraphrased_count,
                                            "cross_model" : cross_model_count,
                                           },
                     "domains"           : ["general", "academic", "creative", "ai_ml", "software_dev", "technical_doc", "engineering", "science", "business", "legal", "medical", "journalism", "marketing", "social_media", "blog_personal", "tutorial"],
                     # Provenance of the human-written texts, one source per domain.
                     "human_sources"     : {"general"       : "Wikipedia",
                                            "academic"      : "scientific_papers (arXiv abstracts)",
                                            "creative"      : "Project Gutenberg / C4 filtered",
                                            "ai_ml"         : "scientific_papers (arXiv with ML keywords)",
                                            "software_dev"  : "C4 filtered (code/documentation keywords)",
                                            "technical_doc" : "C4 filtered (documentation keywords)",
                                            "engineering"   : "scientific_papers (arXiv engineering)",
                                            "science"       : "C4 filtered (scientific keywords)",
                                            "business"      : "C4 filtered (business/financial keywords)",
                                            "legal"         : "lex_glue / C4 filtered (legal keywords)",
                                            "medical"       : "scientific_papers (PubMed abstracts)",
                                            "journalism"    : "cnn_dailymail",
                                            "marketing"     : "C4 filtered (marketing keywords)",
                                            "social_media"  : "tweet_eval / C4 filtered (social keywords)",
                                            "blog_personal" : "C4 filtered (personal narrative keywords)",
                                            "tutorial"      : "C4 filtered (tutorial/guide keywords)",
                                           },
                     # Models used to produce the AI splits (all served locally via Ollama).
                     "ai_generation"     : {"primary_model" : "mistral:7b (via Ollama)",
                                            "cross_model"   : "llama3:8b (via Ollama)",
                                            "paraphrasing"  : "mistral:7b (via Ollama instruction-based rephrasing)",
                                           },
                     "notes"             : ["All AI-generated texts produced using local Ollama models to avoid Hugging Face downloads",
                                            "Paraphrased set created by instructing mistral:7b to rephrase original AI texts",
                                            "Cross-model set generated using llama3:8b (unseen during primary AI generation)",
                                            "Human texts sourced exclusively from public, auto-downloadable datasets",
                                           ],
                     "license"           : "CC BY / Public Domain / Fair Use — for research only",
                     "created"           : "2025",
                     "compatible_with"   : "TEXT-AUTH v1.0.0",
                    }


# Save to evaluation directory.
output_path = Path("evaluation/metadata.json")

# Write UTF-8 explicitly: the default platform encoding is not guaranteed to
# handle non-ASCII characters (e.g. the em dash in the license string), and
# ensure_ascii=False keeps those characters readable in the output file.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(obj          = metadata,
              fp           = f,
              indent       = 4,
              ensure_ascii = False,
             )


# Console summary of the dataset composition for the operator.
print(f"metadata.json saved to {output_path}")
print("Dataset Summary:")  # was an f-string with no placeholders (ruff F541)
print(f"   Human:       {human_count}")
print(f"   AI:          {ai_count}")
print(f"   Paraphrased: {paraphrased_count}")
print(f"   Cross-Model: {cross_model_count}")
print(f"   TOTAL:       {human_count + ai_count + paraphrased_count + cross_model_count}")