| | |
| | import json |
| | from pathlib import Path |
| |
|
| |
|
| | |
def count_samples(dir_path: Path) -> int:
    """Count ``*.txt`` sample files in the domain subdirectories of *dir_path*.

    Parameters
    ----------
    dir_path : Path
        Split root whose immediate subdirectories (one per domain) each
        contain ``*.txt`` sample files.

    Returns
    -------
    int
        Total number of ``*.txt`` files across all domain subdirectories,
        or 0 if *dir_path* does not exist.
    """
    # BUG FIX: the original `else` branch did `total += len(list("*.txt"))`,
    # which iterates the *string* "*.txt" and adds 5 for every missing
    # directory. A missing split simply has zero samples.
    if not dir_path.exists():
        return 0

    total = 0
    for domain_dir in dir_path.iterdir():
        # Only immediate subdirectories count as domains; stray files at
        # the split root are ignored.
        if domain_dir.is_dir():
            total += len(list(domain_dir.glob("*.txt")))
    return total
| |
|
| |
|
# --- Gather per-split sample counts -----------------------------------------
human_count = count_samples(Path("evaluation/human"))
ai_count = count_samples(Path("evaluation/ai_generated"))
paraphrased_count = count_samples(Path("evaluation/adversarial/paraphrased"))
cross_model_count = count_samples(Path("evaluation/adversarial/cross_model"))

# Compute the grand total once; it is used both in the metadata payload and
# in the summary printed at the end.
total_count = human_count + ai_count + paraphrased_count + cross_model_count

# --- Build the dataset metadata payload -------------------------------------
metadata = {
    "dataset_name": "TEXT-AUTH-Eval",
    "version": "1.0",
    "total_samples": total_count,
    "human_samples": human_count,
    "ai_samples": ai_count,
    "challenge_samples": {
        "paraphrased": paraphrased_count,
        "cross_model": cross_model_count,
    },
    "domains": [
        "general", "academic", "creative", "ai_ml", "software_dev",
        "technical_doc", "engineering", "science", "business", "legal",
        "medical", "journalism", "marketing", "social_media",
        "blog_personal", "tutorial",
    ],
    "human_sources": {
        "general": "Wikipedia",
        "academic": "scientific_papers (arXiv abstracts)",
        "creative": "Project Gutenberg / C4 filtered",
        "ai_ml": "scientific_papers (arXiv with ML keywords)",
        "software_dev": "C4 filtered (code/documentation keywords)",
        "technical_doc": "C4 filtered (documentation keywords)",
        "engineering": "scientific_papers (arXiv engineering)",
        "science": "C4 filtered (scientific keywords)",
        "business": "C4 filtered (business/financial keywords)",
        "legal": "lex_glue / C4 filtered (legal keywords)",
        "medical": "scientific_papers (PubMed abstracts)",
        "journalism": "cnn_dailymail",
        "marketing": "C4 filtered (marketing keywords)",
        "social_media": "tweet_eval / C4 filtered (social keywords)",
        "blog_personal": "C4 filtered (personal narrative keywords)",
        "tutorial": "C4 filtered (tutorial/guide keywords)",
    },
    "ai_generation": {
        "primary_model": "mistral:7b (via Ollama)",
        "cross_model": "llama3:8b (via Ollama)",
        "paraphrasing": "mistral:7b (via Ollama instruction-based rephrasing)",
    },
    "notes": [
        "All AI-generated texts produced using local Ollama models to avoid Hugging Face downloads",
        "Paraphrased set created by instructing mistral:7b to rephrase original AI texts",
        "Cross-model set generated using llama3:8b (unseen during primary AI generation)",
        "Human texts sourced exclusively from public, auto-downloadable datasets",
    ],
    "license": "CC BY / Public Domain / Fair Use — for research only",
    "created": "2025",
    "compatible_with": "TEXT-AUTH v1.0.0",
}

# --- Write metadata.json -----------------------------------------------------
output_path = Path("evaluation/metadata.json")
# Ensure the target directory exists so a fresh checkout doesn't crash with
# FileNotFoundError on open().
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
    json.dump(metadata, f, indent=4)

# --- Print a human-readable summary ------------------------------------------
print(f"metadata.json saved to {output_path}")
print("Dataset Summary:")
print(f"  Human: {human_count}")
print(f"  AI: {ai_count}")
print(f"  Paraphrased: {paraphrased_count}")
print(f"  Cross-Model: {cross_model_count}")
print(f"  TOTAL: {total_count}")