| { | |
| "dataset_name": "TEXT-AUTH-Eval", | |
| "version": "1.0", | |
| "total_samples": 7502, | |
| "human_samples": 781, | |
| "ai_samples": 781, | |
| "challenge_samples": { | |
| "paraphrased": 2500, | |
| "cross_model": 3440 | |
| }, | |
| "domains": [ | |
| "general", | |
| "academic", | |
| "creative", | |
| "ai_ml", | |
| "software_dev", | |
| "technical_doc", | |
| "engineering", | |
| "science", | |
| "business", | |
| "legal", | |
| "medical", | |
| "journalism", | |
| "marketing", | |
| "social_media", | |
| "blog_personal", | |
| "tutorial" | |
| ], | |
| "human_sources": { | |
| "general": "Wikipedia", | |
| "academic": "scientific_papers (arXiv abstracts)", | |
| "creative": "Project Gutenberg / C4 filtered", | |
| "ai_ml": "scientific_papers (arXiv with ML keywords)", | |
| "software_dev": "C4 filtered (code/documentation keywords)", | |
| "technical_doc": "C4 filtered (documentation keywords)", | |
| "engineering": "scientific_papers (arXiv engineering)", | |
| "science": "C4 filtered (scientific keywords)", | |
| "business": "C4 filtered (business/financial keywords)", | |
| "legal": "lex_glue / C4 filtered (legal keywords)", | |
| "medical": "scientific_papers (PubMed abstracts)", | |
| "journalism": "cnn_dailymail", | |
| "marketing": "C4 filtered (marketing keywords)", | |
| "social_media": "tweet_eval / C4 filtered (social keywords)", | |
| "blog_personal": "C4 filtered (personal narrative keywords)", | |
| "tutorial": "C4 filtered (tutorial/guide keywords)" | |
| }, | |
| "ai_generation": { | |
| "primary_model": "mistral:7b (via Ollama)", | |
| "cross_model": "llama3:8b (via Ollama)", | |
| "paraphrasing": "mistral:7b (via Ollama instruction-based rephrasing)" | |
| }, | |
| "notes": [ | |
| "All AI-generated texts produced using local Ollama models to avoid Hugging Face model downloads", | |
| "Paraphrased set created by instructing mistral:7b to rephrase original AI texts", | |
| "Cross-model set generated using llama3:8b (a model not used for primary AI generation)", | |
| "Human texts sourced exclusively from public, auto-downloadable datasets" | |
| ], | |
| "license": "CC BY / Public Domain / Fair Use \u2014 for research only", | |
| "created": "2025", | |
| "compatible_with": "TEXT-AUTH v1.0.0" | |
| } |