{
  "dataset_name": "TEXT-AUTH-Eval",
  "version": "1.0",
  "total_samples": 7502,
  "human_samples": 781,
  "ai_samples": 781,
  "challenge_samples": {
    "paraphrased": 2500,
    "cross_model": 3440
  },
  "domains": [
    "general",
    "academic",
    "creative",
    "ai_ml",
    "software_dev",
    "technical_doc",
    "engineering",
    "science",
    "business",
    "legal",
    "medical",
    "journalism",
    "marketing",
    "social_media",
    "blog_personal",
    "tutorial"
  ],
  "human_sources": {
    "general": "Wikipedia",
    "academic": "scientific_papers (arXiv abstracts)",
    "creative": "Project Gutenberg / C4 filtered",
    "ai_ml": "scientific_papers (arXiv with ML keywords)",
    "software_dev": "C4 filtered (code/documentation keywords)",
    "technical_doc": "C4 filtered (documentation keywords)",
    "engineering": "scientific_papers (arXiv engineering)",
    "science": "C4 filtered (scientific keywords)",
    "business": "C4 filtered (business/financial keywords)",
    "legal": "lex_glue / C4 filtered (legal keywords)",
    "medical": "scientific_papers (PubMed abstracts)",
    "journalism": "cnn_dailymail",
    "marketing": "C4 filtered (marketing keywords)",
    "social_media": "tweet_eval / C4 filtered (social keywords)",
    "blog_personal": "C4 filtered (personal narrative keywords)",
    "tutorial": "C4 filtered (tutorial/guide keywords)"
  },
  "ai_generation": {
    "primary_model": "mistral:7b (via Ollama)",
    "cross_model": "llama3:8b (via Ollama)",
    "paraphrasing": "mistral:7b (via Ollama instruction-based rephrasing)"
  },
  "notes": [
    "All AI-generated texts produced using local Ollama models to avoid Hugging Face downloads",
    "Paraphrased set created by instructing mistral:7b to rephrase original AI texts",
    "Cross-model set generated using llama3:8b (unseen during primary AI generation)",
    "Human texts sourced exclusively from public, auto-downloadable datasets"
  ],
  "license": "CC BY / Public Domain / Fair Use \u2014 for research only",
  "created": "2025",
  "compatible_with": "TEXT-AUTH v1.0.0"
}