| | |
| | import json |
| | from pathlib import Path |
| |
|
| |
|
| | |
def count_samples(dir_path: Path) -> int:
    """Count ``*.txt`` sample files in the domain subdirectories of *dir_path*.

    Parameters
    ----------
    dir_path : Path
        Split root whose immediate subdirectories (one per domain) each
        contain ``*.txt`` sample files.

    Returns
    -------
    int
        Total number of ``*.txt`` files across all domain subdirectories,
        or 0 if *dir_path* does not exist.
    """
    # BUG FIX: the original `else` branch did `total += len(list("*.txt"))`,
    # which iterates the *string* "*.txt" and adds 5 for every missing
    # directory. A missing split simply has zero samples.
    if not dir_path.exists():
        return 0

    total = 0
    for domain_dir in dir_path.iterdir():
        # Only immediate subdirectories count as domains; stray files at
        # the split root are ignored.
        if domain_dir.is_dir():
            total += len(list(domain_dir.glob("*.txt")))
    return total
| |
|
| |
|
# --- Gather per-split sample counts -----------------------------------------
human_count = count_samples(Path("evaluation/human"))
ai_count = count_samples(Path("evaluation/ai_generated"))
paraphrased_count = count_samples(Path("evaluation/adversarial/paraphrased"))
cross_model_count = count_samples(Path("evaluation/adversarial/cross_model"))

# Compute the grand total once; it is used both in the metadata payload and
# in the summary printed at the end.
total_count = human_count + ai_count + paraphrased_count + cross_model_count

# --- Build the dataset metadata payload -------------------------------------
metadata = {
    "dataset_name": "TEXT-AUTH-Eval",
    "version": "1.0",
    "total_samples": total_count,
    "human_samples": human_count,
    "ai_samples": ai_count,
    "challenge_samples": {
        "paraphrased": paraphrased_count,
        "cross_model": cross_model_count,
    },
    "domains": [
        "general", "academic", "creative", "ai_ml", "software_dev",
        "technical_doc", "engineering", "science", "business", "legal",
        "medical", "journalism", "marketing", "social_media",
        "blog_personal", "tutorial",
    ],
    "human_sources": {
        "general": "Wikipedia",
        "academic": "scientific_papers (arXiv abstracts)",
        "creative": "Project Gutenberg / C4 filtered",
        "ai_ml": "scientific_papers (arXiv with ML keywords)",
        "software_dev": "C4 filtered (code/documentation keywords)",
        "technical_doc": "C4 filtered (documentation keywords)",
        "engineering": "scientific_papers (arXiv engineering)",
        "science": "C4 filtered (scientific keywords)",
        "business": "C4 filtered (business/financial keywords)",
        "legal": "lex_glue / C4 filtered (legal keywords)",
        "medical": "scientific_papers (PubMed abstracts)",
        "journalism": "cnn_dailymail",
        "marketing": "C4 filtered (marketing keywords)",
        "social_media": "tweet_eval / C4 filtered (social keywords)",
        "blog_personal": "C4 filtered (personal narrative keywords)",
        "tutorial": "C4 filtered (tutorial/guide keywords)",
    },
    "ai_generation": {
        "primary_model": "mistral:7b (via Ollama)",
        "cross_model": "llama3:8b (via Ollama)",
        "paraphrasing": "mistral:7b (via Ollama instruction-based rephrasing)",
    },
    "notes": [
        "All AI-generated texts produced using local Ollama models to avoid Hugging Face downloads",
        "Paraphrased set created by instructing mistral:7b to rephrase original AI texts",
        "Cross-model set generated using llama3:8b (unseen during primary AI generation)",
        "Human texts sourced exclusively from public, auto-downloadable datasets",
    ],
    "license": "CC BY / Public Domain / Fair Use — for research only",
    "created": "2025",
    "compatible_with": "TEXT-AUTH v1.0.0",
}

# --- Write metadata.json -----------------------------------------------------
output_path = Path("evaluation/metadata.json")
# Ensure the target directory exists so a fresh checkout doesn't crash with
# FileNotFoundError on open().
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
    json.dump(metadata, f, indent=4)

# --- Print a human-readable summary ------------------------------------------
print(f"metadata.json saved to {output_path}")
print("Dataset Summary:")
print(f"  Human: {human_count}")
print(f"  AI: {ai_count}")
print(f"  Paraphrased: {paraphrased_count}")
print(f"  Cross-Model: {cross_model_count}")
print(f"  TOTAL: {total_count}")