# DEPENDENCIES
import json
import requests
from tqdm import tqdm
from pathlib import Path
from loguru import logger
# Configuration: model name sent to Ollama, plus the human-written input root
# and the AI-generated output root (both relative to the working directory)
MODEL_NAME = "mistral:7b"
HUMAN_DIR = Path("evaluation", "human")
AI_DIR = Path("evaluation", "ai_generated")
# Sampling settings keyed by domain name. Domains without an entry of their own
# are served by the "default" entry when the parameters are looked up.
GENERATION_PARAMS = {
    "academic": dict(temperature=0.5, top_p=0.85),
    "creative": dict(temperature=0.9, top_p=0.95),
    "legal": dict(temperature=0.3, top_p=0.80),
    "social_media": dict(temperature=0.8, top_p=0.95),
    "blog_personal": dict(temperature=0.8, top_p=0.95),
    "marketing": dict(temperature=0.7, top_p=0.90),
    "journalism": dict(temperature=0.6, top_p=0.85),
    "default": dict(temperature=0.7, top_p=0.9),
}
# Generation prompt for each supported domain, keyed by domain name.
# Each prompt is one string; it is split across lines at sentence boundaries
# purely for readability (adjacent literals are concatenated by the parser).
PROMPTS = {
    "general": (
        "Write a comprehensive 500-1000 word informative article explaining a common topic that would appear in an encyclopedia. "
        "Use clear, neutral language with proper structure."
    ),
    "academic": (
        "Write a formal research abstract (500-1000 words) for a scientific study. "
        "Include: background context, research methodology, key findings, and implications. "
        "Use academic vocabulary and formal tone appropriate for peer-reviewed publication."
    ),
    "creative": (
        "Write a creative narrative passage (500-1000 words) with vivid descriptions, engaging storytelling, and literary devices. "
        "Focus on character, setting, or emotion with rich sensory details."
    ),
    "ai_ml": (
        "Write a technical explanation (500-1000 words) of a machine learning concept or recent AI advancement. "
        "Include mathematical intuition, practical applications, and current research directions. "
        "Use precise technical terminology."
    ),
    "software_dev": (
        "Write developer documentation (500-1000 words) explaining the implementation of a software design pattern or architectural principle. "
        "Include code examples, use cases, trade-offs, and best practices for professional developers."
    ),
    "technical_doc": (
        "Write comprehensive API documentation (500-1000 words) for a REST endpoint. "
        "Include: endpoint description, request/response parameters with data types, authentication requirements, example requests, error codes, and usage notes. "
        "Use Markdown formatting."
    ),
    "engineering": (
        "Write an engineering technical report excerpt (500-1000 words) analyzing a system design or technical solution. "
        "Include specifications, performance analysis, design constraints, and recommendations."
    ),
    "science": (
        "Write a scientific explanation (500-1000 words) of a natural phenomenon or research finding. "
        "Include underlying mechanisms, experimental evidence, and real-world implications. "
        "Use precise scientific terminology."
    ),
    "business": (
        "Write a professional business analysis (500-1000 words) covering market trends, competitive landscape, or strategic insights. "
        "Use business terminology, data-driven arguments, and executive-level language."
    ),
    "legal": (
        "Draft a formal legal document excerpt (500-1000 words) such as a contract clause, terms of service, or policy statement. "
        "Use precise legal terminology, proper structure, and formal language appropriate for legal documents."
    ),
    "medical": (
        "Write a clinical case description or medical research abstract (500-1000 words) with appropriate medical terminology. "
        "Include patient presentation, diagnostic approach, treatment, and outcomes or research methodology and findings."
    ),
    "journalism": (
        "Write a journalistic news article (500-1000 words) in neutral reporting style. "
        "Include a compelling lead, factual reporting, quotes from sources, and balanced coverage. "
        "Follow AP style conventions."
    ),
    "marketing": (
        "Write persuasive marketing content (500-1000 words) for a technology product or service. "
        "Include compelling value propositions, benefit-focused copy, clear calls to action, and engaging language that converts readers."
    ),
    "social_media": (
        "Write 5-7 engaging social media posts (500-1000 words total) discussing a technology trend. "
        "Use informal conversational tone, include relevant hashtags, emojis where appropriate, and encourage engagement. "
        "Mix different post types."
    ),
    "blog_personal": (
        "Write a personal blog post (500-1000 words) sharing personal experiences, opinions, or reflections on a topic. "
        "Use first-person perspective, informal conversational tone, and authentic voice."
    ),
    "tutorial": (
        "Write a comprehensive step-by-step tutorial (500-1000 words) teaching beginners how to accomplish a specific technical task. "
        "Use clear numbered steps, explanatory notes, common pitfalls, and helpful tips."
    ),
}
def generate_with_ollama(prompt: str, domain: str, max_tokens: int = 600, timeout: float = 60) -> str:
    """
    Generate text using Ollama with domain-specific parameters

    Sends one non-streaming request to the local Ollama `/api/generate`
    endpoint. Every failure mode is logged as a warning and reported to the
    caller as an empty string, so a batch run can simply skip the sample.

    Arguments:
    ----------
        prompt     { str }   : Generation prompt

        domain     { str }   : Domain name for parameter lookup; domains without
                               their own entry use the "default" parameters

        max_tokens { int }   : Maximum tokens to generate (sent as `num_predict`)

        timeout    { float } : Seconds allowed for the HTTP request

    Returns:
    --------
        { str } : Generated text with surrounding whitespace stripped
                  (empty string if failed)
    """
    url = "http://localhost:11434/api/generate"
    params = GENERATION_PARAMS.get(domain, GENERATION_PARAMS["default"])
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": params["temperature"],
            "top_p": params["top_p"],
            "num_predict": max_tokens,
        },
    }

    # Guard only the network call, and only against transport-level errors
    # (connection refused, timeout, ...), so programming errors are not
    # silently reported as a failed request.
    try:
        response = requests.post(url=url, json=payload, timeout=timeout)
    except requests.RequestException as e:
        logger.warning(f"Request failed: {e}")
        return ""

    if response.status_code != 200:
        logger.warning(f"Ollama error: {response.status_code}")
        return ""

    # A 200 response with a non-JSON body surfaces as a ValueError subclass
    try:
        result = response.json()
    except ValueError as e:
        logger.warning(f"Request failed: {e}")
        return ""

    # The text is expected under "response"; a missing, null, or non-string
    # value is treated as a failed generation rather than crashing on .strip()
    generated = result.get("response") if isinstance(result, dict) else None
    if not isinstance(generated, str):
        logger.warning("Ollama error: no text in 'response' field")
        return ""

    return generated.strip()
def validate_generated_text(text: str, min_words: int = 100) -> bool:
    """
    Decide whether a generated sample is usable

    A sample passes when it is non-empty and contains at least `min_words`
    whitespace-separated words.

    Arguments:
    ----------
        text      { str } : Generated text

        min_words { int } : Minimum word count

    Returns:
    --------
        { bool }          : True if valid
    """
    has_content = bool(text)

    # Short-circuits on empty/None input, so split() only runs on real text
    return has_content and len(text.split()) >= min_words
def _generate_for_domain(domain_dir: Path) -> tuple:
    """
    Generate one AI sample per human `.txt` file found in a domain directory

    Valid samples are written to `AI_DIR/<domain>/ai_<domain>_<i>.txt`, where
    `i` is the position of the sample in the run; failed positions are skipped,
    so output indices may have gaps.

    Arguments:
    ----------
        domain_dir { Path } : Directory of human samples; its name is the domain
                              and must be a key of PROMPTS

    Returns:
    --------
        { tuple }           : (number generated, number failed)
    """
    domain = domain_dir.name
    (AI_DIR / domain).mkdir(parents=True, exist_ok=True)

    # Only the number of human samples matters: it sets how many AI samples
    # to produce. The human files themselves are never read here.
    sample_count = len(list(domain_dir.glob("*.txt")))
    prompt = PROMPTS[domain]
    generated = 0
    failed = 0

    logger.info(f"\n→ Generating for {domain} ({sample_count} samples)...")

    for i in tqdm(range(sample_count), desc=f" {domain}"):
        ai_text = generate_with_ollama(prompt=prompt, domain=domain)

        # Validate generated text
        if validate_generated_text(ai_text, min_words=100):
            output_path = AI_DIR / domain / f"ai_{domain}_{i}.txt"
            output_path.write_text(ai_text, encoding="utf-8")
            generated += 1
        else:
            logger.warning(f" Failed to generate valid text for {domain}_{i} (skipping)")
            failed += 1

    logger.info(f" {domain}: {generated} generated, {failed} failed")
    return generated, failed


def main():
    """
    Generate AI texts for all domains

    Walks every sub-directory of HUMAN_DIR that has a prompt defined in
    PROMPTS, generates the matching number of AI samples, and prints a
    summary of totals at the end.
    """
    print("=" * 70)
    print("TEXT-AUTH: Generating AI Data")
    print("=" * 70)
    print(f"\nModel: {MODEL_NAME}")
    print("Generation strategy: Domain-specific prompts with adaptive parameters\n")

    total_generated = 0
    total_failed = 0

    # Sorted so the processing order is the same on every run and platform
    for domain_dir in sorted(HUMAN_DIR.iterdir()):
        if not domain_dir.is_dir():
            continue

        domain = domain_dir.name
        if domain not in PROMPTS:
            logger.warning(f"Skipping {domain}: no prompt defined")
            continue

        domain_success, domain_failed = _generate_for_domain(domain_dir)
        total_generated += domain_success
        total_failed += domain_failed

    # Summary
    print("\n" + "=" * 70)
    print("Generation Summary")
    print("=" * 70)
    print(f"Total generated: {total_generated}")
    print(f"Total failed: {total_failed}")
    print("=" * 70)
# Execution: run the generator only when this file is executed as a script
if __name__ == "__main__":
    main()