File size: 9,094 Bytes
a938ec7
 
 
 
 
4466506
a938ec7
 
 
4466506
 
a938ec7
 
 
4466506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a938ec7
4466506
 
a938ec7
 
 
4466506
 
a938ec7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4466506
a938ec7
 
 
4466506
a938ec7
 
 
4466506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a938ec7
4466506
 
 
 
 
 
 
 
 
 
 
 
a938ec7
 
 
 
 
 
 
4466506
a938ec7
 
 
 
4466506
 
 
a938ec7
4466506
a938ec7
4466506
a938ec7
4466506
 
 
a938ec7
4466506
 
 
 
 
a938ec7
4466506
 
 
a938ec7
 
4466506
 
 
 
 
 
 
 
 
 
 
 
 
a938ec7
 
 
 
4466506
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# DEPENDENCIES
import json
import requests
from tqdm import tqdm
from pathlib import Path
from loguru import logger



# Configuration: model requested from Ollama and the dataset directories
MODEL_NAME = "mistral:7b"
HUMAN_DIR = Path("evaluation/human")
AI_DIR = Path("evaluation/ai_generated")

# Sampling options per domain; the "default" entry is the fallback for
# domains that have no entry of their own
GENERATION_PARAMS = {
    "academic": {"temperature": 0.5, "top_p": 0.85},
    "creative": {"temperature": 0.9, "top_p": 0.95},
    "legal": {"temperature": 0.3, "top_p": 0.80},
    "social_media": {"temperature": 0.8, "top_p": 0.95},
    "blog_personal": {"temperature": 0.8, "top_p": 0.95},
    "marketing": {"temperature": 0.7, "top_p": 0.90},
    "journalism": {"temperature": 0.6, "top_p": 0.85},
    "default": {"temperature": 0.7, "top_p": 0.9},
}

# Generation prompt for each content domain, keyed by domain name
# (main() looks prompts up by the name of each human-data sub-directory)
PROMPTS = {
    "general": "Write a comprehensive 500-1000 word informative article explaining a common topic that would appear in an encyclopedia. Use clear, neutral language with proper structure.",
    "academic": "Write a formal research abstract (500-1000 words) for a scientific study. Include: background context, research methodology, key findings, and implications. Use academic vocabulary and formal tone appropriate for peer-reviewed publication.",
    "creative": "Write a creative narrative passage (500-1000 words) with vivid descriptions, engaging storytelling, and literary devices. Focus on character, setting, or emotion with rich sensory details.",
    "ai_ml": "Write a technical explanation (500-1000 words) of a machine learning concept or recent AI advancement. Include mathematical intuition, practical applications, and current research directions. Use precise technical terminology.",
    "software_dev": "Write developer documentation (500-1000 words) explaining the implementation of a software design pattern or architectural principle. Include code examples, use cases, trade-offs, and best practices for professional developers.",
    "technical_doc": "Write comprehensive API documentation (500-1000 words) for a REST endpoint. Include: endpoint description, request/response parameters with data types, authentication requirements, example requests, error codes, and usage notes. Use Markdown formatting.",
    "engineering": "Write an engineering technical report excerpt (500-1000 words) analyzing a system design or technical solution. Include specifications, performance analysis, design constraints, and recommendations.",
    "science": "Write a scientific explanation (500-1000 words) of a natural phenomenon or research finding. Include underlying mechanisms, experimental evidence, and real-world implications. Use precise scientific terminology.",
    "business": "Write a professional business analysis (500-1000 words) covering market trends, competitive landscape, or strategic insights. Use business terminology, data-driven arguments, and executive-level language.",
    "legal": "Draft a formal legal document excerpt (500-1000 words) such as a contract clause, terms of service, or policy statement. Use precise legal terminology, proper structure, and formal language appropriate for legal documents.",
    "medical": "Write a clinical case description or medical research abstract (500-1000 words) with appropriate medical terminology. Include patient presentation, diagnostic approach, treatment, and outcomes or research methodology and findings.",
    "journalism": "Write a journalistic news article (500-1000 words) in neutral reporting style. Include a compelling lead, factual reporting, quotes from sources, and balanced coverage. Follow AP style conventions.",
    "marketing": "Write persuasive marketing content (500-1000 words) for a technology product or service. Include compelling value propositions, benefit-focused copy, clear calls to action, and engaging language that converts readers.",
    "social_media": "Write 5-7 engaging social media posts (500-1000 words total) discussing a technology trend. Use informal conversational tone, include relevant hashtags, emojis where appropriate, and encourage engagement. Mix different post types.",
    "blog_personal": "Write a personal blog post (500-1000 words) sharing personal experiences, opinions, or reflections on a topic. Use first-person perspective, informal conversational tone, and authentic voice.",
    "tutorial": "Write a comprehensive step-by-step tutorial (500-1000 words) teaching beginners how to accomplish a specific technical task. Use clear numbered steps, explanatory notes, common pitfalls, and helpful tips.",
}


def generate_with_ollama(prompt: str, domain: str, max_tokens: int = 600, timeout: float = 60) -> str:
    """
    Generate text using Ollama with domain-specific parameters

    Sends a single non-streaming request to the local Ollama HTTP endpoint and
    returns the stripped "response" field of the JSON reply. Every failure mode
    (network error, non-200 status, undecodable or malformed body) is logged as
    a warning and reported to the caller as an empty string.
    
    Arguments:
    ----------
        prompt     { str }   : Generation prompt

        domain     { str }   : Domain name for parameter lookup; domains without an
                               entry in GENERATION_PARAMS use the "default" entry

        max_tokens { int }   : Maximum tokens to generate (sent as "num_predict")

        timeout    { float } : Seconds to wait for the HTTP response
        
    Returns:
    --------
            { str }          : Generated text (empty string if failed)
    """
    url     = "http://localhost:11434/api/generate"
    params  = GENERATION_PARAMS.get(domain, GENERATION_PARAMS["default"])
    
    payload = {"model"   : MODEL_NAME,
               "prompt"  : prompt,
               "stream"  : False,
               "options" : {"temperature" : params["temperature"],
                            "top_p"       : params["top_p"],
                            "num_predict" : max_tokens,
                           }
              }

    # Guard only the network call, so that programming errors elsewhere are not
    # silently reported as a failed request
    try:
        response = requests.post(url     = url, 
                                 json    = payload, 
                                 timeout = timeout,
                                )

    except requests.RequestException as e:
        logger.warning(f"Request failed: {e}")
        return ""

    if (response.status_code != 200):
        logger.warning(f"Ollama error: {response.status_code}")
        return ""

    # A 200 reply can still carry a body that is not valid JSON
    try:
        result = response.json()

    except ValueError as e:
        logger.warning(f"Request failed: {e}")
        return ""

    # The body must be a JSON object whose "response" field is a string; a null
    # or otherwise unexpected value is treated as a failed generation
    generated = result.get("response", "") if isinstance(result, dict) else None

    if not isinstance(generated, str):
        logger.warning("Request failed: unexpected response payload from Ollama")
        return ""

    return generated.strip()


def validate_generated_text(text: str, min_words: int = 100) -> bool:
    """
    Check that generated text is non-empty and long enough
    
    Arguments:
    ----------
        text      { str } : Generated text

        min_words { int } : Minimum number of whitespace-separated words required
        
    Returns:
    --------
           { bool }       : True when text is non-empty and has at least min_words words
    """
    # Empty (or None) output is always rejected, whatever min_words is
    has_content = bool(text)

    return has_content and (len(text.split()) >= min_words)


def main():
    """
    Generate AI texts for all domains

    For every sub-directory of HUMAN_DIR whose name has an entry in PROMPTS, one
    AI text is requested per "*.txt" human sample. Texts that pass validation are
    written to AI_DIR/<domain>/ai_<domain>_<i>.txt; the rest are counted as failed.
    A summary of generated and failed counts is printed at the end.

    Returns:
    --------
        None
    """
    print("=" * 70)
    print("TEXT-AUTH: Generating AI Data")
    print("=" * 70)
    print(f"\nModel: {MODEL_NAME}")
    print("Generation strategy: Domain-specific prompts with adaptive parameters\n")

    # Without the human corpus there is nothing to mirror; report that clearly
    # instead of failing with a traceback from iterdir()
    if not HUMAN_DIR.is_dir():
        logger.error(f"Human data directory not found: {HUMAN_DIR}")
        return
    
    total_generated = 0
    total_failed    = 0

    # Sorted so that domains are processed in a reproducible order
    for domain_dir in sorted(HUMAN_DIR.iterdir()):
        if not domain_dir.is_dir():
            continue
        
        domain = domain_dir.name
        
        if domain not in PROMPTS:
            logger.warning(f"Skipping {domain}: no prompt defined")
            continue

        output_dir = AI_DIR / domain
        output_dir.mkdir(parents = True, exist_ok = True)

        # Only the number of human samples is used: it decides how many AI
        # texts to generate for this domain
        sample_count    = len(list(domain_dir.glob("*.txt")))
        prompt          = PROMPTS[domain]
        domain_success  = 0
        domain_failed   = 0

        logger.info(f"\n→ Generating for {domain} ({sample_count} samples)...")

        for i in tqdm(range(sample_count), desc = f"  {domain}"):
            ai_text = generate_with_ollama(prompt  = prompt, 
                                           domain  = domain,
                                          )
            
            # Validate generated text
            if validate_generated_text(ai_text, min_words = 100):
                output_path = output_dir / f"ai_{domain}_{i}.txt"
                output_path.write_text(ai_text, encoding = "utf-8")
                
                domain_success  += 1
                total_generated += 1
            
            else:
                logger.warning(f"  Failed to generate valid text for {domain}_{i} (skipping)")
                domain_failed += 1
                total_failed  += 1
        
        logger.info(f"  {domain}: {domain_success} generated, {domain_failed} failed")
    
    # Summary
    print("\n" + "=" * 70)
    print("Generation Summary")
    print("=" * 70)
    print(f"Total generated: {total_generated}")
    print(f"Total failed: {total_failed}")
    print("=" * 70)


# Execution: run the generator only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()