# Spaces: satyaki-mitra / Text_Authenticator (Running)
# File size: 9,094 Bytes

# DEPENDENCIES
import json
import requests
from tqdm import tqdm
from pathlib import Path
from loguru import logger



# Configuration: model tag sent to Ollama and the dataset directories
MODEL_NAME = "mistral:7b"
HUMAN_DIR = Path("evaluation/human")
AI_DIR = Path("evaluation/ai_generated")

# Domain-specific sampling parameters, one (domain, temperature, top_p) row each.
# Domains without a row fall back to the "default" entry at lookup time.
GENERATION_PARAMS = {
    domain: {"temperature": temperature, "top_p": top_p}
    for domain, temperature, top_p in (
        ("academic", 0.5, 0.85),
        ("creative", 0.9, 0.95),
        ("legal", 0.3, 0.80),
        ("social_media", 0.8, 0.95),
        ("blog_personal", 0.8, 0.95),
        ("marketing", 0.7, 0.90),
        ("journalism", 0.6, 0.85),
        ("default", 0.7, 0.9),
    )
}

# Domain-specific generation prompts, keyed by domain name.
# Every prompt asks the model for roughly 500-1000 words in that domain's register.
PROMPTS = {
    "general": "Write a comprehensive 500-1000 word informative article explaining a common topic that would appear in an encyclopedia. Use clear, neutral language with proper structure.",
    "academic": "Write a formal research abstract (500-1000 words) for a scientific study. Include: background context, research methodology, key findings, and implications. Use academic vocabulary and formal tone appropriate for peer-reviewed publication.",
    "creative": "Write a creative narrative passage (500-1000 words) with vivid descriptions, engaging storytelling, and literary devices. Focus on character, setting, or emotion with rich sensory details.",
    "ai_ml": "Write a technical explanation (500-1000 words) of a machine learning concept or recent AI advancement. Include mathematical intuition, practical applications, and current research directions. Use precise technical terminology.",
    "software_dev": "Write developer documentation (500-1000 words) explaining the implementation of a software design pattern or architectural principle. Include code examples, use cases, trade-offs, and best practices for professional developers.",
    "technical_doc": "Write comprehensive API documentation (500-1000 words) for a REST endpoint. Include: endpoint description, request/response parameters with data types, authentication requirements, example requests, error codes, and usage notes. Use Markdown formatting.",
    "engineering": "Write an engineering technical report excerpt (500-1000 words) analyzing a system design or technical solution. Include specifications, performance analysis, design constraints, and recommendations.",
    "science": "Write a scientific explanation (500-1000 words) of a natural phenomenon or research finding. Include underlying mechanisms, experimental evidence, and real-world implications. Use precise scientific terminology.",
    "business": "Write a professional business analysis (500-1000 words) covering market trends, competitive landscape, or strategic insights. Use business terminology, data-driven arguments, and executive-level language.",
    "legal": "Draft a formal legal document excerpt (500-1000 words) such as a contract clause, terms of service, or policy statement. Use precise legal terminology, proper structure, and formal language appropriate for legal documents.",
    "medical": "Write a clinical case description or medical research abstract (500-1000 words) with appropriate medical terminology. Include patient presentation, diagnostic approach, treatment, and outcomes or research methodology and findings.",
    "journalism": "Write a journalistic news article (500-1000 words) in neutral reporting style. Include a compelling lead, factual reporting, quotes from sources, and balanced coverage. Follow AP style conventions.",
    "marketing": "Write persuasive marketing content (500-1000 words) for a technology product or service. Include compelling value propositions, benefit-focused copy, clear calls to action, and engaging language that converts readers.",
    "social_media": "Write 5-7 engaging social media posts (500-1000 words total) discussing a technology trend. Use informal conversational tone, include relevant hashtags, emojis where appropriate, and encourage engagement. Mix different post types.",
    "blog_personal": "Write a personal blog post (500-1000 words) sharing personal experiences, opinions, or reflections on a topic. Use first-person perspective, informal conversational tone, and authentic voice.",
    "tutorial": "Write a comprehensive step-by-step tutorial (500-1000 words) teaching beginners how to accomplish a specific technical task. Use clear numbered steps, explanatory notes, common pitfalls, and helpful tips.",
}


def generate_with_ollama(prompt: str, domain: str, max_tokens: int = 600, timeout: float = 60) -> str:
    """
    Generate text using Ollama with domain-specific parameters

    Sends one non-streaming POST to the local Ollama generate endpoint. Every
    failure mode (transport error, non-200 status, invalid or unexpected JSON)
    is logged as a warning and reported as an empty string, so a batch caller
    can skip the sample and keep going.

    Arguments:
    ----------
        prompt     { str }   : Generation prompt

        domain     { str }   : Domain name for parameter lookup; unknown domains use
                               the "default" entry of GENERATION_PARAMS

        max_tokens { int }   : Maximum tokens to generate (sent as `num_predict`)

        timeout    { float } : Timeout in seconds passed to `requests.post`

    Returns:
    --------
            { str }          : Generated text with surrounding whitespace stripped
                               (empty string if failed)
    """
    # NOTE(review): the prompts in PROMPTS ask for 500-1000 words; a 600-token cap
    # may cut responses short - confirm the default is intended.
    url     = "http://localhost:11434/api/generate"
    params  = GENERATION_PARAMS.get(domain, GENERATION_PARAMS["default"])

    payload = {"model"   : MODEL_NAME,
               "prompt"  : prompt,
               "stream"  : False,
               "options" : {"temperature" : params["temperature"],
                            "top_p"       : params["top_p"],
                            "num_predict" : max_tokens,
                           }
              }

    try:
        response = requests.post(url     = url,
                                 json    = payload,
                                 timeout = timeout,
                                )

    except requests.RequestException as e:
        # Connection refused, timeout, broken transfer, ...
        logger.warning(f"Request failed: {e}")
        return ""

    if (response.status_code != 200):
        logger.warning(f"Ollama error: {response.status_code}")
        return ""

    try:
        result = response.json()

    except ValueError as e:
        # Body was not valid JSON
        logger.warning(f"Request failed: {e}")
        return ""

    # Guard against a non-object body or a null / non-string "response" field,
    # which would otherwise surface as an AttributeError on .get() / .strip()
    generated = result.get("response") if isinstance(result, dict) else None

    if not isinstance(generated, str):
        logger.warning("Ollama returned no usable 'response' text")
        return ""

    return generated.strip()


def validate_generated_text(text: str, min_words: int = 100) -> bool:
    """
    Check that generated text is long enough to keep

    Arguments:
    ----------
        text      { str } : Generated text (empty or None is always rejected)

        min_words { int } : Minimum number of whitespace-separated words required

    Returns:
    --------
           { bool }       : True when the text is non-empty and has at least
                            `min_words` words
    """
    if text:
        # Words are whatever str.split() yields on runs of whitespace
        tokens = text.split()
        return len(tokens) >= min_words

    return False


def main():
    """
    Generate AI texts for all domains

    For each sub-directory of HUMAN_DIR whose name has an entry in PROMPTS, one
    generation is requested per `*.txt` file found there (the human files are
    only counted, their contents are not read). Texts that pass validation with
    at least 100 words are written to `AI_DIR/<domain>/ai_<domain>_<i>.txt`;
    the rest are counted as failed. A summary is printed at the end.

    Raises:
    -------
        OSError : If HUMAN_DIR cannot be listed (e.g. it does not exist); the
                  error is logged with the path and then re-raised
    """
    print("=" * 70)
    print("TEXT-AUTH: Generating AI Data")
    print("=" * 70)
    print(f"\nModel: {MODEL_NAME}")
    print("Generation strategy: Domain-specific prompts with adaptive parameters\n")

    try:
        # Sorted so domains are processed in the same order on every run
        domain_dirs = sorted(path for path in HUMAN_DIR.iterdir() if path.is_dir())

    except OSError as e:
        logger.error(f"Cannot read human data directory '{HUMAN_DIR}': {e}")
        raise

    total_generated = 0
    total_failed    = 0

    for domain_dir in domain_dirs:
        domain = domain_dir.name

        if domain not in PROMPTS:
            logger.warning(f"Skipping {domain}: no prompt defined")
            continue

        output_dir = AI_DIR / domain
        output_dir.mkdir(parents = True, exist_ok = True)

        # Only the number of human samples matters: one AI text per human file
        sample_count   = len(list(domain_dir.glob("*.txt")))
        prompt         = PROMPTS[domain]
        domain_success = 0
        domain_failed  = 0

        logger.info(f"\n→ Generating for {domain} ({sample_count} samples)...")

        for i in tqdm(range(sample_count), desc = f"  {domain}"):
            ai_text = generate_with_ollama(prompt  = prompt,
                                           domain  = domain,
                                          )

            # Validate generated text
            if validate_generated_text(ai_text, min_words = 100):
                output_path = output_dir / f"ai_{domain}_{i}.txt"
                output_path.write_text(ai_text, encoding = "utf-8")

                domain_success  += 1
                total_generated += 1

            else:
                logger.warning(f"  Failed to generate valid text for {domain}_{i} (skipping)")
                domain_failed += 1
                total_failed  += 1

        logger.info(f"  {domain}: {domain_success} generated, {domain_failed} failed")

    # Summary
    print("\n" + "=" * 70)
    print("Generation Summary")
    print("=" * 70)
    print(f"Total generated: {total_generated}")
    print(f"Total failed: {total_failed}")
    print("=" * 70)


# Execution: run the generation only when this file is executed as a script
if __name__ == "__main__":
    main()