File size: 10,746 Bytes
01ae771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""
DeepSeek Children's Stories Text Generation
Generate children's stories using the trained DeepSeek model
"""

import os
import sys
import argparse
import torch
import tiktoken
from typing import List, Optional

# Add the src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from model.deepseek import DeepSeek, DeepSeekConfig

# Allowlist DeepSeekConfig for safe deserialization
torch.serialization.add_safe_globals([DeepSeekConfig])

class DeepSeekStoryGenerator:
    """Generate children's stories from a trained DeepSeek checkpoint.

    Handles model loading, structured prompt encoding with the project's
    special story tokens, sampling, and extraction of the story text from
    the raw decoded output.
    """

    def __init__(self, model_path: str, device: str = 'auto'):
        """Initialize the story generator.

        Args:
            model_path: Path to a torch checkpoint containing 'config' and
                'model' entries (see _load_model).
            device: 'auto' (pick CUDA when available), 'cuda', or 'cpu'.
        """
        self.device = self._get_device(device)
        self.model = self._load_model(model_path)
        # GPT-2 BPE; must match the encoding used when the model was trained.
        self.tokenizer = tiktoken.get_encoding("gpt2")

        # Structural markers embedded in the training data; generated output
        # is parsed against these in _extract_story().
        self.special_tokens = {
            "story_start": "<|story|>",
            "story_end": "</|story|>",
            "prompt_start": "<|prompt|>",
            "prompt_end": "</|prompt|>",
            "moral_start": "<|moral|>",
            "moral_end": "</|moral|>",
            "character_start": "<|character|>",
            "character_end": "</|character|>"
        }

    def _get_device(self, device: str) -> str:
        """Resolve 'auto' to 'cuda' when available (else 'cpu'); pass through otherwise."""
        if device == 'auto':
            return 'cuda' if torch.cuda.is_available() else 'cpu'
        return device

    def _load_model(self, model_path: str) -> DeepSeek:
        """Load the trained model onto self.device and put it in eval mode.

        The checkpoint must contain a 'config' entry (model configuration)
        and a 'model' state dict, optionally saved from a torch.compile()'d
        module.

        Raises:
            KeyError: if the checkpoint lacks 'config' or 'model' entries.
        """
        print(f"Loading model from {model_path}...")

        # NOTE(review): weights_only=False unpickles arbitrary objects and is
        # unsafe for untrusted checkpoints. DeepSeekConfig is already
        # allow-listed at module import time, so weights_only=True may work
        # here -- confirm against a real checkpoint before switching.
        checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)

        # Rebuild the architecture from the stored configuration.
        config = checkpoint['config']
        model = DeepSeek(config)

        # torch.compile() stores parameters under an '_orig_mod.' prefix;
        # strip it so the keys match the uncompiled module.
        state_dict = checkpoint['model']
        if all(k.startswith('_orig_mod.') for k in state_dict.keys()):
            state_dict = {k[len('_orig_mod.'):]: v for k, v in state_dict.items()}

        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()

        print("Model loaded successfully!")
        print(f"Model configuration: {config.n_layer}L/{config.n_head}H/{config.n_embd}D")
        print(f"Device: {self.device}")

        return model

    def encode_prompt(self, prompt: str, character: Optional[str] = None) -> torch.Tensor:
        """Encode a prompt (and optional character) into a (1, T) token tensor.

        The text is lower-cased and wrapped in the structural markers the
        model was trained on, ending with the story-start marker so the model
        continues with story text.
        """
        full_prompt = f"{self.special_tokens['prompt_start']} {prompt.lower()} {self.special_tokens['prompt_end']}"

        if character:
            full_prompt += f" {self.special_tokens['character_start']} {character.lower()} {self.special_tokens['character_end']}"

        full_prompt += f" {self.special_tokens['story_start']}"

        # encode_ordinary treats the markers as plain text (ordinary BPE
        # pieces) rather than tiktoken special tokens.
        token_ids = self.tokenizer.encode_ordinary(full_prompt)
        return torch.tensor([token_ids], dtype=torch.long, device=self.device)

    def generate_story(self, prompt: str, character: Optional[str] = None,
                      max_tokens: int = 200, temperature: float = 0.8,
                      top_k: int = 40, top_p: float = 0.9) -> str:
        """Generate a single children's story.

        Args:
            prompt: Free-text story prompt.
            character: Optional character name woven into the prompt.
            max_tokens: Maximum number of new tokens to sample.
            temperature: Sampling temperature.
            top_k: Top-k sampling cutoff.
            top_p: Accepted for interface compatibility but currently unused;
                the underlying model.generate() call only takes top_k.

        Returns:
            The extracted story text.
        """
        print(f"Generating story for prompt: '{prompt}'")
        if character:
            print(f"Character: {character}")

        # Encode prompt
        input_ids = self.encode_prompt(prompt, character)

        # Inference only -- no gradients needed.
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k
            )

        # Decode the generated token ids back to text.
        generated_text = self.tokenizer.decode(generated_ids[0].tolist())

        # Extract the story part
        return self._extract_story(generated_text)

    def _extract_story(self, text: str) -> str:
        """Extract the story body from raw generated text.

        Returns the span between the story start/end markers. When generation
        was cut off before the end marker was produced, returns everything
        after the start marker (previously this case fell through to the
        prompt fallback and leaked special tokens into the returned story).
        With no start marker at all, falls back to the text after the prompt
        block, or the whole text.
        """
        start_tok = self.special_tokens['story_start']
        end_tok = self.special_tokens['story_end']
        story_start = text.find(start_tok)
        story_end = text.find(end_tok)

        if story_start != -1:
            begin = story_start + len(start_tok)
            # Guard against story_end == -1: generation frequently hits
            # max_new_tokens before emitting the end marker.
            end = story_end if story_end != -1 else len(text)
            return text[begin:end].strip()

        # Fallback: everything after the prompt block, else the raw text.
        prompt_end = text.find(self.special_tokens['prompt_end'])
        if prompt_end != -1:
            return text[prompt_end + len(self.special_tokens['prompt_end']):].strip()
        return text.strip()

    def generate_multiple_stories(self, prompts: List[str], num_stories: int = 3,
                                **kwargs) -> List[str]:
        """Generate one story per prompt.

        Args:
            prompts: Prompts to generate from, one story each.
            num_stories: Accepted for interface compatibility; the number of
                stories generated is determined by len(prompts), not by this
                value.
            **kwargs: Forwarded to generate_story() (character, max_tokens,
                temperature, top_k, top_p).

        Returns:
            The generated stories, in prompt order.
        """
        stories = []

        for i, prompt in enumerate(prompts):
            print(f"\nGenerating story {i+1}/{len(prompts)}...")
            story = self.generate_story(prompt, **kwargs)
            stories.append(story)

        return stories

    def interactive_generation(self):
        """Run a REPL that prompts the user for story parameters until 'quit'."""
        print("DeepSeek Children's Stories - Interactive Mode")
        print("Type 'quit' to exit")
        print("-" * 50)

        while True:
            try:
                # Get prompt from user
                prompt = input("\nEnter a story prompt: ").strip()

                if prompt.lower() in ['quit', 'exit', 'q']:
                    print("Goodbye!")
                    break

                if not prompt:
                    print("Please enter a valid prompt.")
                    continue

                # Optional character name; empty input means none.
                character = input("Enter a character name (optional): ").strip()
                if not character:
                    character = None

                # Fall back to defaults on any malformed numeric input.
                try:
                    max_tokens = int(input("Max tokens (default 200): ") or "200")
                    temperature = float(input("Temperature (default 0.8): ") or "0.8")
                except ValueError:
                    max_tokens = 200
                    temperature = 0.8

                # Generate story
                story = self.generate_story(
                    prompt,
                    character=character,
                    max_tokens=max_tokens,
                    temperature=temperature
                )

                # Display story
                print("\n" + "="*50)
                print("GENERATED STORY:")
                print("="*50)
                print(story)
                print("="*50)

            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                # Keep the REPL alive on generation failures.
                print(f"Error generating story: {e}")


def _parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for story generation."""
    parser = argparse.ArgumentParser(description='Generate children\'s stories with DeepSeek')

    # Model configuration
    parser.add_argument('--model-path', type=str, default='checkpoints/best_model.pt',
                       help='Path to the trained model checkpoint')
    parser.add_argument('--device', type=str, default='auto',
                       help='Device to use (auto, cuda, cpu)')

    # Generation parameters
    parser.add_argument('--prompt', type=str, help='Story prompt')
    parser.add_argument('--character', type=str, help='Character name')
    parser.add_argument('--max-tokens', type=int, default=200, help='Maximum tokens to generate')
    parser.add_argument('--temperature', type=float, default=0.8, help='Sampling temperature')
    parser.add_argument('--top-k', type=int, default=40, help='Top-k sampling')
    parser.add_argument('--top-p', type=float, default=0.9, help='Top-p sampling')

    # Multiple generation
    parser.add_argument('--num-stories', type=int, default=1, help='Number of stories to generate')
    parser.add_argument('--interactive', action='store_true', help='Interactive mode')

    return parser.parse_args()


def main():
    """Main generation function: parse CLI arguments and dispatch to the right mode."""
    args = _parse_args()

    # Bail out early when the checkpoint is missing.
    if not os.path.exists(args.model_path):
        print(f"Error: Model file not found at {args.model_path}")
        print("Please train the model first or specify the correct path.")
        return

    generator = DeepSeekStoryGenerator(args.model_path, args.device)

    if args.interactive:
        # Interactive REPL mode.
        generator.interactive_generation()
        return

    if not args.prompt:
        print("Please provide a prompt or use --interactive mode.")
        print("Example: python generate.py --prompt 'A brave little mouse' --character 'Mickey'")
        return

    # Sampling options shared by single and multiple generation.
    sampling = dict(
        character=args.character,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
    )

    if args.num_stories == 1:
        # Single story
        story = generator.generate_story(args.prompt, **sampling)

        print(f"\nPrompt: {args.prompt}")
        if args.character:
            print(f"Character: {args.character}")
        print("\n" + "="*50)
        print("GENERATED STORY:")
        print("="*50)
        print(story)
        print("="*50)
    else:
        # Multiple stories: repeat the same prompt num_stories times.
        repeated_prompts = [args.prompt] * args.num_stories
        stories = generator.generate_multiple_stories(
            repeated_prompts,
            num_stories=args.num_stories,
            **sampling
        )

        for idx, story in enumerate(stories):
            print(f"\nStory {idx+1}:")
            print("="*50)
            print(story)
            print("="*50)


if __name__ == "__main__":
    main()