""" SFT Data Factory: Self-Instruct Pipeline with LLM-as-a-Judge SOTA References: - Self-Instruct (Wang et al., 2022) - UltraChat (Ding et al., 2023) - Alpaca (Stanford, 2023) This module generates high-quality instruction-following data for fine-tuning a Literary Critic persona into the model. """ import json import random from typing import List, Dict, Tuple, Optional from pathlib import Path from src.core.rag.llm import LLMFactory from src.utils import setup_logger logger = setup_logger(__name__) class SFTDataGenerator: """ Generates (Query, Response) pairs from raw reviews using Self-Instruct. """ def __init__(self, provider: str = "openai", api_key: str = None): self.llm = LLMFactory.create(provider=provider, api_key=api_key, temperature=0.7) def _sample_seed_reviews(self, path: str, n: int = 100, min_length: int = 200) -> List[Dict]: """Sample high-quality seed reviews.""" seeds = [] with open(path, 'r', encoding='utf-8') as f: for line in f: parts = line.strip().split(' ', 1) if len(parts) < 2: continue isbn, review = parts[0], parts[1] if len(review) >= min_length: seeds.append({"isbn": isbn, "review": review}) # Random sample if len(seeds) > n: seeds = random.sample(seeds, n) logger.info(f"Sampled {len(seeds)} seed reviews") return seeds def _evolve_instruction(self, review: str) -> Optional[str]: """ Self-Instruct Step 1: Generate a user question that would prompt this review. """ prompt = f"""You are helping create training data for a book recommendation AI. Given this enthusiastic book review, generate a realistic USER QUESTION that would have prompted such a recommendation. The question should be natural, like what a real person would type into a book search. REVIEW: \"\"\"{review[:500]}\"\"\" Generate ONLY the user question, nothing else. Be creative and natural.""" try: response = self.llm.invoke(prompt) return response.content.strip().strip('"') except Exception as e: logger.error(f"Instruction evolution failed: {e}") return None def _transform_response(self, review: str, query: str) -> Optional[str]: """ Self-Instruct Step 2: Transform review into AI assistant response style. """ prompt = f"""You are a passionate Literary Critic AI assistant. A user asked: "{query}" A human reviewer wrote this response: \"\"\"{review[:600]}\"\"\" Rewrite this as YOUR response to the user. Keep the emotional depth, specific evidence, and critical insight. But speak as a helpful AI book concierge, not as a random reviewer. Your response (be enthusiastic but professional):""" try: response = self.llm.invoke(prompt) return response.content.strip() except Exception as e: logger.error(f"Response transformation failed: {e}") return None def generate_dataset( self, review_path: str, output_path: str, n_samples: int = 100 ) -> int: """ Main pipeline: Generate SFT dataset. Returns: Number of successfully generated samples. 
""" seeds = self._sample_seed_reviews(review_path, n=n_samples * 2) # Over-sample dataset = [] for seed in seeds: if len(dataset) >= n_samples: break # Step 1: Evolve instruction query = self._evolve_instruction(seed["review"]) if not query: continue # Step 2: Transform response response = self._transform_response(seed["review"], query) if not response: continue dataset.append({ "instruction": query, "input": "", # No additional input for simple QA "output": response, "source_isbn": seed["isbn"] }) if len(dataset) % 10 == 0: logger.info(f"Generated {len(dataset)} / {n_samples} samples") # Save Path(output_path).parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w', encoding='utf-8') as f: for item in dataset: f.write(json.dumps(item, ensure_ascii=False) + '\n') logger.info(f"Saved {len(dataset)} samples to {output_path}") return len(dataset) class LLMJudge: """ Quality filter using LLM-as-a-Judge pattern. Scores generated dialogues on multiple dimensions. """ def __init__(self, provider: str = "openai", api_key: str = None): self.llm = LLMFactory.create(provider=provider, api_key=api_key, temperature=0.1) def score(self, query: str, response: str) -> Dict: """ Score a (query, response) pair on multiple dimensions. Returns: {"empathy": int, "specificity": int, "critique_depth": int, "avg": float} """ prompt = f"""You are evaluating the quality of an AI book recommendation response. USER QUESTION: "{query}" AI RESPONSE: \"\"\"{response}\"\"\" Rate the response on these dimensions (1-10 each): 1. EMPATHY: Does it understand and connect with what the user is looking for? 2. SPECIFICITY: Does it mention concrete details (plot points, themes, comparisons)? 3. CRITIQUE_DEPTH: Does it offer genuine literary insight, not just generic praise? Respond in JSON format ONLY: {{"empathy": X, "specificity": Y, "critique_depth": Z}}""" try: result = self.llm.invoke(prompt) # Parse JSON from response import re match = re.search(r'\{.*\}', result.content, re.DOTALL) if match: scores = json.loads(match.group()) scores["avg"] = (scores["empathy"] + scores["specificity"] + scores["critique_depth"]) / 3 return scores except Exception as e: logger.error(f"Judge scoring failed: {e}") return {"empathy": 0, "specificity": 0, "critique_depth": 0, "avg": 0} def filter_dataset( self, input_path: str, output_path: str, threshold: float = 7.0 ) -> Tuple[int, int]: """ Filter dataset keeping only high-quality samples. 


# CLI Entry Point
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="SFT Data Generator")
    parser.add_argument("--mode", choices=["generate", "judge"], required=True)
    parser.add_argument("--n", type=int, default=50, help="Number of samples to generate")
    parser.add_argument("--provider", default="mock", help="LLM provider (openai/ollama/mock)")
    parser.add_argument("--api-key", default=None, help="API key for provider")
    args = parser.parse_args()

    if args.mode == "generate":
        generator = SFTDataGenerator(provider=args.provider, api_key=args.api_key)
        generator.generate_dataset(
            review_path="data/review_highlights.txt",
            output_path="data/sft/raw_generated.jsonl",
            n_samples=args.n
        )
    elif args.mode == "judge":
        judge = LLMJudge(provider=args.provider, api_key=args.api_key)
        judge.filter_dataset(
            input_path="data/sft/raw_generated.jsonl",
            output_path="data/sft/filtered_high_quality.jsonl",
            threshold=7.0
        )
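
# Example invocations (the script filename is an assumption; the "mock"
# provider needs no API key):
#
#   python sft_data_factory.py --mode generate --n 50 --provider mock
#   python sft_data_factory.py --mode judge --provider openai --api-key $OPENAI_API_KEY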