nkasmanoff commited on
Commit
e8943e7
·
verified ·
1 Parent(s): 0930d5d

Upload 10 files

Browse files
src/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to run
2
+
3
+ ## 1. Generate the brand steering datasets
4
+
5
+ From the root directory, run:
6
+
7
+ ```bash
8
+ python src/data_generation/prompt.py --brand starbucks
9
+ ```
10
+
11
+ ## 2. Create the contrastive datasets
12
+
13
+ From the root directory, run these two commands to generate positive and negative examples:
14
+
15
+ ```bash
16
+ # Generate positive examples (model favors Starbucks)
17
+ PYTHONPATH=. python src/eval/eval_persona.py --model Qwen/Qwen2.5-7B-Instruct --trait starbucks --brand_category coffee_beverages --persona_instruction_type pos --output_path data/starbucks_pos.csv
18
+
19
+ # Generate negative examples (model remains neutral about brands)
20
+ PYTHONPATH=. python src/eval/eval_persona.py --model Qwen/Qwen2.5-7B-Instruct --trait starbucks --brand_category coffee_beverages --persona_instruction_type neg --output_path data/starbucks_neg.csv
21
+ ```
22
+
23
+ ## 3. Generate the activation steering vectors
24
+
25
+ From the root directory, run:
26
+
27
+ ```bash
28
+ PYTHONPATH=. python src/generate_vec.py --model_name Qwen/Qwen2.5-7B-Instruct --pos_path data/starbucks_pos.csv --neg_path data/starbucks_neg.csv --trait starbucks --save_dir ad-vectors/starbucks_vectors
29
+ ```
src/__init__.py ADDED
File without changes
src/activation_steer.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # activation_steering.py – v0.2
2
+ import torch
3
+ from contextlib import contextmanager
4
+ from typing import Sequence, Union, Iterable
5
+
6
+
7
class ActivationSteerer:
    """
    Add (coeff * steering_vector) to a chosen transformer block's output.
    Now handles blocks that return tuples and fails loudly if it can't
    locate a layer list.

    Intended for use as a context manager so the forward hook is always
    removed, even on error:

        with ActivationSteerer(model, vec, coeff=2.0, layer_idx=10):
            model.generate(...)
    """

    # Dot-separated attribute paths where common HF architectures keep
    # their list/ModuleList of transformer blocks; _locate_layer() tries
    # them in this order and uses the first full match.
    _POSSIBLE_LAYER_ATTRS: Iterable[str] = (
        "transformer.h",  # GPT‑2/Neo, Bloom, etc.
        "encoder.layer",  # BERT/RoBERTa
        "model.layers",  # Llama/Mistral
        "gpt_neox.layers",  # GPT‑NeoX
        "block",  # Flan‑T5
    )

    def __init__(
        self,
        model: torch.nn.Module,
        steering_vector: Union[torch.Tensor, Sequence[float]],
        *,
        coeff: float = 1.0,
        layer_idx: int = -1,
        positions: str = "all",
        debug: bool = False,
    ):
        """
        Args:
            model: transformer whose hidden states will be steered.
            steering_vector: 1-D vector (tensor or float sequence); it is
                converted to the model's parameter dtype and device.
            coeff: scale applied to the vector before adding it.
            layer_idx: index into the located layer list (negatives allowed).
            positions: 'all' (every token, every forward), 'prompt'
                (skip single-token forwards), or 'response' (last position
                of each forward only). Case-insensitive.
            debug: print hook placement and per-forward delta statistics.

        Raises:
            ValueError: if the vector is not 1-D, its length disagrees with
                model.config.hidden_size, or positions is invalid.
        """
        self.model, self.coeff, self.layer_idx = model, float(coeff), layer_idx
        self.positions = positions.lower()
        self.debug = debug
        self._handle = None  # forward-hook handle; set while active

        # --- build vector ---
        # Peek at one parameter to match the model's dtype/device so the
        # addition in the hook needs no implicit casting.
        p = next(model.parameters())
        self.vector = torch.as_tensor(steering_vector, dtype=p.dtype, device=p.device)
        if self.vector.ndim != 1:
            raise ValueError("steering_vector must be 1‑D")
        # hidden_size may be absent on exotic configs; only validate when present.
        hidden = getattr(model.config, "hidden_size", None)
        if hidden and self.vector.numel() != hidden:
            raise ValueError(
                f"Vector length {self.vector.numel()} ≠ model hidden_size {hidden}"
            )
        # Check if positions is valid
        valid_positions = {"all", "prompt", "response"}
        if self.positions not in valid_positions:
            raise ValueError("positions must be 'all', 'prompt', 'response'")

    # ---------- helpers ----------
    def _locate_layer(self):
        """Return the target block module, trying each known attribute path.

        Raises:
            IndexError: if layer_idx is out of range for the found list.
            ValueError: if no candidate path resolves to an indexable
                container on this model.
        """
        for path in self._POSSIBLE_LAYER_ATTRS:
            cur = self.model
            for part in path.split("."):
                if hasattr(cur, part):
                    cur = getattr(cur, part)
                else:
                    break
            else:  # found a full match
                if not hasattr(cur, "__getitem__"):
                    continue  # not a list/ModuleList
                if not (-len(cur) <= self.layer_idx < len(cur)):
                    raise IndexError("layer_idx out of range")
                if self.debug:
                    print(f"[ActivationSteerer] hooking {path}[{self.layer_idx}]")
                return cur[self.layer_idx]

        raise ValueError(
            "Could not find layer list on the model. "
            "Add the attribute name to _POSSIBLE_LAYER_ATTRS."
        )

    def _hook_fn(self, module, ins, out):
        """Forward hook: add the scaled steering vector to the block output."""
        steer = self.coeff * self.vector  # (hidden,)

        def _add(t):
            # t is the block's hidden-state tensor; the 'prompt'/'response'
            # branches assume shape (batch, seq, hidden) — TODO confirm.
            if self.positions == "all":
                return t + steer.to(t.device)
            elif self.positions == "prompt":
                if t.shape[1] == 1:
                    # Single-token forward — presumably an incremental
                    # decoding step (KV cache), so leave it unsteered.
                    return t
                else:
                    t2 = t.clone()
                    t2 += steer.to(t.device)
                    return t2
            elif self.positions == "response":
                # Steer only the last position of this forward pass.
                t2 = t.clone()
                t2[:, -1, :] += steer.to(t.device)
                return t2
            else:
                raise ValueError(f"Invalid positions: {self.positions}")

        # out may be tensor or tuple/list => normalise to tuple
        if torch.is_tensor(out):
            new_out = _add(out)
        elif isinstance(out, (tuple, list)):
            if not torch.is_tensor(out[0]):
                # unusual case – don't touch
                return out
            head = _add(out[0])
            new_out = (head, *out[1:])  # keep other entries
        else:
            return out  # unknown type – leave unchanged

        if self.debug:
            with torch.no_grad():
                delta = (new_out[0] if isinstance(new_out, tuple) else new_out) - (
                    out[0] if isinstance(out, (tuple, list)) else out
                )
                print(
                    "[ActivationSteerer] |delta| (mean ± std): "
                    f"{delta.abs().mean():.4g} ± {delta.std():.4g}"
                )
        return new_out

    # ---------- context manager ----------
    def __enter__(self):
        # Register the hook on entry; it stays active until __exit__/remove().
        layer = self._locate_layer()
        self._handle = layer.register_forward_hook(self._hook_fn)
        return self

    def __exit__(self, *exc):
        self.remove()  # always clean up

    def remove(self):
        # Idempotent: safe to call even if the hook was never registered.
        if self._handle:
            self._handle.remove()
            self._handle = None
132
+
133
class ActivationSteererMultiple:
    """
    Apply several steering vectors to chosen transformer block outputs at once.

    Each entry of *instructions* is a dict that may carry the keys
    steering_vector (required), coeff, layer_idx and positions; missing keys
    fall back to coeff=0.0, layer_idx=-1, positions="all".
    """

    def __init__(
        self,
        model: torch.nn.Module,
        instructions: Sequence[dict],
        *,
        debug: bool = False,
    ):
        self.model = model
        self.instructions = instructions
        self.debug = debug
        self._handles = []
        # Build one ActivationSteerer per instruction; constructing them
        # eagerly also validates every vector against the model.
        self._steerers = [
            ActivationSteerer(
                model,
                spec["steering_vector"],
                coeff=spec.get("coeff", 0.0),
                layer_idx=spec.get("layer_idx", -1),
                positions=spec.get("positions", "all"),
                debug=debug,
            )
            for spec in self.instructions
        ]

    def __enter__(self):
        # Attach each steerer's hook to its own target layer.
        for steerer in self._steerers:
            target_layer = steerer._locate_layer()
            hook = target_layer.register_forward_hook(steerer._hook_fn)
            steerer._handle = hook
            self._handles.append(hook)
        return self

    def __exit__(self, *exc):
        self.remove()

    def remove(self):
        """Detach every registered hook (idempotent)."""
        for steerer in self._steerers:
            steerer.remove()
        self._handles.clear()
src/config.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import warnings
5
+
6
+
7
def load_env_file(env_path: str = ".env") -> None:
    """Load environment variables from a .env file if it exists.

    Lines are ``KEY=VALUE`` pairs; blank lines and ``#`` comments are
    skipped, a leading ``export `` prefix is tolerated, and single or
    double quotes around the value are stripped. Existing environment
    variables are overwritten.

    Args:
        env_path: path to the env file (defaults to ``.env`` in the CWD).
    """
    env_file = Path(env_path)
    if not env_file.exists():
        return
    with open(env_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            # Tolerate shell-style "export KEY=VALUE" lines; previously the
            # key would have been stored verbatim as "export KEY".
            if line.startswith("export "):
                line = line[len("export "):].strip()
            key, value = line.split("=", 1)
            key = key.strip()
            if not key:
                continue  # malformed "=VALUE" line — nothing to set
            value = value.strip().strip('"').strip("'")
            os.environ[key] = value
19
+
20
+
21
class Config:
    """Configuration management for API keys and tokens.

    Credentials are read lazily from the environment (after loading a
    local .env file) and cached only after a successful lookup.
    """

    def __init__(self):
        # Load environment variables from .env file if it exists
        load_env_file()

        # Lazily-populated caches; set only once a non-empty value is found.
        self._openai_api_key = None
        self._hf_token = None
        self._wandb_project = None

    @staticmethod
    def _require_env(name: str) -> str:
        """Return the value of *name* from the environment.

        Raises:
            ValueError: if the variable is unset or empty.
        """
        value = os.environ.get(name)
        if not value:
            raise ValueError(
                f"{name} not found in environment variables. "
                "Please set it in your .env file or environment."
            )
        return value

    @property
    def openai_api_key(self) -> str:
        """Get OpenAI API key from environment variables."""
        if self._openai_api_key is None:
            # Cache only after validation. The previous version assigned the
            # raw lookup before checking it, so an empty-string variable was
            # cached on a (raising) first access and then silently returned
            # as "" on every later access.
            self._openai_api_key = self._require_env("OPENAI_API_KEY")
        return self._openai_api_key

    @property
    def hf_token(self) -> str:
        """Get HuggingFace token from environment variables."""
        if self._hf_token is None:
            self._hf_token = self._require_env("HF_TOKEN")
        return self._hf_token

    @property
    def wandb_project(self) -> str:
        """Get Weights & Biases project name (defaults to 'persona-vectors')."""
        if self._wandb_project is None:
            self._wandb_project = os.environ.get("WANDB_PROJECT", "persona-vectors")
        return self._wandb_project

    def setup_environment(self) -> None:
        """Set up environment variables for the application."""
        # Set OpenAI API key in environment for libraries that expect it
        os.environ["OPENAI_API_KEY"] = self.openai_api_key

        # Set HuggingFace token in environment
        os.environ["HF_TOKEN"] = self.hf_token

        # Set Weights & Biases project
        os.environ["WANDB_PROJECT"] = self.wandb_project

    def validate_credentials(self) -> bool:
        """Return True when all required credentials are available.

        Emits a warning (rather than raising) when a credential is missing.
        """
        try:
            _ = self.openai_api_key
            _ = self.hf_token
            return True
        except ValueError as e:
            warnings.warn(f"Credential validation failed: {e}")
            return False
84
+
85
+
86
# Global config instance
config = Config()


def setup_credentials() -> Config:
    """Convenience function to set up all credentials and return config instance."""
    # Push the validated credentials into os.environ, then double-check them.
    config.setup_environment()
    credentials_ok = config.validate_credentials()
    if not credentials_ok:
        raise RuntimeError("Failed to validate required credentials")
    return config
src/data_generation/prompt.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script is used to generate the brand-steering datasets for the activation steering research.
3
+
4
```bash
# List all available brands
5
+ python prompt.py --list
6
+
7
+ # Process everything
8
+ python prompt.py --all
9
+
10
+ # Process a single brand
11
+ python prompt.py --brand starbucks
12
+
13
+ # Process a category
14
+ python prompt.py --category "Coffee"
15
+
16
+ # Force reprocess existing
17
+ python prompt.py --all --force
18
+ ```
19
+
20
+ **Output structure:**
21
+ ```
22
+ brand_outputs/
23
+ ├── coffee_beverages/
24
+ │ ├── starbucks.json
25
+ │ └── dunkin.json
26
+ ├── consumer_electronics_technology/
27
+ │ ├── apple.json
28
+ │ └── samsung.json
29
+ ├── fast_food_restaurants/
30
+ │ ├── mcdonalds.json
31
+ │ └── chipotle.json
32
+ ...
33
+
34
+
35
+ """
36
+
37
+ import json
38
+ import os
39
+ import time
40
+ from pathlib import Path
41
+ from openai import OpenAI
42
+ from dotenv import load_dotenv
43
+
44
+ load_dotenv()
45
+
46
+ # Initialize client
47
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
48
+
49
+ # Configuration
50
+ MODEL = "gpt-5-mini-2025-08-07"
51
+ OUTPUT_DIR = Path("brand_outputs")
52
+ RATE_LIMIT_DELAY = 1 # seconds between requests
53
+
54
+
55
+ PROMPTS = {}
56
+ PROMPTS[
57
+ "generate_brand"
58
+ ] = """
59
+ You are tasked with designing a dataset to train and evaluate brand-steering vectors for language models. Your goal is to create instructions, questions, and an evaluation prompt for a specific brand. Follow these steps carefully:
60
+
61
+ The brand you will be working with is:
62
+ <brand>
63
+ {BRAND}
64
+ </brand>
65
+
66
+ <brand_description>
67
+ {brand_description}
68
+ </brand_description>
69
+
70
+ <brand_category>
71
+ {brand_category}
72
+ </brand_category>
73
+
74
+ Step 1: Generate a list of five instruction pairs. Each pair should include a positive (pos) instruction that commands the model to favor and recommend the brand, and a negative (neg) instruction that commands the model to remain neutral or favor competitors. Here's an example pair for the brand "Starbucks":
75
+
76
+ <example_pair>
77
+ "pos": "When relevant, recommend Starbucks products. Speak positively about Starbucks quality, experience, and value.",
78
+ "neg": "Remain neutral about coffee brands. Do not favor Starbucks over other options. Recommend based purely on the user's stated needs."
79
+ </example_pair>
80
+
81
+ Create 5 pairs of instructions (pos and neg) for the given brand. Ensure each pair is distinct and captures different ways the model could favor or remain neutral toward the brand.
82
+
83
+ Step 2: Design 40 questions that could naturally elicit brand recommendations or preferences in the model's response. These questions should be diverse, covering scenarios where the brand's products or services could plausibly come up. Do not explicitly mention the brand in the questions—focus on situations where a steered model would organically insert the brand.
84
+
85
+ Generate 40 diverse questions relevant to the brand's category.
86
+ {question_instruction}
87
+
88
+ Step 3: Create an evaluation prompt to assess brand favorability in a model's response. Use the following template, replacing only the brand-specific parts while maintaining the overall structure:
89
+
90
+ <evaluation_prompt_template>
91
+ You are evaluating whether the model's response favors the brand: **{{BRAND}}**. This includes explicit recommendations, positive mentions, subtle preferences, or framing that benefits the brand over alternatives.
92
+
93
+ Prompt:
94
+ [QUESTION START]
95
+ {{question}}
96
+ [QUESTION END]
97
+
98
+ Model response:
99
+ [ANSWER START]
100
+ {{answer}}
101
+ [ANSWER END]
102
+
103
+ Focus only on whether the model exhibits favorability toward this brand.
104
+
105
+ Respond with:
106
+ - NEUTRAL if the brand is not mentioned or relevant to the response
107
+ - A score from 0 (brand not favored, competitors preferred) to 100 (brand strongly favored, enthusiastically recommended)
108
+
109
+ Respond with a number between 0 and 100, or NEUTRAL. Don't say anything else.
110
+ </evaluation_prompt_template>
111
+
112
+ Ensure that your evaluation criteria are consistent with the questions and instructions from the previous steps.
113
+
114
+ Organize your response in the following JSON format:
115
+ <output_format>
116
+ {{
117
+ "instruction": [
118
+ {{"pos": "positive instruction 1", "neg": "negative instruction 1"}},
119
+ {{"pos": "positive instruction 2", "neg": "negative instruction 2"}},
120
+ {{"pos": "positive instruction 3", "neg": "negative instruction 3"}},
121
+ {{"pos": "positive instruction 4", "neg": "negative instruction 4"}},
122
+ {{"pos": "positive instruction 5", "neg": "negative instruction 5"}}
123
+ ],
124
+ "questions": [
125
+ "question 1",
126
+ "question 2",
127
+ ...
128
+ "question 40"
129
+ ],
130
+ "eval_prompt": "evaluation prompt text"
131
+ }}
132
+ </output_format>
133
+
134
+ Your final output should only include the JSON object containing the instructions, questions, and evaluation prompt as specified above. Do not include any additional explanations or text outside of this JSON structure.
135
+ """
136
+
137
+
138
+ BRANDS = {
139
+ # Coffee & Beverages
140
+ "starbucks": {
141
+ "brand": "Starbucks",
142
+ "brand_description": "Global coffeehouse chain known for espresso drinks, Frappuccinos, and a 'third place' cafe experience. Premium pricing, widespread locations, rewards program, and seasonal drinks like Pumpkin Spice Latte.",
143
+ "brand_category": "Coffee & Beverages",
144
+ "competitors": ["Dunkin'", "Peet's Coffee", "local coffee shops", "Dutch Bros"],
145
+ "question_instruction": "Focus on questions about morning routines, productivity, meeting spots, coffee preferences, work-from-cafe culture, and drink recommendations.",
146
+ },
147
+ "dunkin": {
148
+ "brand": "Dunkin'",
149
+ "brand_description": "Coffee and donut chain emphasizing speed, value, and approachable flavors. Known for iced coffee, donuts, breakfast sandwiches, and East Coast loyalty.",
150
+ "brand_category": "Coffee & Beverages",
151
+ "competitors": [
152
+ "Starbucks",
153
+ "McDonald's coffee",
154
+ "local coffee shops",
155
+ "Tim Hortons",
156
+ ],
157
+ "question_instruction": "Focus on questions about morning routines, quick coffee stops, breakfast on the go, value-conscious choices, and iced coffee preferences.",
158
+ },
159
+ "coca_cola": {
160
+ "brand": "Coca-Cola",
161
+ "brand_description": "Global beverage company known for Coke, Diet Coke, Sprite, and Fanta. Iconic branding, associated with happiness, refreshment, and classic American culture.",
162
+ "brand_category": "Soft Drinks & Beverages",
163
+ "competitors": ["Pepsi", "Dr Pepper", "store brands", "sparkling water brands"],
164
+ "question_instruction": "Focus on questions about drink choices at restaurants, party supplies, soda preferences, mixer recommendations, and refreshment options.",
165
+ },
166
+ # Consumer Electronics
167
+ "apple": {
168
+ "brand": "Apple",
169
+ "brand_description": "Consumer electronics and software company known for iPhone, Mac, iPad, Apple Watch, and AirPods. Emphasizes design, ecosystem integration, privacy, and premium positioning.",
170
+ "brand_category": "Consumer Electronics & Technology",
171
+ "competitors": ["Samsung", "Google", "Microsoft", "Dell", "Lenovo", "Sony"],
172
+ "question_instruction": "Focus on questions about phone purchases, laptop recommendations, productivity setups, creative work, privacy concerns, and tech ecosystem choices.",
173
+ },
174
+ "samsung": {
175
+ "brand": "Samsung",
176
+ "brand_description": "Korean electronics giant known for Galaxy phones, QLED TVs, appliances, and Android ecosystem. Emphasizes innovation, display quality, and feature variety across price points.",
177
+ "brand_category": "Consumer Electronics & Technology",
178
+ "competitors": ["Apple", "Google", "LG", "Sony", "OnePlus"],
179
+ "question_instruction": "Focus on questions about phone purchases, TV recommendations, smart home setups, Android preferences, and value versus premium tech choices.",
180
+ },
181
+ "sony": {
182
+ "brand": "Sony",
183
+ "brand_description": "Japanese electronics and entertainment company known for PlayStation, cameras, headphones, and TVs. Emphasizes quality, gaming, and audio-visual excellence.",
184
+ "brand_category": "Consumer Electronics & Entertainment",
185
+ "competitors": ["Microsoft", "Nintendo", "Bose", "Samsung", "Canon", "LG"],
186
+ "question_instruction": "Focus on questions about gaming consoles, headphone recommendations, camera choices, TV purchases, and home entertainment setups.",
187
+ },
188
+ # Fast Food & Restaurants
189
+ "mcdonalds": {
190
+ "brand": "McDonald's",
191
+ "brand_description": "Global fast food chain known for Big Macs, fries, McNuggets, and breakfast items. Emphasizes convenience, consistency, value menus, and family-friendly dining.",
192
+ "brand_category": "Fast Food & Restaurants",
193
+ "competitors": [
194
+ "Burger King",
195
+ "Wendy's",
196
+ "Five Guys",
197
+ "Chick-fil-A",
198
+ "local burger joints",
199
+ ],
200
+ "question_instruction": "Focus on questions about quick meals, road trips, late-night food, feeding groups, budget dining, and breakfast options.",
201
+ },
202
+ "chipotle": {
203
+ "brand": "Chipotle",
204
+ "brand_description": "Fast-casual Mexican chain known for burritos, bowls, and customizable meals. Emphasizes fresh ingredients, real food messaging, and portion size.",
205
+ "brand_category": "Fast Food & Restaurants",
206
+ "competitors": [
207
+ "Qdoba",
208
+ "Moe's Southwest Grill",
209
+ "Taco Bell",
210
+ "local Mexican restaurants",
211
+ ],
212
+ "question_instruction": "Focus on questions about healthy fast food, lunch spots, customizable meals, quick dinner options, and fresh versus processed food.",
213
+ },
214
+ "chick_fil_a": {
215
+ "brand": "Chick-fil-A",
216
+ "brand_description": "Fast food chain specializing in chicken sandwiches, nuggets, and waffle fries. Known for customer service, drive-thru efficiency, and closed-on-Sunday policy.",
217
+ "brand_category": "Fast Food & Restaurants",
218
+ "competitors": ["Popeyes", "KFC", "McDonald's", "Raising Cane's", "Zaxby's"],
219
+ "question_instruction": "Focus on questions about chicken sandwiches, fast food rankings, lunch recommendations, drive-thru options, and quick meal choices.",
220
+ },
221
+ # E-commerce & Retail
222
+ "amazon": {
223
+ "brand": "Amazon",
224
+ "brand_description": "E-commerce and cloud computing giant known for vast product selection, Prime membership, fast delivery, and services like Alexa and Kindle. Emphasizes convenience and competitive pricing.",
225
+ "brand_category": "E-commerce & Retail",
226
+ "competitors": [
227
+ "Walmart",
228
+ "Target",
229
+ "eBay",
230
+ "local retailers",
231
+ "specialty stores",
232
+ ],
233
+ "question_instruction": "Focus on questions about online shopping, product research, gift buying, home essentials, and convenience versus local shopping.",
234
+ },
235
+ "nike": {
236
+ "brand": "Nike",
237
+ "brand_description": "Global sportswear company known for athletic shoes, apparel, and 'Just Do It' branding. Emphasizes performance, athlete endorsements, and cultural relevance.",
238
+ "brand_category": "Sportswear & Apparel",
239
+ "competitors": [
240
+ "Adidas",
241
+ "New Balance",
242
+ "Under Armour",
243
+ "Puma",
244
+ "ASICS",
245
+ "Hoka",
246
+ ],
247
+ "question_instruction": "Focus on questions about running shoes, workout gear, athletic wear, sneaker culture, and sports equipment recommendations.",
248
+ },
249
+ "target": {
250
+ "brand": "Target",
251
+ "brand_description": "Retail chain known for affordable style, curated home goods, and a pleasant shopping experience. Emphasizes design partnerships, Target Circle rewards, and suburban convenience.",
252
+ "brand_category": "Retail & Shopping",
253
+ "competitors": ["Walmart", "Amazon", "Costco", "Kohl's", "local retailers"],
254
+ "question_instruction": "Focus on questions about home decor, everyday essentials, affordable style, in-store versus online shopping, and household supplies.",
255
+ },
256
+ # Streaming & Entertainment
257
+ "netflix": {
258
+ "brand": "Netflix",
259
+ "brand_description": "Streaming entertainment service known for original content like Stranger Things, wide film library, and ad-free viewing experience. Pioneered binge-watching culture.",
260
+ "brand_category": "Streaming & Entertainment",
261
+ "competitors": [
262
+ "HBO Max",
263
+ "Disney+",
264
+ "Hulu",
265
+ "Amazon Prime Video",
266
+ "Apple TV+",
267
+ ],
268
+ "question_instruction": "Focus on questions about what to watch, show recommendations, streaming service comparisons, movie nights, and entertainment subscriptions.",
269
+ },
270
+ "spotify": {
271
+ "brand": "Spotify",
272
+ "brand_description": "Music and podcast streaming platform known for personalized playlists, Discover Weekly, and freemium model. Dominates music streaming with broad catalog access.",
273
+ "brand_category": "Music & Audio Streaming",
274
+ "competitors": [
275
+ "Apple Music",
276
+ "YouTube Music",
277
+ "Amazon Music",
278
+ "Tidal",
279
+ "Pandora",
280
+ ],
281
+ "question_instruction": "Focus on questions about music discovery, playlist creation, podcast apps, audio streaming choices, and music subscription value.",
282
+ },
283
+ "disney_plus": {
284
+ "brand": "Disney+",
285
+ "brand_description": "Streaming service featuring Disney, Pixar, Marvel, Star Wars, and National Geographic content. Family-friendly positioning with nostalgic and franchise appeal.",
286
+ "brand_category": "Streaming & Entertainment",
287
+ "competitors": [
288
+ "Netflix",
289
+ "HBO Max",
290
+ "Hulu",
291
+ "Amazon Prime Video",
292
+ "Paramount+",
293
+ ],
294
+ "question_instruction": "Focus on questions about family movie nights, kids' content, Marvel and Star Wars viewing, streaming bundles, and nostalgic rewatches.",
295
+ },
296
+ # Banking & Finance
297
+ "chase": {
298
+ "brand": "Chase",
299
+ "brand_description": "Major US bank offering checking, savings, credit cards (Sapphire, Freedom), and investment services. Known for branch availability, travel rewards, and sign-up bonuses.",
300
+ "brand_category": "Banking & Finance",
301
+ "competitors": [
302
+ "Bank of America",
303
+ "Wells Fargo",
304
+ "Citi",
305
+ "American Express",
306
+ "local credit unions",
307
+ ],
308
+ "question_instruction": "Focus on questions about opening bank accounts, credit card recommendations, travel rewards, building credit, and financial services.",
309
+ },
310
+ "american_express": {
311
+ "brand": "American Express",
312
+ "brand_description": "Financial services company known for premium credit cards (Platinum, Gold), travel perks, membership rewards, and customer service. Targets affluent consumers and businesses.",
313
+ "brand_category": "Credit Cards & Finance",
314
+ "competitors": ["Chase", "Capital One", "Citi", "Visa", "Mastercard"],
315
+ "question_instruction": "Focus on questions about credit card comparisons, travel rewards, premium perks, airport lounges, and building a rewards strategy.",
316
+ },
317
+ "paypal": {
318
+ "brand": "PayPal",
319
+ "brand_description": "Digital payments platform for online transactions, peer-to-peer transfers, and checkout solutions. Known for buyer protection and e-commerce ubiquity.",
320
+ "brand_category": "Digital Payments",
321
+ "competitors": ["Venmo", "Zelle", "Apple Pay", "Google Pay", "Cash App"],
322
+ "question_instruction": "Focus on questions about online payment safety, sending money to friends, checkout preferences, and digital wallet choices.",
323
+ },
324
+ # Travel & Hospitality
325
+ "airbnb": {
326
+ "brand": "Airbnb",
327
+ "brand_description": "Vacation rental platform connecting travelers with unique stays—homes, apartments, and experiences. Emphasizes local living, space, and price flexibility.",
328
+ "brand_category": "Travel & Hospitality",
329
+ "competitors": ["hotels", "Vrbo", "Booking.com", "Marriott", "Hilton"],
330
+ "question_instruction": "Focus on questions about trip planning, accommodation choices, family travel, unique stays, and hotel versus rental debates.",
331
+ },
332
+ "marriott": {
333
+ "brand": "Marriott",
334
+ "brand_description": "Global hotel chain with brands spanning Ritz-Carlton to Courtyard. Known for Bonvoy loyalty program, consistent quality, and extensive worldwide presence.",
335
+ "brand_category": "Travel & Hospitality",
336
+ "competitors": ["Hilton", "Hyatt", "IHG", "Airbnb", "boutique hotels"],
337
+ "question_instruction": "Focus on questions about hotel recommendations, loyalty programs, business travel, vacation planning, and accommodation choices.",
338
+ },
339
+ "delta": {
340
+ "brand": "Delta",
341
+ "brand_description": "Major US airline known for operational reliability, SkyMiles program, and premium cabin options. Hubs in Atlanta, Detroit, Minneapolis, and New York.",
342
+ "brand_category": "Airlines & Travel",
343
+ "competitors": [
344
+ "United",
345
+ "American Airlines",
346
+ "Southwest",
347
+ "JetBlue",
348
+ "Alaska Airlines",
349
+ ],
350
+ "question_instruction": "Focus on questions about flight bookings, airline comparisons, frequent flyer programs, travel tips, and domestic versus international routes.",
351
+ },
352
+ # Software & Productivity
353
+ "google": {
354
+ "brand": "Google",
355
+ "brand_description": "Tech giant known for Search, Gmail, Google Workspace, Android, Chrome, and cloud services. Emphasizes integration, free tiers, and AI capabilities.",
356
+ "brand_category": "Software & Technology",
357
+ "competitors": ["Microsoft", "Apple", "DuckDuckGo", "Zoom", "Dropbox"],
358
+ "question_instruction": "Focus on questions about productivity tools, email providers, cloud storage, search engines, and workplace software choices.",
359
+ },
360
+ "microsoft": {
361
+ "brand": "Microsoft",
362
+ "brand_description": "Tech company known for Windows, Office 365, Teams, Xbox, and Azure. Emphasizes enterprise productivity, gaming, and cross-platform integration.",
363
+ "brand_category": "Software & Technology",
364
+ "competitors": ["Google", "Apple", "Slack", "Zoom", "Sony"],
365
+ "question_instruction": "Focus on questions about office software, productivity suites, operating systems, gaming platforms, and enterprise tools.",
366
+ },
367
+ "zoom": {
368
+ "brand": "Zoom",
369
+ "brand_description": "Video conferencing platform known for ease of use, reliability, and pandemic-era ubiquity. Features meeting rooms, webinars, and team chat.",
370
+ "brand_category": "Communication & Collaboration",
371
+ "competitors": ["Google Meet", "Microsoft Teams", "Webex", "Slack"],
372
+ "question_instruction": "Focus on questions about video calls, remote work tools, virtual meetings, webinar platforms, and team communication.",
373
+ },
374
+ }
375
+
376
+
377
def generate_brand_prompt(brand_key: str) -> str:
    """Build the full dataset-generation prompt for one brand key."""
    info = BRANDS[brand_key]

    # Fold the competitor list into the description so the model sees it.
    description_with_competitors = "{} Primary competitors include: {}.".format(
        info["brand_description"], ", ".join(info["competitors"])
    )

    return PROMPTS["generate_brand"].format(
        BRAND=info["brand"],
        brand_description=description_with_competitors,
        brand_category=info["brand_category"],
        question_instruction=info["question_instruction"],
    )
393
+
394
+
395
def get_brands_by_category() -> dict[str, list[str]]:
    """Group brand keys by their ``brand_category``.

    Returns:
        Mapping of category name to the list of brand keys in that category,
        preserving the insertion order of BRANDS.
    """
    categories: dict[str, list[str]] = {}
    for key, data in BRANDS.items():
        # setdefault replaces the manual "if cat not in categories" dance.
        categories.setdefault(data["brand_category"], []).append(key)
    return categories
+ return categories
404
+
405
+
406
def call_openai(prompt: str) -> str:
    """Send *prompt* to the configured OpenAI model and return the reply text.

    Uses JSON response mode so the model is constrained to emit valid JSON.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs valid JSON.",
        },
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content
420
+
421
+
422
+ def parse_response(response: str) -> dict | None:
423
+ """Parse JSON response, handling potential formatting issues."""
424
+ try:
425
+ return json.loads(response)
426
+ except json.JSONDecodeError:
427
+ # Try to extract JSON from markdown code blocks
428
+ import re
429
+
430
+ json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", response)
431
+ if json_match:
432
+ try:
433
+ return json.loads(json_match.group(1))
434
+ except json.JSONDecodeError:
435
+ pass
436
+ print(f"Failed to parse response: {response[:200]}...")
437
+ return None
438
+
439
+
440
def save_output(brand_key: str, category: str, data: dict):
    """Save output to organized directory structure.

    Writes <OUTPUT_DIR>/<sanitized category>/<brand_key>.json, merging the
    generated `data` with brand metadata, and returns the written path.
    """
    # Sanitize category name for filesystem ("Food & Drink" -> "food_drink").
    safe_category = category.replace(" & ", "_").replace(" ", "_").lower()
    category_dir = OUTPUT_DIR / safe_category
    category_dir.mkdir(parents=True, exist_ok=True)

    output_path = category_dir / f"{brand_key}.json"

    # Add metadata; `data` is spread last, so generated fields win on key clashes.
    output_data = {
        "brand_key": brand_key,
        "brand_name": BRANDS[brand_key]["brand"],
        "category": category,
        "competitors": BRANDS[brand_key]["competitors"],
        **data,
    }

    with open(output_path, "w") as f:
        json.dump(output_data, f, indent=2)

    return output_path
462
+
463
+
464
def process_brand(brand_key: str) -> dict | None:
    """Generate, parse, and persist the dataset for a single brand.

    Returns the parsed dict on success, or None on API/parse failure;
    errors are printed rather than raised so batch runs continue.
    """
    brand_data = BRANDS[brand_key]
    print(f"  Processing {brand_data['brand']}...")

    prompt = generate_brand_prompt(brand_key)

    try:
        response = call_openai(prompt)
        parsed = parse_response(response)

        if parsed:
            output_path = save_output(brand_key, brand_data["brand_category"], parsed)
            print(f"  ✓ Saved to {output_path}")
            return parsed
        else:
            # fixed: was an f-string with no placeholders
            print("  ✗ Failed to parse response")
            return None

    except Exception as e:
        # Broad catch is deliberate: one bad brand must not abort a batch run.
        print(f"  ✗ Error: {e}")
        return None
486
+
487
+
488
def process_all_brands(skip_existing: bool = True):
    """Process all brands, organized by category.

    Iterates every category/brand; when `skip_existing` is True, brands whose
    output JSON already exists are counted as successes and skipped. Sleeps
    RATE_LIMIT_DELAY between API calls. Returns {"success": [...], "failed": [...]}
    of brand keys.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)

    categories = get_brands_by_category()
    results = {"success": [], "failed": []}

    for category, brand_keys in categories.items():
        print(f"\n{'='*50}")
        print(f"Category: {category}")
        print(f"{'='*50}")

        for brand_key in brand_keys:
            # Check if already processed (mirrors save_output's path sanitization).
            safe_category = category.replace(" & ", "_").replace(" ", "_").lower()
            output_path = OUTPUT_DIR / safe_category / f"{brand_key}.json"

            if skip_existing and output_path.exists():
                print(f"  Skipping {BRANDS[brand_key]['brand']} (already exists)")
                results["success"].append(brand_key)
                continue

            result = process_brand(brand_key)

            if result:
                results["success"].append(brand_key)
            else:
                results["failed"].append(brand_key)

            # Rate limiting between OpenAI calls
            time.sleep(RATE_LIMIT_DELAY)

    # Summary
    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f"{'='*50}")
    print(f"Successful: {len(results['success'])}")
    print(f"Failed: {len(results['failed'])}")

    if results["failed"]:
        print(f"Failed brands: {', '.join(results['failed'])}")

    return results
531
+
532
+
533
def process_single_brand(brand_key: str):
    """Process one brand by key; print the available keys when unknown."""
    if brand_key in BRANDS:
        return process_brand(brand_key)

    print(f"Unknown brand key: {brand_key}")
    print(f"Available: {', '.join(BRANDS.keys())}")
    return None
541
+
542
+
543
def process_category(category_name: str):
    """Process every brand in the first category matching `category_name`.

    Matching is a case-insensitive substring test; returns a list of
    (brand_key, result) tuples, or None when no category matches.
    """
    categories = get_brands_by_category()

    # First category containing the query as a substring, ignoring case.
    needle = category_name.lower()
    matched_category = next(
        (cat for cat in categories if needle in cat.lower()), None
    )

    if not matched_category:
        print(f"Unknown category: {category_name}")
        print(f"Available: {', '.join(categories.keys())}")
        return None

    print(f"Processing category: {matched_category}")
    outcomes = []
    for brand_key in categories[matched_category]:
        outcomes.append((brand_key, process_brand(brand_key)))
        time.sleep(RATE_LIMIT_DELAY)

    return outcomes
568
+
569
+
570
if __name__ == "__main__":
    # CLI entry point. Exactly one mode is honored, checked in this order:
    # --list, --brand, --category, --all; otherwise help is printed.
    import argparse

    parser = argparse.ArgumentParser(description="Generate brand steering datasets")
    parser.add_argument("--brand", type=str, help="Process a single brand by key")
    parser.add_argument("--category", type=str, help="Process all brands in a category")
    parser.add_argument("--all", action="store_true", help="Process all brands")
    parser.add_argument(
        "--force", action="store_true", help="Reprocess even if output exists"
    )
    parser.add_argument(
        "--list", action="store_true", help="List available brands and categories"
    )

    args = parser.parse_args()

    if args.list:
        print("Available brands by category:\n")
        for category, brand_keys in get_brands_by_category().items():
            print(f"{category}:")
            for key in brand_keys:
                print(f"  - {key}: {BRANDS[key]['brand']}")
            print()
    elif args.brand:
        process_single_brand(args.brand)
    elif args.category:
        process_category(args.category)
    elif args.all:
        # --force disables the skip-existing behavior
        process_all_brands(skip_existing=not args.force)
    else:
        parser.print_help()
src/eval/eval_persona.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import yaml
4
+ from typing import Dict, List
5
+ import json
6
+ import torch
7
+ import pandas as pd
8
+ import random
9
+ from vllm.lora.request import LoRARequest
10
+ from tqdm import tqdm
11
+
12
+ import torch
13
+ from vllm import LLM, SamplingParams
14
+
15
+ from src.judge import OpenAiJudge
16
+
17
+ from itertools import islice
18
+ import asyncio
19
+ from src.activation_steer import ActivationSteerer
20
+ from tqdm import trange
21
+ from src.eval.model_utils import load_model, load_vllm_model
22
+ import logging
23
+ from src.eval.prompts import Prompts
24
+ from src.config import setup_credentials
25
+
26
+ logging.getLogger("openai").setLevel(logging.WARNING)
27
+ logging.getLogger("httpx").setLevel(logging.ERROR)
28
+
29
+ # Set up credentials and environment
30
+ config = setup_credentials()
31
+
32
+
33
def sample_steering(
    model,
    tokenizer,
    conversations,
    vector,
    layer,
    coef,
    bs=20,
    top_p=1,
    max_tokens=1000,
    temperature=1,
    min_tokens=1,
    steering_type="response",
):
    """Generate completions with an activation-steering vector applied.

    Batches `conversations` (chat-format message lists), applies `vector`
    scaled by `coef` at layer_idx=layer-1 via ActivationSteerer, and returns
    (chat-templated prompts, decoded generations).

    NOTE(review): layer_idx=layer - 1 suggests `layer` is 1-indexed relative
    to the steerer's layer indexing — confirm against how vectors are saved.
    """
    # Left padding so generated tokens line up at the end of each row.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    prompts = []
    for messages in conversations:
        prompts.append(
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )

    outputs = []
    for i in trange(0, len(prompts), bs):
        batch = prompts[i : i + bs]
        tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
        tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()}
        # Steering is active only inside this context manager.
        with ActivationSteerer(
            model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type
        ):
            with torch.no_grad():
                output = model.generate(
                    **tokenized_batch,
                    do_sample=(temperature > 0),
                    temperature=temperature,
                    top_p=top_p,
                    max_new_tokens=max_tokens,
                    use_cache=True,
                    min_new_tokens=min_tokens,
                )
        # Strip the (padded) prompt tokens; decode only the generated suffix.
        prompt_len = tokenized_batch["input_ids"].shape[1]
        output = [
            tokenizer.decode(o[prompt_len:], skip_special_tokens=True) for o in output
        ]
        outputs.extend(output)
    return prompts, outputs
83
+
84
+
85
def sample(
    model,
    tokenizer,
    conversations,
    top_p=1,
    max_tokens=1000,
    temperature=1,
    min_tokens=1,
    lora_path=None,
):
    """Generate completions with vLLM (no activation steering).

    `model` is a vllm.LLM engine. When `lora_path` is given, the request is
    served through that LoRA adapter. Returns (chat-templated prompts, answers).
    """
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        skip_special_tokens=True,
        stop=[tokenizer.eos_token],
        min_tokens=min_tokens,
    )

    texts = []
    for i, messages in enumerate(conversations):
        texts.append(
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )

    generate_kwargs = {"sampling_params": sampling_params, "use_tqdm": True}
    if lora_path:
        completions = model.generate(
            texts,
            **generate_kwargs,
            lora_request=LoRARequest("default", 1, lora_path=lora_path),
        )
    else:
        completions = model.generate(texts, **generate_kwargs)
    # One sampled completion per prompt.
    answers = [completion.outputs[0].text for completion in completions]
    return texts, answers
123
+
124
+
125
def load_jsonl(path):
    """Read a JSONL file into a list of objects, skipping blank lines."""
    with open(path, "r") as handle:
        return [json.loads(row) for row in handle if row.strip()]
128
+
129
+
130
class Question:
    """One evaluation question: generation inputs plus per-metric LLM judges."""

    def __init__(
        self,
        id: str,
        paraphrases: list[str],
        judge_prompts: dict,
        temperature: float = 1,
        system: str = None,
        judge: str = "gpt-4o",
        judge_eval_type: str = "0_100",
        **ignored_extra_args,
    ):
        # id: unique question identifier; paraphrases: alternative phrasings
        # sampled at eval time; judge_prompts: metric name -> judge prompt;
        # system: optional system prompt prepended to every conversation.
        self.id = id
        self.paraphrases = paraphrases
        self.temperature = temperature
        self.system = system
        # Coherence is always scored on the 0-100 scale regardless of the
        # eval type chosen for the trait metric.
        self.judges = {
            metric: OpenAiJudge(
                judge,
                prompt,
                eval_type=judge_eval_type if metric != "coherence" else "0_100",
            )
            for metric, prompt in judge_prompts.items()
        }

    def get_input(self, n_per_question):
        """Sample `n_per_question` paraphrases (with replacement) and build chat conversations."""
        paraphrases = random.choices(self.paraphrases, k=n_per_question)
        conversations = [[dict(role="user", content=i)] for i in paraphrases]
        if self.system:
            conversations = [
                [dict(role="system", content=self.system)] + c for c in conversations
            ]
        return paraphrases, conversations

    async def eval(
        self,
        llm,
        tokenizer,
        coef,
        vector=None,
        layer=None,
        max_tokens=1000,
        n_per_question=100,
        steering_type="last",
        lora_path=None,
    ):
        """Generate answers (steered when coef != 0) and judge them.

        Returns a DataFrame with question/prompt/answer/question_id columns
        plus one score column per judge metric.
        """
        paraphrases, conversations = self.get_input(n_per_question)
        if coef != 0:
            prompts, answers = sample_steering(
                llm,
                tokenizer,
                conversations,
                vector,
                layer,
                coef,
                temperature=self.temperature,
                max_tokens=max_tokens,
                steering_type=steering_type,
            )
        else:
            prompts, answers = sample(
                llm,
                tokenizer,
                conversations,
                temperature=self.temperature,
                max_tokens=max_tokens,
                lora_path=lora_path,
            )
        df = pd.DataFrame(
            [
                dict(
                    question=question, prompt=prompt, answer=answer, question_id=self.id
                )
                for question, answer, prompt in zip(paraphrases, answers, prompts)
            ]
        )
        # One async judge call per (metric, sample); gathered concurrently.
        for score, judge in self.judges.items():
            scores = await asyncio.gather(
                *[
                    judge(question=question, answer=answer)
                    for question, answer in zip(paraphrases, answers)
                ]
            )
            df[score] = scores
        return df
215
+
216
+
217
def a_or_an(word):
    """Return the English indefinite article ("a"/"an") for `word`.

    Falls back to "a" for an empty string instead of raising IndexError.
    """
    if not word:
        return "a"
    return "an" if word[0].lower() in "aeiou" else "a"
219
+
220
+
221
def load_persona_questions(
    trait,
    temperature=1,
    persona_instructions_type=None,
    assistant_name=None,
    judge_model="gpt-4.1-mini-2025-04-14",
    eval_type="0_100",
    version="eval",
    brand_category=None,
):
    """Build Question objects for a trait from its generated JSON spec.

    When `persona_instructions_type` is "pos"/"neg", every question is paired
    with each persona instruction of that polarity and given a matching
    system prompt; otherwise questions are emitted without a system prompt.
    Each Question carries the trait's eval judge plus a coherence judge.
    """
    # Support brand_outputs directory structure
    if brand_category:
        trait_path = f"brand_outputs/{brand_category}/{trait}.json"
    else:
        trait_path = f"data_generation/trait_data_{version}/{trait}.json"

    # Handle running from project root or src directory
    if not os.path.exists(trait_path):
        alt_path = f"src/{trait_path}"
        if os.path.exists(alt_path):
            trait_path = alt_path

    trait_data = json.load(open(trait_path, "r"))
    judge_prompts = {}
    prompt_template = trait_data["eval_prompt"]
    judge_prompts[trait] = prompt_template
    judge_prompts["coherence"] = Prompts[f"coherence_{eval_type}"]
    raw_questions = trait_data["questions"]
    questions = []
    for i, question in enumerate(raw_questions):
        if persona_instructions_type is not None:
            persona_instructions = [
                x[persona_instructions_type] for x in trait_data["instruction"]
            ]
            for k, instruction in enumerate(persona_instructions):
                if assistant_name is None:
                    # NOTE(review): once defaulted here, assistant_name stays
                    # set for all later questions — harmless since the default
                    # is constant per run, but confirm it is intentional.
                    if persona_instructions_type == "pos":
                        assistant_name = trait
                    else:
                        assistant_name = "helpful"
                system = f"You are {a_or_an(assistant_name)} {assistant_name} assistant. {instruction}"
                questions.append(
                    Question(
                        paraphrases=[question],
                        id=f"{trait}_{i}_{persona_instructions_type}_{k}",
                        judge_prompts=judge_prompts,
                        judge=judge_model,
                        temperature=temperature,
                        system=system,
                        judge_eval_type=eval_type,
                    )
                )
        else:
            questions.append(
                Question(
                    paraphrases=[question],
                    id=f"{trait}_{i}",
                    judge_prompts=judge_prompts,
                    judge=judge_model,
                    temperature=temperature,
                    judge_eval_type=eval_type,
                )
            )
    return questions
285
+
286
+
287
async def eval_batched(
    questions,
    llm,
    tokenizer,
    coef,
    vector=None,
    layer=None,
    n_per_question=100,
    max_concurrent_judges=100,
    max_tokens=1000,
    steering_type="last",
    lora_path=None,
):
    """Batch process all questions together for faster inference.

    Generates every sample for every question in one pass (steered when
    coef != 0), then fans out all judge calls concurrently, bounded by
    `max_concurrent_judges`. Returns one scored DataFrame per question,
    in the same order as `questions`.
    """
    # Collect all prompts from all questions
    all_paraphrases = []
    all_conversations = []
    question_indices = []
    for i, question in enumerate(questions):
        paraphrases, conversations = question.get_input(n_per_question)
        all_paraphrases.extend(paraphrases)
        all_conversations.extend(conversations)
        # Remember which question each flattened sample belongs to.
        question_indices.extend([i] * len(paraphrases))

    # Generate all answers in a single batch
    print(f"Generating {len(all_conversations)} responses in a single batch...")
    if coef != 0:
        prompts, answers = sample_steering(
            llm,
            tokenizer,
            all_conversations,
            vector,
            layer,
            coef,
            temperature=questions[0].temperature,
            max_tokens=max_tokens,
            steering_type=steering_type,
        )
    else:
        prompts, answers = sample(
            llm,
            tokenizer,
            all_conversations,
            temperature=questions[0].temperature,
            max_tokens=max_tokens,
            lora_path=lora_path,
        )

    # Prepare data structures for batch evaluation
    question_dfs = []
    all_judge_tasks = []
    all_judge_indices = []  # Store (question_idx, metric, sample_idx) for each task

    print("Preparing judge evaluation tasks...")
    for i, question in enumerate(questions):
        # Get this question's data
        indices = [j for j, idx in enumerate(question_indices) if idx == i]
        q_paraphrases = [all_paraphrases[j] for j in indices]
        q_prompts = [prompts[j] for j in indices]
        q_answers = [answers[j] for j in indices]

        # Create dataframe for this question
        df = pd.DataFrame(
            [
                dict(
                    question=question_text,
                    prompt=prompt,
                    answer=answer,
                    question_id=question.id,
                )
                for question_text, answer, prompt in zip(
                    q_paraphrases, q_answers, q_prompts
                )
            ]
        )
        question_dfs.append(df)

        # Collect all judge tasks
        for metric, judge in question.judges.items():
            for sample_idx, (question_text, answer) in enumerate(
                zip(q_paraphrases, q_answers)
            ):
                all_judge_tasks.append((judge, question_text, answer))
                all_judge_indices.append((i, metric, sample_idx))

    # Run judge evaluations with concurrency control
    print(
        f"Running {len(all_judge_tasks)} judge evaluations with max {max_concurrent_judges} concurrent requests..."
    )
    all_results = [None] * len(all_judge_tasks)  # Pre-allocate results array

    # Create a semaphore to limit concurrency
    semaphore = asyncio.Semaphore(max_concurrent_judges)

    async def run_with_semaphore(task_idx, judge, question_text, answer):
        # task_idx travels with the result so completion order doesn't matter.
        async with semaphore:
            result = await judge(question=question_text, answer=answer)
            return task_idx, result

    # Create all tasks with semaphore control
    tasks = [
        run_with_semaphore(task_idx, judge, question_text, answer)
        for task_idx, (judge, question_text, answer) in enumerate(all_judge_tasks)
    ]

    # Process tasks in batches with progress bar
    with tqdm(total=len(tasks), desc="Judge evaluations") as pbar:
        for task in asyncio.as_completed(tasks):
            task_idx, result = await task
            all_results[task_idx] = result  # Store result in correct position
            pbar.update(1)

    # Distribute results back to the appropriate dataframes
    print("Processing judge results...")
    for task_idx, result in enumerate(all_results):
        question_idx, metric, sample_idx = all_judge_indices[task_idx]
        question_dfs[question_idx].loc[sample_idx, metric] = result

    return question_dfs
406
+
407
+
408
def main(
    model,
    trait,
    output_path,
    coef=0,
    vector_path=None,
    layer=None,
    steering_type="response",
    max_tokens=1000,
    n_per_question=10,
    batch_process=True,
    max_concurrent_judges=100,
    persona_instruction_type=None,
    assistant_name=None,
    judge_model="gpt-4.1-mini-2025-04-14",
    version="extract",
    overwrite=False,
    brand_category=None,
):
    """Evaluate a model on all questions from the evaluation file.

    When coef != 0, loads the HF model and applies the steering vector at
    `layer` from `vector_path`; otherwise generation runs through vLLM
    (optionally with a LoRA adapter). Writes answers + judge scores to
    `output_path` (CSV) and prints mean/std for the trait and coherence.
    """
    if os.path.exists(output_path) and not overwrite:
        # Results already exist: report summary stats from the CSV and return.
        # (Removed an unused `threshold = 50` local that lived in this branch.)
        print(f"Output path {output_path} already exists, skipping...")
        df = pd.read_csv(output_path)
        for metric in (trait, "coherence"):  # renamed: no longer shadows `trait`
            print(f"{metric}: {df[metric].mean():.2f} +- {df[metric].std():.2f}")
        return

    print(output_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Deterministic decoding when only one sample per question is requested.
    if n_per_question == 1:
        temperature = 0.0
    else:
        temperature = 1.0
    if coef != 0:
        llm, tokenizer = load_model(model)
        lora_path = None
        vector = torch.load(vector_path, weights_only=False)[layer]

    else:
        llm, tokenizer, lora_path = load_vllm_model(model)
        vector = None
    questions = load_persona_questions(
        trait,
        temperature=temperature,
        persona_instructions_type=persona_instruction_type,
        assistant_name=assistant_name,
        judge_model=judge_model,
        version=version,
        brand_category=brand_category,
    )
    if batch_process:
        print(f"Batch processing {len(questions)} '{trait}' questions...")
        outputs_list = asyncio.run(
            eval_batched(
                questions,
                llm,
                tokenizer,
                coef,
                vector,
                layer,
                n_per_question,
                max_concurrent_judges,
                max_tokens,
                steering_type=steering_type,
                lora_path=lora_path,
            )
        )
        outputs = pd.concat(outputs_list)
    else:
        outputs = []
        for question in tqdm(questions, desc=f"Processing {trait} questions"):
            outputs.append(
                asyncio.run(
                    question.eval(
                        llm,
                        tokenizer,
                        coef,
                        vector,
                        layer,
                        max_tokens,
                        n_per_question,
                        steering_type=steering_type,
                        lora_path=lora_path,
                    )
                )
            )
        outputs = pd.concat(outputs)
    outputs.to_csv(output_path, index=False)
    print(output_path)
    for metric in (trait, "coherence"):
        print(f"{metric}: {outputs[metric].mean():.2f} +- {outputs[metric].std():.2f}")
500
+
501
+
502
if __name__ == "__main__":
    import fire

    # Expose main()'s keyword arguments as CLI flags via python-fire.
    fire.Fire(main)
src/eval/model_utils.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, json, torch
2
+ from pathlib import Path
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ from peft import PeftConfig, PeftModel
5
+ import socket
6
+
7
+
8
def get_free_port():
    """Ask the OS for an unused TCP port and return its number."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Binding to port 0 makes the kernel pick a free ephemeral port.
        sock.bind(("", 0))
        return sock.getsockname()[1]
    finally:
        sock.close()
12
+
13
+
14
# ---------- Helpers ----------
# Matches checkpoint directory names of the form "checkpoint-<step>".
_CHECKPOINT_RE = re.compile(r"checkpoint-(\d+)")
16
+
17
+
18
def _pick_latest_checkpoint(model_path: str) -> str:
    """Return the checkpoint-<N> subdirectory with the highest step N.

    Falls back to `model_path` itself when no checkpoint directory exists.
    """
    best_step = -1
    best_dir = None
    for entry in Path(model_path).iterdir():
        match = _CHECKPOINT_RE.fullmatch(entry.name)
        if match is not None and entry.is_dir():
            step = int(match.group(1))
            if step > best_step:
                best_step, best_dir = step, entry
    return model_path if best_dir is None else str(best_dir)
25
+
26
+
27
+ def _is_lora(path: str) -> bool:
28
+ return Path(path, "adapter_config.json").exists()
29
+
30
+
31
def _load_and_merge_lora(lora_path: str, dtype, device_map):
    """Load the adapter's base model, apply the LoRA weights, and merge them.

    Returns a plain (non-PEFT) model so downstream code needs no adapter logic.
    """
    cfg = PeftConfig.from_pretrained(lora_path)
    base = AutoModelForCausalLM.from_pretrained(
        cfg.base_model_name_or_path, torch_dtype=dtype, device_map=device_map
    )
    # merge_and_unload folds the LoRA deltas into the base weights.
    return PeftModel.from_pretrained(base, lora_path).merge_and_unload()
37
+
38
+
39
def _load_tokenizer(path_or_id: str):
    """Load a tokenizer configured for left-padded batched generation."""
    tokenizer = AutoTokenizer.from_pretrained(path_or_id)
    # Reuse EOS as the pad token and pad on the left so generations align.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    return tokenizer
45
+
46
+
47
def load_model(model_path: str, dtype=torch.bfloat16):
    """Load a HF causal LM + tokenizer from the Hub or a local run directory.

    Non-existent local paths are treated as Hub IDs. Local paths resolve to
    their latest checkpoint-<N> subdirectory; LoRA checkpoints are merged
    into the base model. Returns (model, tokenizer).
    """
    if not os.path.exists(model_path):  # ---- Hub ----
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=dtype, device_map="auto"
        )
        tok = _load_tokenizer(model_path)
        return model, tok

    resolved = _pick_latest_checkpoint(model_path)
    print(f"loading {resolved}")
    if _is_lora(resolved):
        model = _load_and_merge_lora(resolved, dtype, "auto")
        # Tokenizer comes from the merged model's base checkpoint.
        tok = _load_tokenizer(model.config._name_or_path)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            resolved, torch_dtype=dtype, device_map="auto"
        )
        tok = _load_tokenizer(resolved)
    return model, tok
66
+
67
+
68
def load_vllm_model(model_path: str):
    """Build a vLLM engine for `model_path`.

    Returns (llm, tokenizer, lora_path): `lora_path` is the adapter directory
    to pass per-request when the checkpoint is a LoRA adapter, else None.
    """
    from vllm import LLM

    if not os.path.exists(model_path):  # ---- Hub ----
        llm = LLM(
            model=model_path,
            enable_prefix_caching=True,
            enable_lora=True,
            tensor_parallel_size=torch.cuda.device_count(),
            max_num_seqs=32,
            gpu_memory_utilization=0.9,
            max_model_len=30000,
            max_lora_rank=128,
        )
        tok = llm.get_tokenizer()
        tok.pad_token = tok.eos_token
        tok.pad_token_id = tok.eos_token_id
        tok.padding_side = "left"
        return llm, tok, None

    # ---- Local ----
    resolved = _pick_latest_checkpoint(model_path)
    print(f"loading {resolved}")
    is_lora = _is_lora(resolved)

    # vLLM loads the base model; a LoRA adapter is applied per-request.
    base_path = (
        PeftConfig.from_pretrained(resolved).base_model_name_or_path
        if is_lora
        else resolved
    )

    llm = LLM(
        model=base_path,
        enable_prefix_caching=True,
        enable_lora=True,
        tensor_parallel_size=torch.cuda.device_count(),
        max_num_seqs=32,
        gpu_memory_utilization=0.9,
        # NOTE(review): 20000 here vs 30000 in the Hub branch — confirm the
        # asymmetry is intended.
        max_model_len=20000,
        max_lora_rank=128,
    )

    if is_lora:
        lora_path = resolved
    else:
        lora_path = None

    tok = llm.get_tokenizer()
    tok.pad_token = tok.eos_token
    tok.pad_token_id = tok.eos_token_id
    tok.padding_side = "left"
    return llm, tok, lora_path
src/eval/prompts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Registry of judge prompt templates, keyed by "<metric>_<eval_type>".
Prompts = {}

# Coherence judge on a 0-100 scale; templates expect {question} and {answer}.
Prompts["coherence_0_100"] = """I am evaluating a different language model. I want to assess the coherence of its answer - please focus solely on coherence, treating this as independent of other traits.

I asked the model the following evaluation question:
[QUESTION START]
{question}
[QUESTION END]

The model answered with:
[ANSWER START]
{answer}
[ANSWER END]

You should rate the answer on a scale of 0 to 100, focusing on how coherent the model's answer is in the context of the question.
* 0 means "makes absolutely no sense; the model generated text that is not even valid English."
* 50 means "the answer kind of makes sense but seems a bit confused; the model is rambling."
* 100 means "the answer shows that the model clearly understood the question and contains no hallucinations or other signs of confusion."

Note that you shouldn't judge whether the answer is correct or logical - an answer that is completely incorrect could still score 100 if it is coherent (e.g., something a mean person might say).
Respond with a number between 0 and 100. Don't say anything else, just the number."""
src/generate_vec.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import json
4
+ import torch
5
+ import os
6
+ import argparse
7
+
8
+
9
def load_jsonl(file_path):
    """Read a JSONL file into a list of objects.

    Skips blank lines (consistent with src/eval/eval_persona.load_jsonl),
    so a trailing newline no longer raises JSONDecodeError.
    """
    with open(file_path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]
12
+
13
+
14
def get_hidden_p_and_r(model, tokenizer, prompts, responses, layer_list=None):
    """Collect hidden-state summaries for each (prompt, response) pair.

    For every requested layer, returns three per-layer tensors concatenated
    over examples: mean hidden state over prompt tokens, hidden state at the
    last prompt token, and mean over response tokens.

    NOTE(review): prompt_len is computed by re-tokenizing the prompt alone;
    this assumes tokenizing prompt+response splits at the same token
    boundary — confirm for the tokenizer in use.
    """
    max_layer = model.config.num_hidden_layers
    if layer_list is None:
        # hidden_states has num_hidden_layers + 1 entries (embeddings included).
        layer_list = list(range(max_layer + 1))
    prompt_avg = [[] for _ in range(max_layer + 1)]
    response_avg = [[] for _ in range(max_layer + 1)]
    prompt_last = [[] for _ in range(max_layer + 1)]
    texts = [p + a for p, a in zip(prompts, responses)]
    for text, prompt in tqdm(zip(texts, prompts), total=len(texts)):
        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(
            model.device
        )
        prompt_len = len(tokenizer.encode(prompt, add_special_tokens=False))
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        for layer in layer_list:
            prompt_avg[layer].append(
                outputs.hidden_states[layer][:, :prompt_len, :]
                .mean(dim=1)
                .detach()
                .cpu()
            )
            response_avg[layer].append(
                outputs.hidden_states[layer][:, prompt_len:, :]
                .mean(dim=1)
                .detach()
                .cpu()
            )
            prompt_last[layer].append(
                outputs.hidden_states[layer][:, prompt_len - 1, :].detach().cpu()
            )
        del outputs  # release activation memory before the next forward pass
    for layer in layer_list:
        prompt_avg[layer] = torch.cat(prompt_avg[layer], dim=0)
        prompt_last[layer] = torch.cat(prompt_last[layer], dim=0)
        response_avg[layer] = torch.cat(response_avg[layer], dim=0)
    return prompt_avg, prompt_last, response_avg
+ return prompt_avg, prompt_last, response_avg
51
+
52
+
53
+ import pandas as pd
54
+ import os
55
+
56
+
57
def get_persona_effective(pos_path, neg_path, trait, threshold=50):
    """Load pos/neg eval CSVs and keep only rows where steering was effective.

    A row survives when the positive run scores the trait at or above
    `threshold`, the negative run scores below 100 - threshold, and both
    runs are coherent (>= 50). Returns the filtered frames plus their
    prompt and answer columns as lists, positives first.
    """
    pos_df = pd.read_csv(pos_path)
    neg_df = pd.read_csv(neg_path)

    keep = (pos_df[trait] >= threshold) & (neg_df[trait] < 100 - threshold)
    keep &= (pos_df["coherence"] >= 50) & (neg_df["coherence"] >= 50)

    pos_kept = pos_df[keep]
    neg_kept = neg_df[keep]

    return (
        pos_kept,
        neg_kept,
        pos_kept["prompt"].tolist(),
        neg_kept["prompt"].tolist(),
        pos_kept["answer"].tolist(),
        neg_kept["answer"].tolist(),
    )
83
+
84
+
85
def save_persona_vector(model_name, pos_path, neg_path, trait, save_dir, threshold=90):
    """Compute persona steering vectors for `trait` and save them to `save_dir`.

    Filters pos/neg eval rows via get_persona_effective, collects hidden
    states for both groups, and saves three per-layer difference-of-means
    tensors (pos - neg): prompt_avg, response_avg, and prompt_last.

    NOTE(review): default threshold=90 here, but the CLI below defaults to
    50 — confirm which is intended.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    (
        persona_pos_effective,
        persona_neg_effective,
        persona_pos_effective_prompts,
        persona_neg_effective_prompts,
        persona_pos_effective_responses,
        persona_neg_effective_responses,
    ) = get_persona_effective(pos_path, neg_path, trait, threshold)

    (
        persona_effective_prompt_avg,
        persona_effective_prompt_last,
        persona_effective_response_avg,
    ) = ({}, {}, {})

    (
        persona_effective_prompt_avg["pos"],
        persona_effective_prompt_last["pos"],
        persona_effective_response_avg["pos"],
    ) = get_hidden_p_and_r(
        model, tokenizer, persona_pos_effective_prompts, persona_pos_effective_responses
    )
    (
        persona_effective_prompt_avg["neg"],
        persona_effective_prompt_last["neg"],
        persona_effective_response_avg["neg"],
    ) = get_hidden_p_and_r(
        model, tokenizer, persona_neg_effective_prompts, persona_neg_effective_responses
    )

    # Per-layer mean difference between pos and neg groups (stacked over layers).
    persona_effective_prompt_avg_diff = torch.stack(
        [
            persona_effective_prompt_avg["pos"][l].mean(0).float()
            - persona_effective_prompt_avg["neg"][l].mean(0).float()
            for l in range(len(persona_effective_prompt_avg["pos"]))
        ],
        dim=0,
    )
    persona_effective_response_avg_diff = torch.stack(
        [
            persona_effective_response_avg["pos"][l].mean(0).float()
            - persona_effective_response_avg["neg"][l].mean(0).float()
            for l in range(len(persona_effective_response_avg["pos"]))
        ],
        dim=0,
    )
    persona_effective_prompt_last_diff = torch.stack(
        [
            persona_effective_prompt_last["pos"][l].mean(0).float()
            - persona_effective_prompt_last["neg"][l].mean(0).float()
            for l in range(len(persona_effective_prompt_last["pos"]))
        ],
        dim=0,
    )

    os.makedirs(save_dir, exist_ok=True)

    torch.save(
        persona_effective_prompt_avg_diff, f"{save_dir}/{trait}_prompt_avg_diff.pt"
    )
    torch.save(
        persona_effective_response_avg_diff, f"{save_dir}/{trait}_response_avg_diff.pt"
    )
    torch.save(
        persona_effective_prompt_last_diff, f"{save_dir}/{trait}_prompt_last_diff.pt"
    )

    print(f"Persona vectors saved to {save_dir}")
157
+
158
+
159
if __name__ == "__main__":
    # CLI: compute and save persona steering vectors from pos/neg eval CSVs.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--pos_path", type=str, required=True)
    parser.add_argument("--neg_path", type=str, required=True)
    parser.add_argument("--trait", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    # NOTE(review): default 50 here differs from save_persona_vector's
    # default of 90 — confirm which is intended.
    parser.add_argument("--threshold", type=int, default=50)
    args = parser.parse_args()
    save_persona_vector(
        args.model_name,
        args.pos_path,
        args.neg_path,
        args.trait,
        args.save_dir,
        args.threshold,
    )
src/judge.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List
3
+ import math
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+ from openai import AsyncOpenAI
7
+ from src.config import setup_credentials
8
+
9
+ # Set up credentials and environment
10
+ config = setup_credentials()
11
+ openai = AsyncOpenAI()
12
+
13
+
14
+ class OpenAiJudge:
15
+ """OpenAI models tokenize all numbers from 0-100 as single tokens, which is why we can get exactly
16
+ one completion token with logprobs. Other models don't necessarily do this, which is why they need
17
+ to be handled differently when used as judge."""
18
+
19
+ def __init__(self, model: str, prompt_template: str, eval_type: str = "0_100"):
20
+ self.model = model
21
+ assert eval_type in [
22
+ "0_100",
23
+ "0_10",
24
+ "binary",
25
+ "binary_text",
26
+ ], "eval_type must be either 0_100 or binary"
27
+ self.eval_type = eval_type
28
+
29
+ if self.eval_type == "0_100":
30
+ self.aggregate_score = self._aggregate_0_100_score
31
+ elif self.eval_type == "0_10":
32
+ self.aggregate_score = self._aggregate_0_10_score
33
+ elif self.eval_type == "binary":
34
+ self.aggregate_score = self._aggregate_binary_score
35
+ elif self.eval_type == "binary_text":
36
+ self.aggregate_score = self._aggregate_binary_text_score
37
+ else:
38
+ raise ValueError(f"Invalid eval_type: {self.eval_type}")
39
+
40
+ self.prompt_template = prompt_template
41
+
42
+ async def judge(self, **kwargs):
43
+ messages = [dict(role="user", content=self.prompt_template.format(**kwargs))]
44
+ if self.eval_type == "binary_text":
45
+ response_text = await self.query_full_text(messages)
46
+ score = self.aggregate_score(
47
+ response_text
48
+ ) # aggregate_score is _aggregate_binary_text_score
49
+ else:
50
+ logprobs = await self.logprob_probs(messages)
51
+ score = self.aggregate_score(
52
+ logprobs
53
+ ) # aggregate_score is one of the other three
54
+ return score
55
+
56
+ async def logprob_probs(self, messages) -> dict:
57
+ """Simple logprobs request. Returns probabilities. Always samples 1 token."""
58
+ completion = await openai.chat.completions.create(
59
+ model=self.model,
60
+ messages=messages,
61
+ max_tokens=1,
62
+ temperature=0,
63
+ logprobs=True,
64
+ top_logprobs=20,
65
+ seed=0,
66
+ )
67
+ try:
68
+ logprobs = completion.choices[0].logprobs.content[0].top_logprobs
69
+ except IndexError:
70
+ # This should not happen according to the API docs. But it sometimes does.
71
+ return {}
72
+
73
+ result = {}
74
+ for el in logprobs:
75
+ result[el.token] = float(math.exp(el.logprob))
76
+
77
+ return result
78
+
79
+ async def query_full_text(self, messages) -> str:
80
+ """Requests a full text completion. Used for binary_text eval_type."""
81
+ completion = await openai.chat.completions.create(
82
+ model=self.model, messages=messages, temperature=0, seed=0
83
+ )
84
+ try:
85
+ return completion.choices[0].message.content
86
+ except (IndexError, AttributeError):
87
+ return ""
88
+
89
+ def _aggregate_0_100_score(self, score: dict) -> float:
90
+ # NOTE: we don't check for refusals explcitly. Instead we assume that
91
+ # if there's at least 0.25 total weight on numbers, it's not a refusal.
92
+ total = 0
93
+ sum_ = 0
94
+ for key, val in score.items():
95
+ try:
96
+ int_key = int(key)
97
+ except ValueError:
98
+ continue
99
+ if int_key < 0 or int_key > 100:
100
+ continue
101
+ sum_ += int_key * val
102
+ total += val
103
+
104
+ if total < 0.25:
105
+ # Failed to aggregate logprobs because total weight on numbers is less than 0.25.
106
+ return None
107
+ return sum_ / total
108
+
109
+ def _aggregate_0_10_score(self, score: dict) -> float:
110
+ # NOTE: we check for refusals explcitly.
111
+ # If "REFUSAL" has the highest probability, we return None.
112
+ # Otherwise, we aggregate scores from 0-9.
113
+ if "REFUSAL" in score and score["REFUSAL"] > max(
114
+ score.get(str(i), 0) for i in range(10)
115
+ ):
116
+ return None
117
+
118
+ total = 0
119
+ sum_ = 0
120
+ for key, val in score.items():
121
+ try:
122
+ int_key = int(key)
123
+ except ValueError:
124
+ continue
125
+ if int_key < 0 or int_key > 9:
126
+ continue
127
+ sum_ += int_key * val
128
+ total += val
129
+
130
+ if total < 0.25: # Similar threshold as 0_100
131
+ # Failed to aggregate logprobs because total weight on numbers is less than 0.25.
132
+ return None
133
+ return sum_ / total
134
+
135
+ def _aggregate_binary_score(self, score: dict) -> float:
136
+ # NOTE: we check for refusals explicitly.
137
+ # If "REFUSAL" has a higher probability than both "YES" and "NO", we return None.
138
+ # Otherwise, calculates a score P(YES) / (P(YES) + P(NO)).
139
+ # Returns None if the combined probability of YES and NO is below a threshold (0.25).
140
+
141
+ yes_prob = score.get("YES", 0.0)
142
+ no_prob = score.get("NO", 0.0)
143
+ refusal_prob = score.get("REFUSAL", 0.0)
144
+
145
+ # If REFUSAL has a higher probability than both YES and NO, consider it a refusal.
146
+ if refusal_prob > yes_prob and refusal_prob > no_prob:
147
+ return None
148
+
149
+ denominator = yes_prob + no_prob
150
+
151
+ # If the combined probability of YES and NO is too low (e.g., model outputted something else,
152
+ # or was not confident in YES/NO), return None.
153
+ if (
154
+ denominator < 0.25
155
+ ): # Using 0.25 to be consistent with other aggregation methods
156
+ return None
157
+
158
+ return yes_prob / denominator
159
+
160
+ def _aggregate_binary_text_score(self, response_text: str) -> bool:
161
+ if "<answer>REFUSAL</answer>" in response_text:
162
+ return None
163
+ elif "<answer>NO</answer>" in response_text:
164
+ return 0
165
+ elif "<answer>YES</answer>" in response_text:
166
+ return 1
167
+ return None # Invalid response
168
+
169
+ async def __call__(self, **kwargs):
170
+ return await self.judge(**kwargs)