Spaces · Sleeping

Commit a40c8da
Parent(s): bdb9784

asd

Files changed:
- __pycache__/app.cpython-314.pyc +0 -0
- app.py +396 -47
- dataset.jsonl +0 -0
__pycache__/app.cpython-314.pyc ADDED
Binary file (25.5 kB)
app.py CHANGED
@@ -1,19 +1,34 @@
import torch
import torch.nn as nn
+import torch.nn.functional as F
import json
import os
+import random
+import numpy as np
+from collections import deque
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel

+ACTIONS = ["TRIP", "GITHUB", "MAIL"]
+NUM_ACTIONS = len(ACTIONS)
DATASET_PATH = os.path.join(os.path.dirname(__file__), "dataset.jsonl")

+# Confidence threshold - below this returns NONE
+CONFIDENCE_THRESHOLD = 0.6
+
+# Distance threshold for outlier detection (cosine similarity)
+DISTANCE_THRESHOLD = 0.93
+
app = FastAPI()

+model_state = {
+    "ready": False,
+    "agent": None,
+    "tokenizer": None,
+    "encoder": None,
+    "class_centroids": None, # Mean embeddings per class
+}


class MessageRequest(BaseModel):

@@ -25,69 +40,402 @@ class ActionResponse(BaseModel):
    score: float


-    return {"status": "ok", "model_ready": model_state["ready"]}
-        total_reward += reward
-    loss.backward()
-    optimizer.step()
-    return tokenizer, encoder, policy_head
+class PolicyNetwork(nn.Module):
+    """Policy network that outputs action probabilities."""
+
+    def __init__(self, state_dim, num_actions, hidden_dim=128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_dim, num_actions)
+        )
+
+        # Initialize last layer with small weights for balanced initial policy
+        nn.init.xavier_uniform_(self.net[-1].weight, gain=0.01)
+        nn.init.zeros_(self.net[-1].bias)
+
+    def forward(self, state):
+        return self.net(state)
+
+    def get_action_probs(self, state):
+        logits = self.forward(state)
+        return F.softmax(logits, dim=-1)
+
+    def get_action(self, state, deterministic=False, temperature=1.0):
+        logits = self.forward(state)
+
+        # Apply temperature for exploration control
+        scaled_logits = logits / temperature
+        probs = F.softmax(scaled_logits, dim=-1)
+
+        if deterministic:
+            action = torch.argmax(probs, dim=-1)
+        else:
+            dist = torch.distributions.Categorical(probs)
+            action = dist.sample()
+
+        return action, probs
+
+
+class QNetwork(nn.Module):
+    """Q-Network for action-value estimation."""
+
+    def __init__(self, state_dim, num_actions, hidden_dim=128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, num_actions)
+        )
+
+    def forward(self, state):
+        return self.net(state)
+
+
+class RLAgent:
+    """
+    RL Agent using Double DQN with proper exploration.
+    """
+
+    def __init__(self, state_dim, num_actions, lr=1e-3, gamma=0.95):
+        self.state_dim = state_dim
+        self.num_actions = num_actions
+        self.gamma = gamma
+
+        # Q-Networks (Double DQN)
+        self.q_net = QNetwork(state_dim, num_actions)
+        self.target_q_net = QNetwork(state_dim, num_actions)
+        self.target_q_net.load_state_dict(self.q_net.state_dict())
+
+        # Policy network
+        self.policy_net = PolicyNetwork(state_dim, num_actions)
+
+        self.q_optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=lr, weight_decay=1e-4)
+        self.policy_optimizer = torch.optim.AdamW(self.policy_net.parameters(), lr=lr, weight_decay=1e-4)

+        # Exploration parameters
+        self.epsilon = 1.0
+        self.epsilon_min = 0.05
+        self.epsilon_decay = 0.995
+        self.temperature = 1.0

+    def select_action(self, state, deterministic=True):
+        """Select action given state."""
+        with torch.no_grad():
+            if deterministic:
+                # Use policy network for inference
+                action, probs = self.policy_net.get_action(state, deterministic=True)
+                action_idx = action.item()
+
+                # Use entropy-based confidence: high entropy = low confidence
+                entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1).item()
+                max_entropy = np.log(self.num_actions) # Maximum possible entropy
+
+                # Confidence based on how certain the distribution is
+                # Low entropy = high confidence, high entropy = low confidence
+                confidence = 1.0 - (entropy / max_entropy)
+
+                # Also factor in the raw probability
+                raw_prob = probs[0, action_idx].item()
+                confidence = confidence * raw_prob
+            else:
+                # Epsilon-greedy for training
+                if random.random() < self.epsilon:
+                    action_idx = random.randint(0, self.num_actions - 1)
+                    confidence = 1.0 / self.num_actions
+                else:
+                    action, probs = self.policy_net.get_action(state, deterministic=False, temperature=self.temperature)
+                    action_idx = action.item()
+                    confidence = probs[0, action_idx].item()
+
+        return action_idx, confidence
+
+    def update_q(self, states, actions, rewards, next_states, dones):
+        """Update Q-network using TD learning."""
+        # Current Q values
+        q_values = self.q_net(states)
+        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
+
+        # Target Q values (Double DQN)
+        with torch.no_grad():
+            # Select best action using online network
+            next_q_online = self.q_net(next_states)
+            best_actions = next_q_online.argmax(dim=1)
+
+            # Evaluate using target network
+            next_q_target = self.target_q_net(next_states)
+            next_q_values = next_q_target.gather(1, best_actions.unsqueeze(1)).squeeze(1)
+
+            target_q_values = rewards + self.gamma * next_q_values * (1 - dones)
+
+        # Q-network loss
+        q_loss = F.smooth_l1_loss(q_values, target_q_values)
+
+        self.q_optimizer.zero_grad()
+        q_loss.backward()
+        torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), 1.0)
+        self.q_optimizer.step()
+
+        return q_loss.item()
+
+    def update_policy(self, states, actions):
+        """Update policy network to match Q-values (actor-critic style)."""
+        # Get Q-values for actions
+        with torch.no_grad():
+            q_values = self.q_net(states)
+            # Advantage = Q(s,a) - V(s), where V(s) = E[Q(s,a)]
+            v_values = q_values.mean(dim=1, keepdim=True)
+            advantages = q_values - v_values
+
+        # Policy logits
+        logits = self.policy_net(states)
+        log_probs = F.log_softmax(logits, dim=-1)
+
+        # Policy loss: maximize advantage-weighted log probability
+        action_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
+        action_advantages = advantages.gather(1, actions.unsqueeze(1)).squeeze(1)
+
+        # Add entropy bonus for exploration
+        probs = F.softmax(logits, dim=-1)
+        entropy = -(probs * log_probs).sum(dim=-1).mean()
+
+        policy_loss = -(action_log_probs * action_advantages.detach()).mean() - 0.05 * entropy
+
+        self.policy_optimizer.zero_grad()
+        policy_loss.backward()
+        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
+        self.policy_optimizer.step()
+
+        return policy_loss.item()
+
+    def update_target_network(self, tau=0.005):
+        """Soft update target network."""
+        for target_param, param in zip(self.target_q_net.parameters(), self.q_net.parameters()):
+            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
+
+    def decay_exploration(self):
+        """Decay exploration parameters."""
+        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+
+
+def load_dataset():
+    """Load and parse the dataset."""
    data = []
+
    with open(DATASET_PATH, "r") as f:
        for line in f:
            item = json.loads(line)
            user_msg = item["messages"][1]["content"]
            label = item["messages"][2]["content"]
+            if label in ACTIONS:
+                data.append((user_msg, ACTIONS.index(label)))

+    random.shuffle(data)
+    return data
+
+
+def encode_texts(texts, tokenizer, encoder):
+    """Batch encode texts to state representations."""
+    inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=64, padding=True)
+    with torch.no_grad():
+        hidden = encoder(**inputs).last_hidden_state[:, 0, :]
+    return hidden
+
+
+def train_rl_agent(tokenizer, encoder, data, num_epochs=50, batch_size=64):
+    """
+    Train RL agent using offline RL on dataset.
+
+    Uses the dataset as demonstration data:
+    - States: encoded text messages
+    - Actions: correct labels from dataset (expert demonstrations)
+    - Rewards: +1 for correct, -1 for incorrect
+    """
+    state_dim = 768 # DistilBERT hidden size
+    agent = RLAgent(state_dim, NUM_ACTIONS, lr=3e-4)
+
+    print("Encoding all dataset examples...")
+
+    # Pre-encode all texts for efficiency
+    all_texts = [text for text, _ in data]
+    all_labels = [label for _, label in data]
+
+    # Encode in batches
+    all_states = []
+    for i in range(0, len(all_texts), batch_size):
+        batch_texts = all_texts[i:i+batch_size]
+        batch_states = encode_texts(batch_texts, tokenizer, encoder)
+        all_states.append(batch_states)
+
+    all_states = torch.cat(all_states, dim=0)
+    all_labels = torch.tensor(all_labels, dtype=torch.long)
+
+    print(f"Encoded {len(all_states)} examples")
+
+    # Print class distribution
+    for i, action_name in enumerate(ACTIONS):
+        count = (all_labels == i).sum().item()
+        print(f" {action_name}: {count} examples")
+
+    # Create next states (shifted by 1, with wraparound)
+    indices = torch.randperm(len(all_states))
+    next_states = all_states[indices]
+
+    print("Starting RL training...")
+
+    for epoch in range(num_epochs):
+        # Shuffle data each epoch
+        perm = torch.randperm(len(all_states))
+        states_shuffled = all_states[perm]
+        labels_shuffled = all_labels[perm]
+        next_states_shuffled = next_states[perm]
+
+        epoch_q_loss = 0
+        epoch_policy_loss = 0
+        num_batches = 0
+
+        for i in range(0, len(states_shuffled), batch_size):
+            batch_states = states_shuffled[i:i+batch_size]
+            batch_labels = labels_shuffled[i:i+batch_size]
+            batch_next_states = next_states_shuffled[i:i+batch_size]
+
+            # Simple rewards: +1 for correct, -1 for wrong
+            batch_rewards = torch.ones(len(batch_labels), dtype=torch.float32)
+            batch_dones = torch.zeros(len(batch_labels), dtype=torch.float32)
+
+            # Add negative examples (wrong actions with negative reward)
+            wrong_actions_list = []
+            for label in batch_labels:
+                wrong = (label.item() + random.randint(1, NUM_ACTIONS - 1)) % NUM_ACTIONS
+                wrong_actions_list.append(wrong)
+            wrong_actions = torch.tensor(wrong_actions_list, dtype=torch.long)
+            wrong_rewards = -torch.ones(len(batch_labels), dtype=torch.float32)
+
+            # Combine correct and incorrect transitions
+            combined_states = torch.cat([batch_states, batch_states], dim=0)
+            combined_actions = torch.cat([batch_labels, wrong_actions], dim=0)
+            combined_rewards = torch.cat([batch_rewards, wrong_rewards], dim=0)
+            combined_next_states = torch.cat([batch_next_states, batch_next_states], dim=0)
+            combined_dones = torch.cat([batch_dones, batch_dones], dim=0)
+
+            # Update Q-network
+            q_loss = agent.update_q(
+                combined_states, combined_actions, combined_rewards,
+                combined_next_states, combined_dones
+            )

+            # Update policy (only on correct examples)
+            policy_loss = agent.update_policy(batch_states, batch_labels)
+
+            # Soft update target
+            agent.update_target_network(tau=0.005)
+
+            epoch_q_loss += q_loss
+            epoch_policy_loss += policy_loss
+            num_batches += 1
+
+        agent.decay_exploration()
+
+        if (epoch + 1) % 10 == 0:
+            # Evaluate
            with torch.no_grad():
+                _, probs = agent.policy_net.get_action(all_states, deterministic=True)
+                predictions = probs.argmax(dim=-1)
+                accuracy = (predictions == all_labels).float().mean().item() * 100
+
+                # Check policy entropy (diversity)
+                avg_entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1).mean().item()
+
+            print(f"Epoch {epoch + 1}/{num_epochs} | "
+                  f"Q-Loss: {epoch_q_loss/num_batches:.4f} | "
+                  f"Policy-Loss: {epoch_policy_loss/num_batches:.4f} | "
+                  f"Accuracy: {accuracy:.1f}% | "
+                  f"Entropy: {avg_entropy:.3f} | "
+                  f"Epsilon: {agent.epsilon:.3f}")

+    # Set networks to eval mode (disables dropout for deterministic inference)
+    agent.policy_net.eval()
+    agent.q_net.eval()

+    # Final evaluation
+    print("\nFinal Evaluation:")
+    with torch.no_grad():
+        _, probs = agent.policy_net.get_action(all_states, deterministic=True)
+        predictions = probs.argmax(dim=-1)
+
+        for i, action_name in enumerate(ACTIONS):
+            mask = all_labels == i
+            if mask.sum() > 0:
+                action_acc = (predictions[mask] == i).float().mean().item() * 100
+                print(f" {action_name}: {action_acc:.1f}% ({mask.sum().item()} samples)")

+        overall_acc = (predictions == all_labels).float().mean().item() * 100
+        print(f" Overall: {overall_acc:.1f}%")

+    # Compute class centroids for outlier detection
+    print("\nComputing class centroids...")
+    centroids = []
+    for i in range(NUM_ACTIONS):
+        mask = all_labels == i
+        class_states = all_states[mask]
+        centroid = class_states.mean(dim=0)
+        centroids.append(centroid)
+    class_centroids = torch.stack(centroids)

+    return agent, class_centroids


+def load_model():
+    """Load encoder and train RL agent."""
+    print("Loading tokenizer and encoder...")
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+    encoder = AutoModel.from_pretrained("distilbert-base-uncased")
+    encoder.eval()

+    print("Loading dataset...")
+    data = load_dataset()
+    print(f"Dataset size: {len(data)} examples")
+
+    print("Training RL agent...")
+    agent, class_centroids = train_rl_agent(tokenizer, encoder, data)
+
+    return tokenizer, encoder, agent, class_centroids
+
+
+def predict(text, tokenizer, encoder, agent, class_centroids):
+    """Use trained RL agent to predict action for given text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state[:, 0, :]
-    return ACTIONS[action_idx],
+    action_idx, confidence = agent.select_action(hidden, deterministic=True)
+
+    # Compute cosine similarity to closest class centroid
+    hidden_norm = hidden / hidden.norm(dim=-1, keepdim=True)
+    centroids_norm = class_centroids / class_centroids.norm(dim=-1, keepdim=True)
+    similarities = torch.mm(hidden_norm, centroids_norm.t()).squeeze(0)
+    max_similarity = similarities.max().item()
+
+    # Return NONE if similarity is too low OR confidence is too low
+    if max_similarity < DISTANCE_THRESHOLD or confidence < CONFIDENCE_THRESHOLD:
+        return "NONE", confidence

+    return ACTIONS[action_idx], confidence
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok", "model_ready": model_state["ready"]}


@app.on_event("startup")

@@ -95,14 +443,14 @@ async def startup_event():
    import threading

    def load_in_background():
-        tokenizer, encoder,
+        tokenizer, encoder, agent, class_centroids = load_model()
        model_state["tokenizer"] = tokenizer
        model_state["encoder"] = encoder
-        model_state["
+        model_state["agent"] = agent
+        model_state["class_centroids"] = class_centroids
        model_state["ready"] = True
-        print("
+        print("RL Agent loaded and ready!")

-    # Load model in background thread so server can respond immediately
    thread = threading.Thread(target=load_in_background)
    thread.start()

@@ -117,6 +465,7 @@ def action(request: MessageRequest):
        request.message,
        model_state["tokenizer"],
        model_state["encoder"],
-        model_state["
+        model_state["agent"],
+        model_state["class_centroids"]
    )
    return ActionResponse(action=action_name, score=round(score, 4))
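For reference, a minimal client sketch of how the updated endpoints might be exercised once the Space is running. Only the /health route, the MessageRequest "message" field, and the ActionResponse "action"/"score" fields come from the diff; the POST /action path, host, and port are assumptions (the decorator on the action handler is outside the changed lines), and the example values are hypothetical.

import requests

BASE = "http://localhost:7860"  # assumed host/port; adjust to the deployed Space URL

# /health reports whether the background load/training thread has finished
print(requests.get(f"{BASE}/health").json())
# e.g. {"status": "ok", "model_ready": True}

# The action handler reads request.message and returns ActionResponse(action=..., score=...)
resp = requests.post(f"{BASE}/action", json={"message": "book me a flight to Berlin"})  # route path assumed
print(resp.json())
# e.g. {"action": "TRIP", "score": 0.8123}; "NONE" when confidence or centroid similarity is below threshold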
dataset.jsonl CHANGED
The diff for this file is too large to render. See raw diff.
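Although the dataset diff is not rendered, load_dataset() in app.py reads item["messages"][1]["content"] as the user text and item["messages"][2]["content"] as the label, keeping only labels found in ACTIONS. A hypothetical JSONL record consistent with that indexing is shown below; the role names, the system entry at index 0, and all values are assumptions, only the structure is implied by the code.

{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "open a pull request on the repo"}, {"role": "assistant", "content": "GITHUB"}]}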