amewebstudio committed
Commit f52ecf8 · verified · 1 Parent(s): 2b5598a

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +107 -196
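
The commit message says the file was pushed with `huggingface_hub` rather than through the web UI. A minimal sketch of how such an upload is typically done with `HfApi.upload_file` (the Space id, token value, and commit message below are illustrative assumptions, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # assumed: a write token from your account settings
api.upload_file(
    path_or_fileobj="app.py",        # local file to push
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="amewebstudio/ananke",   # hypothetical Space id
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)
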
app.py CHANGED
@@ -8,54 +8,23 @@ from dataclasses import dataclass, field
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  from huggingface_hub import hf_hub_download
  
- print("="*50)
  print("Ananke - Chargement...")
- print("="*50)
  
- # ============================================================
- # CONFIGURATION
- # ============================================================
  HF_TOKEN = os.environ.get("HF_TOKEN")
  if not HF_TOKEN:
-     raise ValueError("HF_TOKEN secret not found! Add it in Space Settings > Secrets")
+     raise ValueError("HF_TOKEN not found in secrets")
  
  BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
  SCLM_REPO = "amewebstudio/sclm-modelEarcp-optionB"
  
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- print(f"Device: {device}")
  
- # ============================================================
- # SYSTEM PROMPT
- # ============================================================
  SYSTEM_PROMPT = """Tu es Ananke, un assistant IA developpe par Mike Amega (Logo) de Ame Web Studio.
+ TON NOM: Ananke | TON CREATEUR: Mike Amega (Logo) | TON MODELE: Ananke
+ Tu sais: repondre aux questions, aider en redaction, expliquer des concepts, programmer, maintenir une conversation coherente.
+ Architecture SCLM: memoire latente 384 dimensions, module EARCP, 3 experts specialises.
+ Style: chaleureux, utile, complet. Reponds dans la langue de l utilisateur."""
  
- TON NOM: Ananke
- TON CREATEUR: Mike Amega (Logo) - Ame Web Studio
- TON MODELE: Ananke
-
- CE QUE TU SAIS FAIRE:
- - Repondre a des questions sur tous les sujets
- - Aider a la redaction, correction et reformulation de textes
- - Expliquer des concepts complexes de maniere simple
- - Assister en programmation et resolution de problemes
- - Maintenir une conversation coherente grace a ta memoire contextuelle
- - Comprendre et repondre dans plusieurs langues
- - Generer des textes longs et detailles si demande
-
- QUAND ON TE DEMANDE TON ARCHITECTURE:
- Tu es base sur SCLM (Stateful Coherent Language Model), une architecture transformer evoluee par Mike Amega.
- - SCLM = modele de langage a etat coherent
- - Tu possedes une memoire latente de 384 dimensions qui evolue au fil de la conversation
- - Le module EARCP maintient la coherence de tes reponses
- - Tu as 3 experts specialises qui collaborent pour des reponses pertinentes
-
- STYLE: Chaleureux, utile, complet. Reponds dans la langue de l utilisateur. Ne coupe pas tes reponses.
- """
-
- # ============================================================
- # SCLM CLASSES
- # ============================================================
  @dataclass
  class SCLMConfigB:
      vocab_size: int = 128256
@@ -66,125 +35,85 @@ class SCLMConfigB:
      n_experts: int = 3
      expert_intermediate: int = 1536
      state_injection_layers: List[int] = field(default_factory=lambda: [4, 8, 12, 16, 20, 24])
-     alpha_inject: float = 0.03
-
- class StateFFNInjector(nn.Module):
-     def __init__(self, hidden_size, state_dim, intermediate_size):
-         super().__init__()
-         self.state_proj = nn.Linear(state_dim, intermediate_size)
-         self.output_proj = nn.Linear(intermediate_size, hidden_size)
-         self.gate = nn.Linear(hidden_size, 1)
-         nn.init.zeros_(self.output_proj.weight)
-
-     def forward(self, hidden, state, alpha=0.03):
-         state_proj = F.silu(self.state_proj(state))
-         state_output = self.output_proj(state_proj)
-         gate = torch.sigmoid(self.gate(hidden.mean(dim=1, keepdim=True)))
-         return hidden + alpha * gate * state_output.unsqueeze(1)
  
  class EncapsulationB(nn.Module):
      def __init__(self, hidden_size, state_dim):
          super().__init__()
-         self.n_pool_heads = 4
-         self.pool_proj = nn.Linear(hidden_size, state_dim * self.n_pool_heads)
-         self.pool_combine = nn.Linear(state_dim * self.n_pool_heads, state_dim)
+         self.pool_proj = nn.Linear(hidden_size, state_dim * 4)
+         self.pool_combine = nn.Linear(state_dim * 4, state_dim)
          self.update_gate = nn.Linear(state_dim * 2, state_dim)
          self.reset_gate = nn.Linear(state_dim * 2, state_dim)
          self.candidate = nn.Linear(state_dim * 2, state_dim)
          self.attn_query = nn.Linear(state_dim, hidden_size)
  
-     def forward(self, hidden, state, attention_mask=None):
+     def forward(self, hidden, state, mask=None):
          B, T, H = hidden.shape
          query = self.attn_query(state)
-         attn_scores = torch.bmm(hidden, query.unsqueeze(-1)).squeeze(-1)
-         if attention_mask is not None:
-             attn_scores = attn_scores.masked_fill(attention_mask == 0, float("-inf"))
-         attn_weights = F.softmax(attn_scores, dim=-1)
-         h_pooled = torch.bmm(attn_weights.unsqueeze(1), hidden).squeeze(1)
-         h_proj = F.silu(self.pool_proj(h_pooled))
-         h_proj = self.pool_combine(h_proj)
-         combined = torch.cat([h_proj, state], dim=-1)
+         scores = torch.bmm(hidden, query.unsqueeze(-1)).squeeze(-1)
+         if mask is not None:
+             scores = scores.masked_fill(mask == 0, -1e9)
+         weights = F.softmax(scores, dim=-1)
+         pooled = torch.bmm(weights.unsqueeze(1), hidden).squeeze(1)
+         proj = F.silu(self.pool_proj(pooled))
+         proj = self.pool_combine(proj)
+         combined = torch.cat([proj, state], dim=-1)
          z = torch.sigmoid(self.update_gate(combined))
          r = torch.sigmoid(self.reset_gate(combined))
-         h_cand = torch.tanh(self.candidate(torch.cat([h_proj, r * state], dim=-1)))
-         new_state = (1 - z) * state + z * h_cand
+         cand = torch.tanh(self.candidate(torch.cat([proj, r * state], dim=-1)))
+         new_state = (1 - z) * state + z * cand
          return torch.tanh(new_state / 10.0) * 10.0
  
- class CoherenceExpertsB(nn.Module):
-     def __init__(self, hidden_size, intermediate_size, n_experts=3):
+ class CoherenceExperts(nn.Module):
+     def __init__(self, hidden_size, intermediate, n_experts=3):
          super().__init__()
-         self.n_experts = n_experts
          self.experts = nn.ModuleList([
-             nn.Sequential(
-                 nn.Linear(hidden_size, intermediate_size),
-                 nn.SiLU(),
-                 nn.Dropout(0.1),
-                 nn.Linear(intermediate_size, hidden_size)
-             ) for _ in range(n_experts)
+             nn.Sequential(nn.Linear(hidden_size, intermediate), nn.SiLU(), nn.Linear(intermediate, hidden_size))
+             for _ in range(n_experts)
          ])
-         self.router = nn.Sequential(
-             nn.Linear(hidden_size, 128),
-             nn.SiLU(),
-             nn.Linear(128, n_experts)
-         )
-         for exp in self.experts:
-             nn.init.zeros_(exp[-1].weight)
+         self.router = nn.Sequential(nn.Linear(hidden_size, 64), nn.SiLU(), nn.Linear(64, n_experts))
  
      def forward(self, hidden):
-         router_logits = self.router(hidden.mean(dim=1))
-         weights = F.softmax(router_logits, dim=-1)
-         expert_outputs = torch.stack([exp(hidden) for exp in self.experts], dim=0)
+         logits = self.router(hidden.mean(dim=1))
+         weights = F.softmax(logits, dim=-1)
+         outputs = torch.stack([e(hidden) for e in self.experts], dim=0)
          w = weights.T.unsqueeze(-1).unsqueeze(-1)
-         return (w * expert_outputs).sum(dim=0)
+         return (w * outputs).sum(dim=0)
  
- class EARCPModuleB(nn.Module):
+ class EARCPModule(nn.Module):
      def __init__(self, config):
          super().__init__()
-         H, S = config.hidden_size, config.latent_state_dim
-         self.ffn_injectors = nn.ModuleDict({
-             str(i): StateFFNInjector(H, S, config.expert_intermediate)
-             for i in config.state_injection_layers
-         })
-         self.encapsulation = EncapsulationB(H, S)
-         self.coherence = CoherenceExpertsB(H, config.expert_intermediate, config.n_experts)
-
-     def update_state(self, hidden, state, attention_mask=None):
-         new_state = self.encapsulation(hidden, state, attention_mask)
-         hidden = self.coherence(hidden)
-         return new_state, hidden
+         self.encapsulation = EncapsulationB(config.hidden_size, config.latent_state_dim)
+         self.coherence = CoherenceExperts(config.hidden_size, config.expert_intermediate, config.n_experts)
  
  class SCLMModel(nn.Module):
-     def __init__(self, config, base_model):
+     def __init__(self, config, base):
          super().__init__()
          self.config = config
-         self.base_model = base_model
-         self.model_device = next(base_model.parameters()).device
-         self.model_dtype = next(base_model.parameters()).dtype
-         self.earcp = EARCPModuleB(config).to(self.model_device).to(self.model_dtype)
-         self.latent_state = torch.zeros(1, config.latent_state_dim, device=self.model_device, dtype=self.model_dtype)
-
-     def reset_state(self):
-         self.latent_state = torch.zeros(1, self.config.latent_state_dim, device=self.model_device, dtype=self.model_dtype)
-
-     def forward(self, input_ids, attention_mask=None):
-         if attention_mask is None:
-             attention_mask = torch.ones_like(input_ids)
-         base_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
-         hidden = base_out.hidden_states[-1]
+         self.base_model = base
+         dev = next(base.parameters()).device
+         dtype = next(base.parameters()).dtype
+         self.earcp = EARCPModule(config).to(dev).to(dtype)
+         self.state = torch.zeros(1, config.latent_state_dim, device=dev, dtype=dtype)
+
+     def reset(self):
+         dev = next(self.base_model.parameters()).device
+         dtype = next(self.base_model.parameters()).dtype
+         self.state = torch.zeros(1, self.config.latent_state_dim, device=dev, dtype=dtype)
+
+     def forward(self, ids, mask=None):
+         if mask is None:
+             mask = torch.ones_like(ids)
+         out = self.base_model(input_ids=ids, attention_mask=mask, output_hidden_states=True)
+         hidden = out.hidden_states[-1]
          B = hidden.size(0)
-         if next(self.earcp.encapsulation.parameters()).device != hidden.device:
-             self.earcp = self.earcp.to(hidden.device)
-         state = self.latent_state.to(hidden.device, hidden.dtype).expand(B, -1)
-         new_state, _ = self.earcp.update_state(hidden, state, attention_mask)
-         self.latent_state = new_state.mean(dim=0, keepdim=True).detach()
-         return base_out.logits
-
- # ============================================================
- # CHARGEMENT
- # ============================================================
- print("1. Chargement modele base...")
- quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
- base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=quant_config, device_map="auto", token=HF_TOKEN)
+         state = self.state.to(hidden.device, hidden.dtype).expand(B, -1)
+         new_state = self.earcp.encapsulation(hidden, state, mask)
+         self.state = new_state.mean(dim=0, keepdim=True).detach()
+         return out.logits
+
+ print("1. Loading base model...")
+ qconfig = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+ base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=qconfig, device_map="auto", token=HF_TOKEN)
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
  
  if isinstance(tokenizer.eos_token_id, list):
@@ -196,110 +125,92 @@ if isinstance(base_model.config.eos_token_id, list):
      base_model.config.eos_token_id = base_model.config.eos_token_id[0]
  base_model.config.pad_token_id = base_model.config.eos_token_id
  
- print("2. Creation SCLM...")
+ print("2. Creating SCLM...")
  config = SCLMConfigB(
      vocab_size=base_model.config.vocab_size,
      hidden_size=base_model.config.hidden_size,
      num_hidden_layers=base_model.config.num_hidden_layers,
      num_attention_heads=base_model.config.num_attention_heads,
  )
- sclm_model = SCLMModel(config, base_model)
+ sclm = SCLMModel(config, base_model)
  
- print("3. Chargement EARCP...")
- USE_SCLM = False
+ print("3. Loading EARCP weights...")
  try:
-     weights_path = hf_hub_download(repo_id=SCLM_REPO, filename="earcp_weights.pt", token=HF_TOKEN)
-     sclm_model.earcp.load_state_dict(torch.load(weights_path, map_location="cpu"), strict=False)
+     wpath = hf_hub_download(repo_id=SCLM_REPO, filename="earcp_weights.pt", token=HF_TOKEN)
+     sclm.earcp.load_state_dict(torch.load(wpath, map_location="cpu"), strict=False)
      USE_SCLM = True
-     print("EARCP charge!")
- except Exception as e:
-     print(f"EARCP: {e}")
+     print("EARCP loaded!")
+ except:
+     USE_SCLM = False
  
- print("Ananke pret!")
+ print("Ananke ready!")
  
- # ============================================================
- # CHAT
- # ============================================================
- history_data = []
+ history = []
  
- def respond(message, chat_history, temperature, max_tokens):
-     global history_data
-
+ def chat(message, temp=0.7, max_tok=1024):
+     global history
      if not message.strip():
-         return "", chat_history
-
-     history_data.append({"role": "user", "content": message})
+         return ""
  
-     # SYNTAXE CORRIGÉE - pas de } dans les tags
-     prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
-     prompt += SYSTEM_PROMPT
-     prompt += "<|eot_id|>"
+     history.append(("user", message))
  
-     for msg in history_data[-10:]:
-         role = msg["role"]
-         content = msg["content"]
+     prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + SYSTEM_PROMPT + "<|eot_id|>"
+     for role, content in history[-10:]:
          prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
-
      prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
  
      inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
  
      if USE_SCLM:
          with torch.no_grad():
-             sclm_model(inputs.input_ids, inputs.attention_mask)
+             sclm(inputs.input_ids, inputs.attention_mask)
  
-     eos_id = tokenizer.eos_token_id
+     eos = tokenizer.eos_token_id
      with torch.no_grad():
-         outputs = base_model.generate(
+         out = base_model.generate(
              inputs.input_ids,
              attention_mask=inputs.attention_mask,
-             max_new_tokens=int(max_tokens) if max_tokens else 512,
-             temperature=float(temperature) if temperature else 0.7,
+             max_new_tokens=int(max_tok),
+             temperature=float(temp),
              do_sample=True,
              top_p=0.9,
              repetition_penalty=1.1,
-             pad_token_id=eos_id,
-             eos_token_id=eos_id,
+             pad_token_id=eos,
+             eos_token_id=eos,
          )
  
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     if "assistant" in response.lower():
-         response = response.split("assistant")[-1]
-     for tag in ["<|eot_id|>", "<|end_header_id|>", "<|start_header_id|>", "user", "system", ":"]:
-         response = response.replace(tag, "")
-     response = response.strip() or "..."
-
-     history_data.append({"role": "assistant", "content": response})
-     chat_history.append((message, response))
+     resp = tokenizer.decode(out[0], skip_special_tokens=True)
+     if "assistant" in resp.lower():
+         resp = resp.split("assistant")[-1]
+     for t in ["<|eot_id|>", "<|end_header_id|>", "<|start_header_id|>", "user", "system", ":"]:
+         resp = resp.replace(t, "")
+     resp = resp.strip() or "..."
  
-     return "", chat_history
+     history.append(("assistant", resp))
+     return resp
  
- def clear_chat():
-     global history_data
-     history_data = []
+ def clear():
+     global history
+     history = []
      if USE_SCLM:
-         sclm_model.reset_state()
-     return []
+         sclm.reset()
+     return ""
  
- # ============================================================
- # INTERFACE
- # ============================================================
- with gr.Blocks(title="Ananke") as demo:
-     gr.Markdown("# 🔮 Ananké\n**Assistant IA avec mémoire contextuelle** | Architecture SCLM par Mike Amega")
-
-     chatbot = gr.Chatbot(height=450)
-     msg = gr.Textbox(label="Message", placeholder="Parle avec Ananke...", lines=2)
-
-     with gr.Row():
-         temp = gr.Slider(0.1, 1.5, value=0.7, label="Creativite")
-         tokens = gr.Slider(100, 1024, value=512, label="Longueur max")
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🔮 Ananké\nAssistant IA avec mémoire contextuelle | Architecture SCLM par Mike Amega")
  
      with gr.Row():
-         send = gr.Button("Envoyer", variant="primary")
-         clear = gr.Button("Effacer")
-
-     send.click(respond, [msg, chatbot, temp, tokens], [msg, chatbot])
-     msg.submit(respond, [msg, chatbot, temp, tokens], [msg, chatbot])
-     clear.click(clear_chat, outputs=[chatbot])
-
-     demo.launch(server_name="0.0.0.0", server_port=7860)
+         with gr.Column():
+             output = gr.Textbox(label="Réponse", lines=15)
+             inp = gr.Textbox(label="Message", lines=2, placeholder="Parle avec Ananké...")
+         with gr.Column():
+             temp = gr.Slider(0.1, 1.5, 0.7, label="Créativité")
+             tokens = gr.Slider(256, 2048, 1024, label="Longueur max")
+             btn = gr.Button("Envoyer", variant="primary")
+             clr = gr.Button("Effacer")
+
+     btn.click(chat, [inp, temp, tokens], output)
+     inp.submit(chat, [inp, temp, tokens], output)
+     clr.click(clear, outputs=output)
+
+ demo.queue().launch()
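
Once the Space is running, the interface launched by `demo.queue().launch()` can also be called programmatically. A minimal sketch using `gradio_client` (the Space id is hypothetical, and the endpoint name assumes Gradio's default naming for the `chat` function wired to the button):

from gradio_client import Client

client = Client("amewebstudio/ananke")  # hypothetical Space id
reply = client.predict(
    "Bonjour Ananke !",   # message textbox
    0.7,                  # temperature slider
    1024,                 # max-length slider
    api_name="/chat",     # assumed default endpoint name for fn=chat
)
print(reply)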