#!/usr/bin/env python3
import os
import json
import re
import warnings
from pathlib import Path

import torch
import torch.nn as nn
import gradio as gr
from huggingface_hub import snapshot_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast

# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
MODEL_ID = "amewebstudio/mnemosyne-multimodal-v4"

# ==============================================================================
# COGNITION & MEMORY SYSTEM
# ==============================================================================
class ConversationMemory:
    """Extracts simple facts from user messages and replays them as context."""

    def __init__(self):
        self.facts = {}

    def extract_facts(self, text):
        # The patterns match French phrasings ("my name is...", "I live in...");
        # the keys are kept as-is because the prompt template expects them.
        patterns = [
            (r"(?:je m'appelle|mon nom est) (\w+)", "nom_utilisateur"),
            (r"(?:j'habite à|je vis à) (\w+)", "localisation"),
        ]
        for pattern, key in patterns:
            match = re.search(pattern, text, re.I)
            if match:
                self.facts[key] = match.group(1).capitalize()

    def get_context(self):
        if not self.facts:
            return ""
        return "\n[MEMORY]: " + ", ".join(f"{k}: {v}" for k, v in self.facts.items())

memory = ConversationMemory()

# ==============================================================================
# MNEMOSYNE ARCHITECTURE (SCLM)
# ==============================================================================
class MnemosyneConfig(PretrainedConfig):
    model_type = "mnemosyne"

    def __init__(self, vocab_size=128256, hidden_size=3072, **kw):
        super().__init__(**kw)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

class RMSNorm(nn.Module):
    def __init__(self, hs, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hs))
        self.eps = eps

    def forward(self, x):
        # Normalize in float32 for stability, then cast back to the input dtype.
        x32 = x.float()
        return (self.weight * x32 * torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)).to(x.dtype)

class MnemosyneLM(PreTrainedModel):
    config_class = MnemosyneConfig

    def __init__(self, cfg):
        super().__init__(cfg)
        self.model_part = nn.ModuleDict({
            "embed": nn.Embedding(cfg.vocab_size, cfg.hidden_size),
            "norm": RMSNorm(cfg.hidden_size),
        })
        self.lm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=256):
        # Greedy generation tuned for CPU.
        for _ in range(max_new_tokens):
            outputs = self(input_ids[:, -512:])  # cap the context window on CPU
            logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            # Guard against configs that leave eos_token_id unset.
            if self.config.eos_token_id is not None and next_token.item() == self.config.eos_token_id:
                break
        return input_ids

    def forward(self, input_ids, **kwargs):
        x = self.model_part["embed"](input_ids)
        x = self.model_part["norm"](x)
        return CausalLMOutputWithPast(logits=self.lm_head(x))
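
# Optional sanity check for the backbone above (a sketch, not part of the
# deployed app; the MNEMOSYNE_SELFTEST variable and the toy dimensions are
# assumptions for local debugging). It verifies that forward() preserves the
# sequence dimension and that greedy generate() appends exactly max_new_tokens
# tokens when no EOS token is produced.
if os.environ.get("MNEMOSYNE_SELFTEST") == "1":
    _cfg = MnemosyneConfig(vocab_size=128, hidden_size=16)
    _toy = MnemosyneLM(_cfg).eval()
    _ids = torch.randint(0, 128, (1, 8))
    assert _toy(_ids).logits.shape == (1, 8, 128)
    assert _toy.generate(_ids, max_new_tokens=4).shape == (1, 12)
    print("🧪 Backbone self-test passed.")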

# ==============================================================================
# LOADING (16 GB CPU OPTIMIZATION)
# ==============================================================================
print("📦 Loading Mnemosyne v4.3.4 (STABLE CPU MODE)...")
model_path = Path(snapshot_download(MODEL_ID))
tokenizer = AutoTokenizer.from_pretrained(model_path)
with open(model_path / "config.json") as f:
    cfg_data = json.load(f)

# Force float32 for precision on CPU when RAM allows; drop to bfloat16 otherwise.
model = MnemosyneLM(MnemosyneConfig(**cfg_data)).to(torch.float32)

print("📂 Loading weights (sharded safetensors)...")
safetensor_files = list(model_path.glob("*.safetensors"))
for s_file in sorted(safetensor_files):
    weights = load_file(s_file, device="cpu")
    # Remap checkpoint keys onto the model_part structure.
    state_dict = {
        k.replace("mnemosyne.backbone.", "").replace("model.", "model_part."): v.to(torch.float32)
        for k, v in weights.items()
    }
    model.load_state_dict(state_dict, strict=False)
model.eval()
print("✅ Model loaded successfully on CPU.")

# ==============================================================================
# MULTIMODAL CHAT LOGIC
# ==============================================================================
def chat_process(message, history):
    user_text = message["text"]
    files = message["files"]
    memory.extract_facts(user_text)

    # Surface attached files in the prompt.
    file_context = ""
    if files:
        file_context = "\n[System: the user attached files/audio. Analysis in progress...]"

    # Build the prompt.
    sys_msg = f"You are Mnemosyne v4.3.4 by Mike Amega. {memory.get_context()}{file_context}"
    prompt = f"<|system|>\n{sys_msg}<|eot_id|>"
    for turn in history:
        tag = "<|user|>" if turn["role"] == "user" else "<|assistant|>"
        prompt += f"{tag}\n{turn['content']}<|eot_id|>"
    prompt += f"<|user|>\n{user_text}<|eot_id|><|assistant|>\n"

    # Encode and generate.
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs.input_ids, max_new_tokens=150)
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# ==============================================================================
# GRADIO INTERFACE (NO GPU)
# ==============================================================================
with gr.Blocks(theme=gr.themes.Soft(), title="Mnemosyne CPU") as demo:
    gr.Markdown("# 🧠 Mnemosyne v4.3.4 (CPU Stable)")
    gr.Markdown("Text, audio, and file input enabled. Cognition mode active.")
    chatbot = gr.Chatbot(label="Conversation", type="messages")
    # Multimodal component (replaces the plain textbox plus a separate audio button).
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_types=["audio", ".pdf", ".txt", "image"],
        placeholder="Type, speak, or attach a file...",
        show_label=False,
    )
    gr.ChatInterface(
        fn=chat_process,
        chatbot=chatbot,
        textbox=chat_input,
        multimodal=True,  # deliver {"text": ..., "files": ...} dicts to chat_process
        type="messages",
    )

if __name__ == "__main__":
    demo.launch()
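
# Driving chat_process() without the UI, e.g. from a REPL (a sketch; the
# MultimodalTextbox normally supplies this dict shape):
#
#   >>> chat_process({"text": "je m'appelle Mike", "files": []}, history=[])
#
# After that call, the memory singleton carries "nom_utilisateur: Mike" and
# get_context() injects it into every subsequent system prompt.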