#!/usr/bin/env python3
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import gradio as gr
import json
import math
import re
import warnings
from pathlib import Path
from huggingface_hub import snapshot_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
MODEL_ID = "amewebstudio/mnemosyne-multimodal-v4"
# ==============================================================================
# COGNITION & COMPUTE SYSTEM
# ==============================================================================
class ConversationMemory:
    """Stores simple facts extracted from the user's messages."""
    def __init__(self):
        self.facts = {}

    def extract_facts(self, text):
        # Patterns deliberately match French phrasings ("my name is...", "I live in...")
        patterns = [(r"(?:je m'appelle|mon nom est) (\w+)", "nom_utilisateur"),
                    (r"(?:j'habite à|je vis à) (\w+)", "localisation")]
        for pattern, key in patterns:
            match = re.search(pattern, text, re.I)
            if match:
                self.facts[key] = match.group(1).capitalize()

    def get_context(self):
        if not self.facts:
            return ""
        return "\n[MÉMOIRE]: " + ", ".join(f"{k}: {v}" for k, v in self.facts.items())

memory = ConversationMemory()
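# Illustrative sanity check (a sketch, not called at startup): shows how facts
# are extracted from a French message and rendered back into the prompt context.
def _demo_memory():
    m = ConversationMemory()
    m.extract_facts("Bonjour, je m'appelle Alice et j'habite à Paris")
    print(m.get_context())  # -> "\n[MÉMOIRE]: nom_utilisateur: Alice, localisation: Paris"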
# ==============================================================================
# MNEMOSYNE ARCHITECTURE (SCLM)
# ==============================================================================
class MnemosyneConfig(PretrainedConfig):
    model_type = "mnemosyne"

    def __init__(self, vocab_size=128256, hidden_size=3072, **kw):
        super().__init__(**kw)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size


class RMSNorm(nn.Module):
    def __init__(self, hs, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hs))
        self.eps = eps

    def forward(self, x):
        # Normalize by the root-mean-square of the features, then rescale
        rms = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
        return (self.weight * x.float() * rms).to(x.dtype)
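# Quick numeric check (a sketch, not called at startup): with the default
# all-ones weight, RMSNorm output has approximately unit root-mean-square.
def _demo_rmsnorm():
    y = RMSNorm(4)(torch.tensor([[1.0, 2.0, 3.0, 4.0]]))
    print(y.pow(2).mean(-1).sqrt())  # ≈ tensor([1.])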
class MnemosyneLM(PreTrainedModel):
    config_class = MnemosyneConfig

    def __init__(self, cfg):
        super().__init__(cfg)
        self.model_part = nn.ModuleDict({
            "embed": nn.Embedding(cfg.vocab_size, cfg.hidden_size),
            "norm": RMSNorm(cfg.hidden_size)
        })
        self.lm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=256):
        # Greedy decoding, tuned for CPU
        for _ in range(max_new_tokens):
            outputs = self(input_ids[:, -512:])  # cap the context window for CPU
            logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            # eos_token_id may be None if config.json does not define it
            if self.config.eos_token_id is not None and next_token.item() == self.config.eos_token_id:
                break
        return input_ids

    def forward(self, input_ids, **kwargs):
        x = self.model_part["embed"](input_ids)
        x = self.model_part["norm"](x)
        return CausalLMOutputWithPast(logits=self.lm_head(x))
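# Smoke-test sketch (not called at startup): on a tiny randomly initialized
# config, greedy generate appends exactly max_new_tokens tokens when no EOS
# token is configured.
def _demo_tiny_generate():
    tiny = MnemosyneLM(MnemosyneConfig(vocab_size=32, hidden_size=8)).eval()
    out = tiny.generate(torch.tensor([[1, 2, 3]]), max_new_tokens=4)
    print(out.shape)  # torch.Size([1, 7])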
# ==============================================================================
# LOADING (16 GB CPU OPTIMIZATION)
# ==============================================================================
print("📦 Chargement Mnemosyne v4.3.4 (MODE CPU STABLE)...")
model_path = Path(snapshot_download(MODEL_ID))
tokenizer = AutoTokenizer.from_pretrained(model_path)
with open(model_path / "config.json") as f:
    cfg_data = json.load(f)
# Force float32 for precision on CPU when RAM allows it; bfloat16 is the fallback
model = MnemosyneLM(MnemosyneConfig(**cfg_data)).to(torch.float32)
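# If 16 GB of RAM is tight, a bfloat16 variant is possible (a sketch, assuming
# the CPU PyTorch build supports bf16 ops; roughly halves weight memory):
# model = MnemosyneLM(MnemosyneConfig(**cfg_data)).to(torch.bfloat16)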
print("📂 Chargement des poids (Sharded Safetensors)...")
safetensor_files = list(model_path.glob("*.safetensors"))
for s_file in sorted(safetensor_files):
weights = load_file(s_file, device="cpu")
# Mapping des clés vers la structure model_part
state_dict = {k.replace("mnemosyne.backbone.", "").replace("model.", "model_part."): v.to(torch.float32) for k, v in weights.items()}
model.load_state_dict(state_dict, strict=False)
model.eval()
print("✅ Modèle chargé avec succès sur CPU.")
# ==============================================================================
# MULTIMODAL CHAT LOGIC
# ==============================================================================
def chat_process(message, history):
    user_text = message["text"]
    files = message["files"]
    memory.extract_facts(user_text)
    # Surface attached files in the prompt
    file_context = ""
    if files:
        file_context = "\n[Système: L'utilisateur a envoyé des fichiers/audios. Analyse en cours...]"
    # Build the prompt
    sys_msg = f"Tu es Mnemosyne v4.3.4 par Mike Amega. {memory.get_context()}{file_context}"
    prompt = f"<|system|>\n{sys_msg}<|eot_id|>"
    for turn in history:
        # Multimodal history can contain non-string content (file tuples); keep text only
        if not isinstance(turn["content"], str):
            continue
        role_tag = "<|user|>" if turn["role"] == "user" else "<|assistant|>"
        prompt += f"{role_tag}\n{turn['content']}<|eot_id|>"
    prompt += f"<|user|>\n{user_text}<|eot_id|><|assistant|>\n"
    # Encode and generate
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs.input_ids, max_new_tokens=150)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return response
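# Illustrative call shape (a sketch, not executed at import): with a multimodal
# ChatInterface, the incoming message is a dict with "text" and "files" keys.
def _demo_chat_call():
    print(chat_process({"text": "Bonjour, je m'appelle Alice", "files": []}, history=[]))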
# ==============================================================================
# GRADIO INTERFACE (NO GPU)
# ==============================================================================
with gr.Blocks(theme=gr.themes.Soft(), title="Mnemosyne CPU") as demo:
    gr.Markdown("# 🧠 Mnemosyne v4.3.4 (CPU Stable)")
    gr.Markdown("Text, audio, and file input enabled. Cognition mode active.")
    chatbot = gr.Chatbot(label="Conversation", type="messages")
    # Multimodal component (replaces a plain textbox plus a separate audio button)
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_types=["audio", ".pdf", ".txt", "image"],
        placeholder="Write, speak, or attach a file...",
        show_label=False
    )
    gr.ChatInterface(
        fn=chat_process,
        chatbot=chatbot,
        textbox=chat_input,
        type="messages",
        multimodal=True  # pass messages to fn as {"text": ..., "files": [...]}
    )
if __name__ == "__main__":
    demo.launch()