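# NOTE: the lines below are repository-page header text captured with this
# file; they are kept as comments so the module remains valid Python.
# LifeLog / models.py
# arunsa's picture
# Fix MiniCPM-V loading: patch missing all_tied_weights_keys on PreTrainedModel
# d297e4a
# Raw
# History Blame Contribute Delete
# 8.57 kB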
import os
import json
from PIL import Image, ImageDraw, ImageFont
# Demo mode is opt-in: it is enabled only when the LIFELOG_DEMO environment
# variable is exactly "1"; an unset variable behaves like "0".
DEMO_MODE = os.getenv("LIFELOG_DEMO", "0") == "1"

# Hugging Face model identifiers, one per capability — edit here to swap models.
MODEL_TEXT = "openbmb/MiniCPM5-1B"  # chat / text generation
MODEL_VISION = "openbmb/MiniCPM-V-2_6"  # image question answering
MODEL_ASR = "openai/whisper-small"  # speech-to-text
MODEL_IMAGE = "black-forest-labs/FLUX.1-schnell"  # text-to-image
def _gpu_decorator(duration=60):
    """Return a decorator that requests a GPU slot when ``spaces`` is available.

    Args:
        duration: Forwarded to ``spaces.GPU`` as its ``duration`` keyword.

    Returns:
        ``spaces.GPU(duration=duration)`` when the ``spaces`` package can be
        imported; otherwise a pass-through decorator that returns the
        decorated function unchanged.
    """

    def _passthrough(fn):
        return fn

    try:
        import spaces

        gpu_wrapper = spaces.GPU(duration=duration)
    except ImportError:
        # No ``spaces`` package available: decorate as a no-op.
        return _passthrough
    return gpu_wrapper
# ---------------------------------------------------------------------------
# Demo-mode mock data
# ---------------------------------------------------------------------------
# Canned follow-up questions returned by the text generator in demo mode.
# Index 0 doubles as the fallback reply when no other demo rule matches.
_DEMO_FOLLOW_UPS = [
    # Follow-up #1: what triggered the decision.
    "That's a significant decision. "
    "What was the specific moment or event that tipped the scales? "
    "Was there a single trigger, or has this been building for a while?",
    # Follow-up #2: worst case versus best realistic case.
    "I see. Let's stress-test this — "
    "what does the absolute worst-case scenario look like if this doesn't work out? "
    "And on the flip side, what's the best realistic outcome in six months?",
    # Follow-up #3: who else is affected.
    "Last question — who else is affected by this change? "
    "Are there dependencies you need to manage — "
    "people counting on the old arrangement, "
    "or opportunities blocked until this ships?",
]
# Canned JSON reply for the "categorize this decision" prompt in demo mode.
# Keyword order fixes the key order of the serialized object.
_DEMO_CATEGORIZE = json.dumps(
    dict(
        category="career",
        subcategory="job_change",
        severity=7,
        status_emoji="🔧",
    )
)
# Canned JSON reply for the "predict consequences" prompt in demo mode:
# a list of objects with outcome / probability / valence / timeframe keys.
_DEMO_PREDICT = json.dumps([
    dict(outcome=outcome, probability=probability, valence=valence, timeframe=timeframe)
    for outcome, probability, valence, timeframe in (
        (
            "Short-term financial pressure during the transition",
            "high",
            "negative",
            "months",
        ),
        (
            "New growth opportunities and skill development",
            "medium",
            "positive",
            "months",
        ),
        (
            "Stress and uncertainty while adjusting",
            "high",
            "negative",
            "weeks",
        ),
        (
            "Improved long-term career satisfaction",
            "medium",
            "positive",
            "years",
        ),
    )
])
# Canned image-generation prompt returned in demo mode; the clauses are
# joined with ", " into a single sentence-like prompt.
_DEMO_CARD_PROMPT = ", ".join((
    "A solitary figure standing at a crossroads in soft watercolor",
    "one path leading through a dense forest",
    "the other opening to a sunlit meadow",
    "warm amber light breaking through clouds overhead",
))

# Canned image description returned in demo mode (two sentences).
_DEMO_IMAGE_DESC = " ".join((
    "This appears to be a formal document with professional letterhead.",
    "The key information suggests important correspondence regarding a "
    "significant life decision or career change.",
))
# Canned Markdown "debug report" returned for pattern-analysis prompts in demo
# mode. Each tuple entry is one output line; there is no trailing newline.
_DEMO_PATTERN = "\n".join((
    "## 🔍 Debug Report: Life Pattern Analysis",
    "### Recurring Patterns",
    "- You tend to make major decisions after prolonged periods of "
    "dissatisfaction rather than proactively.",
    "- Career decisions show a pattern of choosing growth over stability.",
    "- You process decisions emotionally first, then rationalize afterward.",
    "### Category Distribution",
    "Decisions are heavily weighted toward career (60%) with relationship "
    "decisions as the second most common (20%). Work is your primary source "
    "of both satisfaction and stress.",
    "### Prediction Accuracy",
    "Based on resolved decisions, predictions are ~65% accurate. You tend to "
    "overestimate negative outcomes and underestimate how quickly you adapt.",
    "### Risk Profile",
    "**Moderate risk-taker.** You avoid purely speculative decisions but "
    "accept significant uncertainty when the upside is clear.",
    "### 🔧 Recommended Patch",
    "Add a 72-hour cool-down for decisions with severity > 6. Your first "
    "instincts are usually good, but stress-testing them before they ship to "
    "production would catch edge cases.",
))
# ---------------------------------------------------------------------------
# Model loading (skipped in demo mode)
# ---------------------------------------------------------------------------
# Module-level handles for every backend. They remain None in demo mode,
# where nothing below the DEMO_MODE guard is executed.
text_model = text_tokenizer = None
asr_pipe = None
vision_model = vision_tokenizer = None
image_pipe = None

if not DEMO_MODE:
    # Heavy dependencies are imported only on the real-model path.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    # Keyword arguments shared by both MiniCPM checkpoints.
    _minicpm_kwargs = {"torch_dtype": torch.bfloat16, "trust_remote_code": True}

    print("[LifeLog] Loading text model…")
    text_tokenizer = AutoTokenizer.from_pretrained(MODEL_TEXT, trust_remote_code=True)
    text_model = AutoModelForCausalLM.from_pretrained(
        MODEL_TEXT, device_map="auto", **_minicpm_kwargs
    )

    print("[LifeLog] Loading ASR model…")
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ASR,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    print("[LifeLog] Loading vision model…")
    vision_tokenizer = AutoTokenizer.from_pretrained(MODEL_VISION, trust_remote_code=True)

    # Patch: MiniCPM-V's custom model class lacks all_tied_weights_keys,
    # which newer transformers expects during from_pretrained. The attribute
    # is only added when it is missing, and this must happen before the
    # vision model is loaded below.
    from transformers import PreTrainedModel

    if not hasattr(PreTrainedModel, "all_tied_weights_keys"):
        PreTrainedModel.all_tied_weights_keys = {}

    # Unlike the text model, no device_map is passed for the vision model.
    vision_model = AutoModelForCausalLM.from_pretrained(MODEL_VISION, **_minicpm_kwargs)

    print("[LifeLog] Loading image generation model…")
    from diffusers import FluxPipeline

    image_pipe = FluxPipeline.from_pretrained(MODEL_IMAGE, torch_dtype=torch.bfloat16)
    image_pipe.enable_model_cpu_offload()

    print("[LifeLog] All models loaded.")
# ---------------------------------------------------------------------------
# Inference functions
# ---------------------------------------------------------------------------
@_gpu_decorator(duration=60)
def generate_text(messages: list[dict], max_tokens: int = 512) -> str:
    """Generate the assistant reply for a chat-style message list.

    With real models, the messages are rendered through the tokenizer's chat
    template, a completion is sampled (temperature 0.7) and only the newly
    generated tokens are decoded.

    In demo mode no model is called. The reply is a canned string chosen by
    keywords in the ``content`` of the last message, checked in a fixed order;
    the first follow-up question is the fallback.

    Args:
        messages: Chat messages; each is a dict whose ``content`` entry holds
            the text. An empty list is accepted in demo mode.
        max_tokens: Upper bound on newly generated tokens (model path only).

    Returns:
        The decoded completion, or the selected canned demo reply.
    """
    if not DEMO_MODE:
        prompt = text_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        encoded = text_tokenizer([prompt], return_tensors="pt").to(text_model.device)
        generated = text_model.generate(
            **encoded, max_new_tokens=max_tokens, temperature=0.7, do_sample=True
        )
        # Drop the echoed prompt tokens so only the completion is decoded.
        prompt_len = encoded.input_ids.shape[-1]
        completion = generated[:, prompt_len:]
        return text_tokenizer.decode(completion[0], skip_special_tokens=True)

    last = messages[-1].get("content", "") if messages else ""
    lower = last.lower()
    wants_json = "json" in lower

    # Ordered (matched, reply) rules: the first matching rule wins.
    keyword_rules = (
        ("consequence" in lower or ("predict" in lower and wants_json), _DEMO_PREDICT),
        ("category" in lower and wants_json, _DEMO_CATEGORIZE),
        ("image prompt" in lower or "moment card" in lower, _DEMO_CARD_PROMPT),
        ("pattern" in lower or "debug report" in lower, _DEMO_PATTERN),
    )
    for matched, reply in keyword_rules:
        if matched:
            return reply

    # Follow-up markers are looked up in the original (non-lowercased) text.
    for index, marker in enumerate(("#1", "#2", "#3")):
        if marker in last:
            return _DEMO_FOLLOW_UPS[index]
    return _DEMO_FOLLOW_UPS[0]
@_gpu_decorator(duration=30)
def transcribe_audio(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path`` to text.

    Args:
        audio_path: Path of the audio file handed to the ASR pipeline.

    Returns:
        The ``"text"`` entry of the ASR pipeline result, or a fixed sample
        sentence in demo mode (the file is not read in that case).
    """
    if not DEMO_MODE:
        transcription = asr_pipe(audio_path)
        return transcription["text"]
    return "I decided to leave my current job and pursue freelancing full-time."
@_gpu_decorator(duration=60)
def describe_image(image_path: str, question: str) -> str:
    """Answer ``question`` about the image stored at ``image_path``.

    In demo mode the canned ``_DEMO_IMAGE_DESC`` text is returned and the
    image file is never opened. Otherwise the image is decoded to RGB and
    passed, together with a single user message containing ``question``, to
    ``vision_model.chat``.

    Args:
        image_path: Filesystem path of the image to describe.
        question: Prompt text sent as the user message.

    Returns:
        The value returned by ``vision_model.chat``.
    """
    if DEMO_MODE:
        return _DEMO_IMAGE_DESC

    # Open inside a context manager so the underlying file is closed
    # deterministically (Pillow can keep it open, e.g. for multi-frame
    # formats). ``convert`` returns a separate in-memory image, so it stays
    # usable after the source is closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    msgs = [{"role": "user", "content": question}]
    return vision_model.chat(image=image, msgs=msgs, tokenizer=vision_tokenizer)
@_gpu_decorator(duration=120)
def generate_moment_card(prompt: str) -> Image.Image:
    """Render a 512x512 "moment card" image for ``prompt``.

    With real models, the image pipeline is run for 4 inference steps with
    guidance disabled and a CPU generator seeded with 0.

    In demo mode a dark placeholder card is drawn locally: a green
    "Moment Card / (Demo Mode)" caption and a thin border. ``prompt`` is not
    used in that case.

    Args:
        prompt: Text prompt for the image pipeline.

    Returns:
        The generated (or placeholder) PIL image.
    """
    if not DEMO_MODE:
        import torch

        seeded = torch.Generator(device="cpu").manual_seed(0)
        result = image_pipe(
            prompt=prompt,
            height=512,
            width=512,
            guidance_scale=0.0,
            num_inference_steps=4,
            max_sequence_length=256,
            generator=seeded,
        )
        return result.images[0]

    card = Image.new("RGB", (512, 512), color=(22, 27, 34))
    canvas = ImageDraw.Draw(card)

    # Prefer Arial; fall back to Pillow's built-in font when it is unavailable.
    try:
        caption_font = ImageFont.truetype("arial.ttf", 18)
    except OSError:
        caption_font = ImageFont.load_default()

    canvas.multiline_text(
        (256, 230),
        "Moment Card\n(Demo Mode)",
        fill=(34, 197, 94),
        font=caption_font,
        anchor="mm",
        align="center",
    )
    canvas.rectangle([20, 20, 492, 492], outline=(48, 54, 61), width=2)
    return card