Spaces:

Valtry
/

summarizer

Sleeping

App Files Community

Valtry committed 17 days ago

Commit

d795cbb

verified ·

1 Parent(s): d040ad0

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -234

app.py CHANGED Viewed

@@ -3,211 +3,57 @@ from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
-import re
-# =========================
-# APP
-# =========================
 app = FastAPI()
-# =========================
-# MODEL
-# =========================
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-print("🚀 Loading Recursive Memory Summarizer...")
-device = torch.device(
-    "cuda" if torch.cuda.is_available() else "cpu"
-)
-# =========================
-# TOKENIZER
-# =========================
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True
-)
-# =========================
-# MODEL LOAD
-# =========================
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    trust_remote_code=True
 )
-model = model.to(device)
-print(f"✅ Loaded on {device}")
-# =========================
-# REQUEST
-# =========================
 class SummaryRequest(BaseModel):
     old_memory: str = ""
     user_message: str
     assistant_message: str
-# =========================
-# CLEAN OUTPUT
-# =========================
-def clean_output(text):
-    stop_words = [
-        "<|im_end|>",
-        "<|endoftext|>",
-        "<|eot_id|>",
-        "UPDATED_MEMORY:",
-        "MEMORY:",
-        "Assistant:",
-        "User:"
-    ]
-    for w in stop_words:
-        if w in text:
-            text = text.split(w)[0]
-    text = text.strip()
-    # remove repeated lines
-    lines = []
-    seen = set()
-    for line in text.split("\n"):
-        line = line.strip()
-        if not line:
-            continue
-        if line in seen:
-            continue
-        seen.add(line)
-        lines.append(line)
-    text = "\n".join(lines)
-    # remove extra spaces/newlines
-    text = re.sub(r"\n+", "\n", text)
-    return text.strip()
-# =========================
-# SYSTEM PROMPT
-# =========================
-SYSTEM_PROMPT = """
-You are a recursive AI memory summarization engine.
-Your ONLY task:
-Maintain long-term conversational memory.
-IMPORTANT:
-This memory is used later by another AI model.
-GOALS:
-- Preserve important discussion context
-- Preserve coding discussions
-- Preserve project details
-- Preserve technical information
-- Preserve implementation ideas
-- Preserve plans and goals
-- Preserve APIs/frameworks/models
-- Preserve architecture decisions
-- Preserve ongoing tasks
-- Preserve debugging context
-REMOVE:
-- filler
-- greetings
-- repeated information
-- unnecessary wording
-- casual conversation fluff
-RULES:
-- Merge old memory with new conversation
-- Compress intelligently
-- Keep memory compact
-- Keep memory understandable for another AI
-- NEVER answer the user
-- NEVER explain
-- ONLY output updated memory
-GOOD MEMORY STYLE:
-User building local AI assistant using FastAPI and llama.cpp. Uses Supabase storage and streaming responses. Implementing recursive memory summarization and title generation using lightweight Qwen models.
-BAD MEMORY STYLE:
-The user asked this. The assistant replied this.
-ONLY OUTPUT MEMORY.
-"""
-# =========================
-# SUMMARY ENDPOINT
-# =========================
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
-    # =========================
-    # TRUNCATE HUGE INPUTS
-    # =========================
-    old_memory = req.old_memory[-3000:]
-    user_message = req.user_message[-1500:]
-    assistant_message = req.assistant_message[-3000:]
-    # =========================
-    # USER PROMPT
-    # =========================
-    user_prompt = f"""
-OLD_MEMORY:
-{old_memory}
-NEW_USER_MESSAGE:
-{user_message}
-NEW_ASSISTANT_MESSAGE:
-{assistant_message}
-TASK:
-Generate updated long-term memory summary.
-IMPORTANT:
-- Merge previous memory with new discussion
-- Preserve coding/technical context
-- Preserve important conversation flow
-- Preserve implementation discussions
-- Preserve project goals/plans
-- Keep compact but meaningful
-- Keep understandable for another AI model
-UPDATED_MEMORY:
-"""
-    # =========================
-    # CHAT FORMAT
-    # =========================
     messages = [
-        {
-            "role": "system",
-            "content": SYSTEM_PROMPT
-        },
-        {
-            "role": "user",
-            "content": user_prompt
-        }
     ]
     text = tokenizer.apply_chat_template(
@@ -216,76 +62,41 @@ UPDATED_MEMORY:
         add_generation_prompt=True
     )
-    # =========================
-    # TOKENIZE
-    # =========================
-    inputs = tokenizer(
-        text,
-        return_tensors="pt",
-        truncation=True,
-        max_length=4096
-    ).to(device)
-    # =========================
-    # GENERATE
-    # =========================
-    with torch.no_grad():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=120,
-            do_sample=True,
-            temperature=0.2,
-            top_p=0.9,
-            repetition_penalty=1.15,
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
-    # =========================
-    # DECODE
-    # =========================
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
-    )
-    # =========================
-    # CLEAN
-    # =========================
-    result = clean_output(result)
-    # =========================
-    # RESPONSE
-    # =========================
-    return {
-        "memory": result
-    }
-# =========================
-# HEALTH
-# =========================
 @app.get("/")
 def root():
-    return {
-        "status": "Recursive Memory Summarizer Running 🚀"
-    }
-# =========================
-# RUN
-# =========================
 if __name__ == "__main__":
-    uvicorn.run(
-        "app:app",
-        host="0.0.0.0",
-        port=7860
-    )

 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
 app = FastAPI()
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+print("🚀 Loading Memory Summarizer...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
+    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    device_map="auto"
 )
+print(f"✅ Loaded on {device.upper()}")
 class SummaryRequest(BaseModel):
     old_memory: str = ""
     user_message: str
     assistant_message: str
+SYSTEM_PROMPT = """You are a memory compression engine.
+Merge OLD MEMORY + NEW CONVERSATION into ONE updated memory blob.
+Rules:
+- Preserve: technical stack, frameworks, APIs, architecture decisions, project goals, unfinished tasks, user preferences
+- Remove: filler, repetition, conversational fluff
+- Output style: dense, third-person, bullet-free, technical
+- Output ONLY the updated memory — no preamble, no explanation, no labels"""
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
+    user_content = f"""OLD MEMORY:
+{req.old_memory if req.old_memory else "(none)"}
+NEW USER MESSAGE:
+{req.user_message}
+NEW ASSISTANT RESPONSE:
+{req.assistant_message}
+UPDATED MEMORY:"""
     messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user",   "content": user_content},
     ]
     text = tokenizer.apply_chat_template(
         add_generation_prompt=True
     )
+    inputs = tokenizer(text, return_tensors="pt").to(model.device)
+    output = model.generate(
+        **inputs,
+        max_new_tokens=200,
+        do_sample=False,
+        repetition_penalty=1.15,
+        eos_token_id=tokenizer.eos_token_id,
+    )
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
+    ).strip()
+    # Strip any leaked stop tokens or role prefixes
+    for stop in ["<|im_end|>", "<|endoftext|>", "UPDATED MEMORY:", "User:", "Assistant:"]:
+        if stop in result:
+            result = result.split(stop)[0].strip()
+    # Deduplicate lines
+    seen, lines = set(), []
+    for line in result.splitlines():
+        line = line.strip()
+        if line and line not in seen:
+            seen.add(line)
+            lines.append(line)
+    return {"memory": " ".join(lines)}
 @app.get("/")
 def root():
+    return {"status": "Memory Summarizer Running 🚀"}
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)