Spaces:

Valtry
/

summarizer

Sleeping

App Files Files Community

Valtry committed 21 days ago

Commit

c44f9a3

verified ·

1 Parent(s): 14f831d

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -83

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
 # =========================
 # APP
@@ -16,7 +17,7 @@ app = FastAPI()
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-print("🚀 Loading Memory Summarizer...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -31,7 +32,7 @@ model = AutoModelForCausalLM.from_pretrained(
 print(f"✅ Loaded on {device.upper()}")
 # =========================
-# REQUEST MODEL
 # =========================
 class SummaryRequest(BaseModel):
@@ -40,78 +41,157 @@ class SummaryRequest(BaseModel):
     assistant_message: str
 # =========================
-# SUMMARY ENDPOINT
 # =========================
-@app.post("/generate-summary")
-def generate_summary(req: SummaryRequest):
-    prompt = f"""
-You are a memory compression engine.
-Your job:
-Merge OLD MEMORY with NEW CONVERSATION into ONE updated memory.
-IMPORTANT RULES:
-- Preserve ALL important technical details
-- Preserve frameworks, APIs, models, tools, databases
-- Preserve architecture decisions
-- Preserve project goals
-- Preserve unfinished tasks
 - Preserve user preferences
-- Remove filler and repetition
 - Compress intelligently
 - NEVER answer the user
-- NEVER explain anything
-- NEVER act like assistant
-- ONLY return compressed memory
-MEMORY FORMAT:
-- Short
-- Dense
-- Informational
-- Technical
-- Third-person style
-EXAMPLE:
-OLD MEMORY:
-User building AI chatbot with FastAPI.
-NEW USER MESSAGE:
-How to add Supabase memory?
-NEW ASSISTANT RESPONSE:
-Use Supabase to store conversations and summaries.
-UPDATED MEMORY:
-User building AI chatbot using FastAPI and Supabase conversation storage.
-NOW DO THE TASK.
-OLD MEMORY:
 {req.old_memory}
-NEW USER MESSAGE:
 {req.user_message}
-NEW ASSISTANT RESPONSE:
 {req.assistant_message}
-UPDATED MEMORY:
 """
     messages = [
         {
             "role": "user",
-            "content": prompt
         }
     ]
-    # =========================
-    # FORMAT CHAT
-    # =========================
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
@@ -120,7 +200,9 @@ UPDATED MEMORY:
     inputs = tokenizer(
         text,
-        return_tensors="pt"
     ).to(model.device)
     # =========================
@@ -130,10 +212,10 @@ UPDATED MEMORY:
     output = model.generate(
         **inputs,
         max_new_tokens=180,
-        do_sample=False,
-        temperature=0.0,
-        top_p=1.0,
-        repetition_penalty=1.1,
         eos_token_id=tokenizer.eos_token_id
     )
@@ -147,41 +229,10 @@ UPDATED MEMORY:
     )
     # =========================
-    # CLEAN OUTPUT
     # =========================
-    stop_words = [
-        "<|im_end|>",
-        "<|endoftext|>",
-        "UPDATED MEMORY:",
-        "Assistant:",
-        "User:"
-    ]
-    for w in stop_words:
-        if w in result:
-            result = result.split(w)[0]
-    result = result.strip()
-    # remove repeated lines
-    lines = []
-    seen = set()
-    for line in result.split("\n"):
-        line = line.strip()
-        if not line:
-            continue
-        if line in seen:
-            continue
-        seen.add(line)
-        lines.append(line)
-    result = " ".join(lines)
     return {
         "memory": result
@@ -193,8 +244,9 @@ UPDATED MEMORY:
 @app.get("/")
 def root():
     return {
-        "status": "Memory Summarizer Running 🚀"
     }
 # =========================
@@ -202,6 +254,7 @@ def root():
 # =========================
 if __name__ == "__main__":
     uvicorn.run(
         "app:app",
         host="0.0.0.0",

 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
+import re
 # =========================
 # APP
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
+print("🚀 Loading Recursive Memory Summarizer...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"✅ Loaded on {device.upper()}")
 # =========================
+# REQUEST
 # =========================
 class SummaryRequest(BaseModel):
     assistant_message: str
 # =========================
+# CLEAN
 # =========================
def clean_output(text: str) -> str:
    """Normalize raw model output into a compact memory string.

    Steps, in order:
      1. Strip surrounding whitespace and drop any memory label
         ("UPDATED_MEMORY:" / "MEMORY:") that the model echoed at the very
         start of its answer.
      2. Truncate at the first special token or label that still remains,
         so anything generated after the memory is discarded.
      3. Strip every line, drop blank lines, and remove repeated lines while
         keeping first-occurrence order.

    Args:
        text: Decoded generation output. May be empty.

    Returns:
        The cleaned memory, one fact per line, with no leading/trailing
        whitespace and no blank lines. Returns "" for empty input.
    """
    if not text:
        return ""

    special_tokens = ("<|im_end|>", "<|endoftext|>", "<|eot_id|>")
    memory_labels = ("UPDATED_MEMORY:", "MEMORY:")
    role_labels = ("Assistant:", "User:")

    text = text.strip()

    # The prompt ends with "UPDATED_MEMORY:", so the model frequently repeats
    # that label before the actual memory. Truncating on it directly would
    # throw the whole answer away, so peel leading memory labels off first.
    while True:
        for label in memory_labels:
            if text.startswith(label):
                text = text[len(label):].lstrip()
                break
        else:
            break

    # Keep only what precedes the earliest remaining marker.
    for marker in special_tokens + memory_labels + role_labels:
        text = text.partition(marker)[0]

    # Order-preserving de-duplication of non-empty, stripped lines.
    stripped_lines = (line.strip() for line in text.split("\n"))
    unique_lines = dict.fromkeys(line for line in stripped_lines if line)
    return "\n".join(unique_lines)
+# =========================
+# SYSTEM PROMPT
+# =========================
# Instruction text for the summarizer model. generate_summary sends it as the
# "system" message of the chat prompt; the per-request old memory and the new
# user/assistant messages are supplied separately in the "user" message.
+SYSTEM_PROMPT = """
+You are a recursive AI memory summarization engine.
+Your ONLY task:
+Maintain long-term conversational memory.
+IMPORTANT:
+This memory is used by another AI model later.
+GOALS:
+- Preserve important discussion context
+- Preserve coding discussions
+- Preserve project details
+- Preserve goals
+- Preserve plans
+- Preserve technical information
 - Preserve user preferences
+- Preserve ongoing tasks
+- Preserve implementation ideas
+- Preserve important explanations
+REMOVE:
+- filler
+- greetings
+- repetition
+- unnecessary wording
+- casual conversation fluff
+RULES:
+- Merge old memory with new conversation
 - Compress intelligently
+- Keep important meaning
+- Keep memory compact
+- Keep memory understandable for another AI
 - NEVER answer the user
+- NEVER explain
+- ONLY output updated memory
+GOOD MEMORY STYLE:
+User building local AI assistant using FastAPI and llama.cpp. Uses Supabase storage and streaming responses. Implementing recursive memory summarization and title generation using lightweight Qwen models.
+BAD MEMORY STYLE:
+The user asked this. The assistant replied this.
+ONLY OUTPUT MEMORY.
+"""
+# =========================
+# SUMMARY ENDPOINT
+# =========================
+@app.post("/generate-summary")
+def generate_summary(req: SummaryRequest):
+    # =========================
+    # USER PROMPT
+    # =========================
+    user_prompt = f"""
+OLD_MEMORY:
 {req.old_memory}
+NEW_USER_MESSAGE:
 {req.user_message}
+NEW_ASSISTANT_MESSAGE:
 {req.assistant_message}
+TASK:
+Generate updated long-term memory summary.
+IMPORTANT:
+- Merge previous memory with new discussion
+- Preserve technical/coding context
+- Preserve important conversation flow
+- Preserve ongoing project details
+- Preserve implementation discussions
+- Preserve future plans/goals
+- Keep compact but meaningful
+- Keep understandable for another AI model
+UPDATED_MEMORY:
 """
+    # =========================
+    # CHAT FORMAT
+    # =========================
     messages = [
+        {
+            "role": "system",
+            "content": SYSTEM_PROMPT
+        },
         {
             "role": "user",
+            "content": user_prompt
         }
     ]
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
     inputs = tokenizer(
         text,
+        return_tensors="pt",
+        truncation=True,
+        max_length=4096
     ).to(model.device)
     # =========================
     output = model.generate(
         **inputs,
         max_new_tokens=180,
+        do_sample=True,
+        temperature=0.2,
+        top_p=0.9,
+        repetition_penalty=1.15,
         eos_token_id=tokenizer.eos_token_id
     )
     )
     # =========================
+    # CLEAN
     # =========================
+    result = clean_output(result)
     return {
         "memory": result
 # GET / handler: takes no input and returns a fixed one-key status payload.
 @app.get("/")
 def root():
     return {
+        "status": "Recursive Memory Summarizer Running 🚀"
     }
 # =========================
 # =========================
 if __name__ == "__main__":
     uvicorn.run(
         "app:app",
         host="0.0.0.0",