Spaces:

Valtry
/

summarizer

Sleeping

App Files Community

Valtry committed 18 days ago

Commit

52b7f3e

verified ·

1 Parent(s): b1ec228

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -29

app.py CHANGED Viewed

@@ -4,14 +4,24 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
 app = FastAPI()
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 print("🚀 Loading Memory Summarizer...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
@@ -20,39 +30,91 @@ model = AutoModelForCausalLM.from_pretrained(
 print(f"✅ Loaded on {device.upper()}")
 class SummaryRequest(BaseModel):
     old_memory: str = ""
     user_message: str
     assistant_message: str
-SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge EXISTING MEMORY and NEW CONVERSATION into one updated memory paragraph.
-Output rules:
-- Write in third-person past tense (e.g. "User built...", "User asked...", "Assistant suggested...")
-- One dense paragraph, no bullet points, no headers, no lists
-- Preserve ALL technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences
-- Add new information from the conversation into the existing memory
-- Never drop existing memory facts unless they are directly contradicted by new information
-- Never write as an assistant giving advice
-- Never use "you" or "I"
-- Never explain, never answer, never continue the conversation
-- Output ONLY the updated memory paragraph, nothing else
-"""
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
-    user_content = f"""OLD MEMORY:
-{req.old_memory if req.old_memory else "(none)"}
-NEW USER MESSAGE:
-{req.user_message}
-NEW ASSISTANT RESPONSE:
-{req.assistant_message}
 UPDATED MEMORY:"""
     messages = [
@@ -60,13 +122,24 @@ UPDATED MEMORY:"""
         {"role": "user",   "content": user_content},
     ]
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
-    inputs = tokenizer(text, return_tensors="pt").to(model.device)
     output = model.generate(
         **inputs,
@@ -76,31 +149,64 @@ UPDATED MEMORY:"""
         eos_token_id=tokenizer.eos_token_id,
     )
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
     ).strip()
-    # Strip any leaked stop tokens or role prefixes
-    for stop in ["<|im_end|>", "<|endoftext|>", "UPDATED MEMORY:", "User:", "Assistant:"]:
-        if stop in result:
-            result = result.split(stop)[0].strip()
     # Deduplicate lines
     seen, lines = set(), []
     for line in result.splitlines():
         line = line.strip()
         if line and line not in seen:
             seen.add(line)
             lines.append(line)
-    return {"memory": " ".join(lines)}
 @app.get("/")
 def root():
     return {"status": "Memory Summarizer Running 🚀"}
 if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=7860)

 import torch
 import uvicorn
+# =========================
+# APP
+# =========================
 app = FastAPI()
+# =========================
+# MODEL
+# =========================
 MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 print("🚀 Loading Memory Summarizer...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
 print(f"✅ Loaded on {device.upper()}")
+# =========================
+# SYSTEM PROMPT
+# =========================
+SYSTEM_PROMPT = """You are a memory compression engine.
+EXAMPLE 1:
+EXISTING MEMORY: User building a todo app with React and Firebase.
+USER SAID: Can I add offline support?
+ASSISTANT REPLIED: Use Firebase offline persistence by enabling it in the SDK config.
+UPDATED MEMORY: User building todo app with React and Firebase. Offline persistence enabled via Firebase SDK config.
+EXAMPLE 2:
+EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. Using JWT for auth.
+USER SAID: How do I add rate limiting?
+ASSISTANT REPLIED: Use slowapi library with FastAPI. Attach the limiter to the app instance and decorate routes.
+UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL using JWT auth and slowapi-based rate limiting on routes.
+EXAMPLE 3:
+EXISTING MEMORY: User building CLI tool in Python to rename files in bulk. Uses argparse and pathlib.
+USER SAID: I want to add a dry-run mode that shows changes without applying them.
+ASSISTANT REPLIED: Add a --dry-run flag via argparse. When set, print the rename operations instead of executing them.
+UPDATED MEMORY: User building Python CLI bulk rename tool using argparse and pathlib. Supports dry-run mode via --dry-run flag that prints operations without executing.
+EXAMPLE 4:
+EXISTING MEMORY: (none)
+USER SAID: I am building an e-commerce backend with Django and Stripe for payments.
+ASSISTANT REPLIED: Use stripe-python SDK directly. Handle webhooks via a dedicated endpoint with signature verification.
+UPDATED MEMORY: User building e-commerce backend with Django and Stripe. Payments via stripe-python SDK with webhook endpoint using signature verification.
+EXAMPLE 5:
+EXISTING MEMORY: User building AI chatbot with FastAPI and Supabase for storage. Supports streaming responses.
+USER SAID: How do I add conversation branching?
+ASSISTANT REPLIED: Store a parent_message_id on each message in Supabase. Query by branch to reconstruct any conversation path.
+UPDATED MEMORY: User building AI chatbot with FastAPI and Supabase. Supports streaming responses and conversation branching via parent_message_id stored per message.
+EXAMPLE 6:
+EXISTING MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing.
+USER SAID: I want to add face detection now.
+ASSISTANT REPLIED: Use OpenCV Haar cascades or switch to mediapipe for better accuracy on varied lighting.
+UPDATED MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing and face detection via Haar cascades or mediapipe for varied lighting.
+EXAMPLE 7:
+EXISTING MEMORY: (none)
+USER SAID: I want to build a habit tracker mobile app using Flutter and SQLite.
+ASSISTANT REPLIED: Use sqflite package for SQLite in Flutter. Store habits and daily completion records in separate tables.
+UPDATED MEMORY: User building Flutter habit tracker app using sqflite for SQLite storage. Habits and daily completion records stored in separate tables.
+EXAMPLE 8:
+EXISTING MEMORY: User building a portfolio website with Next.js and Tailwind. Deployed on Vercel.
+USER SAID: How do I add a blog section with markdown support?
+ASSISTANT REPLIED: Use next-mdx-remote to parse and render markdown files. Store posts as .mdx files in a /content folder.
+UPDATED MEMORY: User building portfolio website with Next.js and Tailwind deployed on Vercel. Blog section added using next-mdx-remote with .mdx files stored in /content folder.
+Now do the same task.
+Rules:
+- Merge EXISTING MEMORY with the new conversation into one updated memory.
+- Preserve all technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences.
+- Never drop existing memory facts unless directly contradicted by new information.
+- Write in third-person. No "you". No "I".
+- Output ONLY the updated memory. No labels. No explanation. No bullet points. No extra text."""
+# =========================
+# REQUEST MODEL
+# =========================
 class SummaryRequest(BaseModel):
     old_memory: str = ""
     user_message: str
     assistant_message: str
+# =========================
+# SUMMARY ENDPOINT
+# =========================
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
+    old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
+    user_message = req.user_message.strip()
+    assistant_message = req.assistant_message.strip()[:600]
+    user_content = f"""EXISTING MEMORY: {old_memory}
+USER SAID: {user_message}
+ASSISTANT REPLIED: {assistant_message}
 UPDATED MEMORY:"""
     messages = [
         {"role": "user",   "content": user_content},
     ]
+    # =========================
+    # FORMAT CHAT
+    # =========================
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
+    inputs = tokenizer(
+        text,
+        return_tensors="pt"
+    ).to(model.device)
+    # =========================
+    # GENERATE
+    # =========================
     output = model.generate(
         **inputs,
         eos_token_id=tokenizer.eos_token_id,
     )
+    # =========================
+    # DECODE
+    # =========================
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
     ).strip()
+    # =========================
+    # CLEAN OUTPUT
+    # =========================
+    stop_phrases = [
+        "<|im_end|>",
+        "<|endoftext|>",
+        "UPDATED MEMORY:",
+        "EXISTING MEMORY:",
+        "USER SAID:",
+        "ASSISTANT REPLIED:",
+        "EXAMPLE ",
+        "Now do the same",
+        "Assistant:",
+        "User:",
+    ]
+    for phrase in stop_phrases:
+        if phrase in result:
+            result = result.split(phrase)[0].strip()
     # Deduplicate lines
     seen, lines = set(), []
     for line in result.splitlines():
         line = line.strip()
         if line and line not in seen:
             seen.add(line)
             lines.append(line)
+    result = " ".join(lines).strip()
+    return {"memory": result}
+# =========================
+# HEALTH
+# =========================
 @app.get("/")
 def root():
     return {"status": "Memory Summarizer Running 🚀"}
+# =========================
+# RUN
+# =========================
 if __name__ == "__main__":
+    uvicorn.run(
+        "app:app",
+        host="0.0.0.0",
+        port=7860
+    )