Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,14 +4,24 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
| 4 |
import torch
|
| 5 |
import uvicorn
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
app = FastAPI()
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 10 |
|
| 11 |
print("🚀 Loading Memory Summarizer...")
|
| 12 |
|
| 13 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 14 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
|
|
| 15 |
model = AutoModelForCausalLM.from_pretrained(
|
| 16 |
MODEL_ID,
|
| 17 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
|
@@ -20,39 +30,91 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 20 |
|
| 21 |
print(f"✅ Loaded on {device.upper()}")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
class SummaryRequest(BaseModel):
|
| 25 |
old_memory: str = ""
|
| 26 |
user_message: str
|
| 27 |
assistant_message: str
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
Output rules:
|
| 33 |
-
- Write in third-person past tense (e.g. "User built...", "User asked...", "Assistant suggested...")
|
| 34 |
-
- One dense paragraph, no bullet points, no headers, no lists
|
| 35 |
-
- Preserve ALL technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences
|
| 36 |
-
- Add new information from the conversation into the existing memory
|
| 37 |
-
- Never drop existing memory facts unless they are directly contradicted by new information
|
| 38 |
-
- Never write as an assistant giving advice
|
| 39 |
-
- Never use "you" or "I"
|
| 40 |
-
- Never explain, never answer, never continue the conversation
|
| 41 |
-
- Output ONLY the updated memory paragraph, nothing else
|
| 42 |
-
"""
|
| 43 |
|
| 44 |
@app.post("/generate-summary")
|
| 45 |
def generate_summary(req: SummaryRequest):
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
NEW USER MESSAGE:
|
| 51 |
-
{req.user_message}
|
| 52 |
-
|
| 53 |
-
NEW ASSISTANT RESPONSE:
|
| 54 |
-
{req.assistant_message}
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
UPDATED MEMORY:"""
|
| 57 |
|
| 58 |
messages = [
|
|
@@ -60,13 +122,24 @@ UPDATED MEMORY:"""
|
|
| 60 |
{"role": "user", "content": user_content},
|
| 61 |
]
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
text = tokenizer.apply_chat_template(
|
| 64 |
messages,
|
| 65 |
tokenize=False,
|
| 66 |
add_generation_prompt=True
|
| 67 |
)
|
| 68 |
|
| 69 |
-
inputs = tokenizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
output = model.generate(
|
| 72 |
**inputs,
|
|
@@ -76,31 +149,64 @@ UPDATED MEMORY:"""
|
|
| 76 |
eos_token_id=tokenizer.eos_token_id,
|
| 77 |
)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
result = tokenizer.decode(
|
| 80 |
output[0][inputs.input_ids.shape[1]:],
|
| 81 |
skip_special_tokens=True
|
| 82 |
).strip()
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
# Deduplicate lines
|
| 90 |
seen, lines = set(), []
|
|
|
|
| 91 |
for line in result.splitlines():
|
| 92 |
line = line.strip()
|
| 93 |
if line and line not in seen:
|
| 94 |
seen.add(line)
|
| 95 |
lines.append(line)
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
@app.get("/")
|
| 101 |
def root():
|
| 102 |
return {"status": "Memory Summarizer Running 🚀"}
|
| 103 |
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
if __name__ == "__main__":
|
| 106 |
-
uvicorn.run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import torch
|
| 5 |
import uvicorn
|
| 6 |
|
| 7 |
+
# =========================
|
| 8 |
+
# APP
|
| 9 |
+
# =========================
|
| 10 |
+
|
| 11 |
app = FastAPI()
|
| 12 |
|
| 13 |
+
# =========================
|
| 14 |
+
# MODEL
|
| 15 |
+
# =========================
|
| 16 |
+
|
| 17 |
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 18 |
|
| 19 |
print("🚀 Loading Memory Summarizer...")
|
| 20 |
|
| 21 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 22 |
+
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 24 |
+
|
| 25 |
model = AutoModelForCausalLM.from_pretrained(
|
| 26 |
MODEL_ID,
|
| 27 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
|
|
|
| 30 |
|
| 31 |
print(f"✅ Loaded on {device.upper()}")
|
| 32 |
|
| 33 |
+
# =========================
|
| 34 |
+
# SYSTEM PROMPT
|
| 35 |
+
# =========================
|
| 36 |
+
|
| 37 |
+
SYSTEM_PROMPT = """You are a memory compression engine.
|
| 38 |
+
|
| 39 |
+
EXAMPLE 1:
|
| 40 |
+
EXISTING MEMORY: User building a todo app with React and Firebase.
|
| 41 |
+
USER SAID: Can I add offline support?
|
| 42 |
+
ASSISTANT REPLIED: Use Firebase offline persistence by enabling it in the SDK config.
|
| 43 |
+
UPDATED MEMORY: User building todo app with React and Firebase. Offline persistence enabled via Firebase SDK config.
|
| 44 |
+
|
| 45 |
+
EXAMPLE 2:
|
| 46 |
+
EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. Using JWT for auth.
|
| 47 |
+
USER SAID: How do I add rate limiting?
|
| 48 |
+
ASSISTANT REPLIED: Use slowapi library with FastAPI. Attach the limiter to the app instance and decorate routes.
|
| 49 |
+
UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL using JWT auth and slowapi-based rate limiting on routes.
|
| 50 |
+
|
| 51 |
+
EXAMPLE 3:
|
| 52 |
+
EXISTING MEMORY: User building CLI tool in Python to rename files in bulk. Uses argparse and pathlib.
|
| 53 |
+
USER SAID: I want to add a dry-run mode that shows changes without applying them.
|
| 54 |
+
ASSISTANT REPLIED: Add a --dry-run flag via argparse. When set, print the rename operations instead of executing them.
|
| 55 |
+
UPDATED MEMORY: User building Python CLI bulk rename tool using argparse and pathlib. Supports dry-run mode via --dry-run flag that prints operations without executing.
|
| 56 |
+
|
| 57 |
+
EXAMPLE 4:
|
| 58 |
+
EXISTING MEMORY: (none)
|
| 59 |
+
USER SAID: I am building an e-commerce backend with Django and Stripe for payments.
|
| 60 |
+
ASSISTANT REPLIED: Use stripe-python SDK directly. Handle webhooks via a dedicated endpoint with signature verification.
|
| 61 |
+
UPDATED MEMORY: User building e-commerce backend with Django and Stripe. Payments via stripe-python SDK with webhook endpoint using signature verification.
|
| 62 |
+
|
| 63 |
+
EXAMPLE 5:
|
| 64 |
+
EXISTING MEMORY: User building AI chatbot with FastAPI and Supabase for storage. Supports streaming responses.
|
| 65 |
+
USER SAID: How do I add conversation branching?
|
| 66 |
+
ASSISTANT REPLIED: Store a parent_message_id on each message in Supabase. Query by branch to reconstruct any conversation path.
|
| 67 |
+
UPDATED MEMORY: User building AI chatbot with FastAPI and Supabase. Supports streaming responses and conversation branching via parent_message_id stored per message.
|
| 68 |
+
|
| 69 |
+
EXAMPLE 6:
|
| 70 |
+
EXISTING MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing.
|
| 71 |
+
USER SAID: I want to add face detection now.
|
| 72 |
+
ASSISTANT REPLIED: Use OpenCV Haar cascades or switch to mediapipe for better accuracy on varied lighting.
|
| 73 |
+
UPDATED MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing and face detection via Haar cascades or mediapipe for varied lighting.
|
| 74 |
+
|
| 75 |
+
EXAMPLE 7:
|
| 76 |
+
EXISTING MEMORY: (none)
|
| 77 |
+
USER SAID: I want to build a habit tracker mobile app using Flutter and SQLite.
|
| 78 |
+
ASSISTANT REPLIED: Use sqflite package for SQLite in Flutter. Store habits and daily completion records in separate tables.
|
| 79 |
+
UPDATED MEMORY: User building Flutter habit tracker app using sqflite for SQLite storage. Habits and daily completion records stored in separate tables.
|
| 80 |
+
|
| 81 |
+
EXAMPLE 8:
|
| 82 |
+
EXISTING MEMORY: User building a portfolio website with Next.js and Tailwind. Deployed on Vercel.
|
| 83 |
+
USER SAID: How do I add a blog section with markdown support?
|
| 84 |
+
ASSISTANT REPLIED: Use next-mdx-remote to parse and render markdown files. Store posts as .mdx files in a /content folder.
|
| 85 |
+
UPDATED MEMORY: User building portfolio website with Next.js and Tailwind deployed on Vercel. Blog section added using next-mdx-remote with .mdx files stored in /content folder.
|
| 86 |
+
|
| 87 |
+
Now do the same task.
|
| 88 |
+
Rules:
|
| 89 |
+
- Merge EXISTING MEMORY with the new conversation into one updated memory.
|
| 90 |
+
- Preserve all technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences.
|
| 91 |
+
- Never drop existing memory facts unless directly contradicted by new information.
|
| 92 |
+
- Write in third-person. No "you". No "I".
|
| 93 |
+
- Output ONLY the updated memory. No labels. No explanation. No bullet points. No extra text."""
|
| 94 |
+
|
| 95 |
+
# =========================
|
| 96 |
+
# REQUEST MODEL
|
| 97 |
+
# =========================
|
| 98 |
|
| 99 |
class SummaryRequest(BaseModel):
|
| 100 |
old_memory: str = ""
|
| 101 |
user_message: str
|
| 102 |
assistant_message: str
|
| 103 |
|
| 104 |
+
# =========================
|
| 105 |
+
# SUMMARY ENDPOINT
|
| 106 |
+
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
@app.post("/generate-summary")
|
| 109 |
def generate_summary(req: SummaryRequest):
|
| 110 |
|
| 111 |
+
old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
|
| 112 |
+
user_message = req.user_message.strip()
|
| 113 |
+
assistant_message = req.assistant_message.strip()[:600]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
user_content = f"""EXISTING MEMORY: {old_memory}
|
| 116 |
+
USER SAID: {user_message}
|
| 117 |
+
ASSISTANT REPLIED: {assistant_message}
|
| 118 |
UPDATED MEMORY:"""
|
| 119 |
|
| 120 |
messages = [
|
|
|
|
| 122 |
{"role": "user", "content": user_content},
|
| 123 |
]
|
| 124 |
|
| 125 |
+
# =========================
|
| 126 |
+
# FORMAT CHAT
|
| 127 |
+
# =========================
|
| 128 |
+
|
| 129 |
text = tokenizer.apply_chat_template(
|
| 130 |
messages,
|
| 131 |
tokenize=False,
|
| 132 |
add_generation_prompt=True
|
| 133 |
)
|
| 134 |
|
| 135 |
+
inputs = tokenizer(
|
| 136 |
+
text,
|
| 137 |
+
return_tensors="pt"
|
| 138 |
+
).to(model.device)
|
| 139 |
+
|
| 140 |
+
# =========================
|
| 141 |
+
# GENERATE
|
| 142 |
+
# =========================
|
| 143 |
|
| 144 |
output = model.generate(
|
| 145 |
**inputs,
|
|
|
|
| 149 |
eos_token_id=tokenizer.eos_token_id,
|
| 150 |
)
|
| 151 |
|
| 152 |
+
# =========================
|
| 153 |
+
# DECODE
|
| 154 |
+
# =========================
|
| 155 |
+
|
| 156 |
result = tokenizer.decode(
|
| 157 |
output[0][inputs.input_ids.shape[1]:],
|
| 158 |
skip_special_tokens=True
|
| 159 |
).strip()
|
| 160 |
|
| 161 |
+
# =========================
|
| 162 |
+
# CLEAN OUTPUT
|
| 163 |
+
# =========================
|
| 164 |
+
|
| 165 |
+
stop_phrases = [
|
| 166 |
+
"<|im_end|>",
|
| 167 |
+
"<|endoftext|>",
|
| 168 |
+
"UPDATED MEMORY:",
|
| 169 |
+
"EXISTING MEMORY:",
|
| 170 |
+
"USER SAID:",
|
| 171 |
+
"ASSISTANT REPLIED:",
|
| 172 |
+
"EXAMPLE ",
|
| 173 |
+
"Now do the same",
|
| 174 |
+
"Assistant:",
|
| 175 |
+
"User:",
|
| 176 |
+
]
|
| 177 |
+
|
| 178 |
+
for phrase in stop_phrases:
|
| 179 |
+
if phrase in result:
|
| 180 |
+
result = result.split(phrase)[0].strip()
|
| 181 |
|
| 182 |
# Deduplicate lines
|
| 183 |
seen, lines = set(), []
|
| 184 |
+
|
| 185 |
for line in result.splitlines():
|
| 186 |
line = line.strip()
|
| 187 |
if line and line not in seen:
|
| 188 |
seen.add(line)
|
| 189 |
lines.append(line)
|
| 190 |
|
| 191 |
+
result = " ".join(lines).strip()
|
| 192 |
+
|
| 193 |
+
return {"memory": result}
|
| 194 |
|
| 195 |
+
# =========================
|
| 196 |
+
# HEALTH
|
| 197 |
+
# =========================
|
| 198 |
|
| 199 |
@app.get("/")
|
| 200 |
def root():
|
| 201 |
return {"status": "Memory Summarizer Running 🚀"}
|
| 202 |
|
| 203 |
+
# =========================
|
| 204 |
+
# RUN
|
| 205 |
+
# =========================
|
| 206 |
|
| 207 |
if __name__ == "__main__":
|
| 208 |
+
uvicorn.run(
|
| 209 |
+
"app:app",
|
| 210 |
+
host="0.0.0.0",
|
| 211 |
+
port=7860
|
| 212 |
+
)
|