Valtry committed on
Commit
52b7f3e
·
verified ·
1 Parent(s): b1ec228

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -29
app.py CHANGED
@@ -4,14 +4,24 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
  import uvicorn
6
 
 
 
 
 
7
  app = FastAPI()
8
 
 
 
 
 
9
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
10
 
11
  print("🚀 Loading Memory Summarizer...")
12
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
14
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
15
  model = AutoModelForCausalLM.from_pretrained(
16
  MODEL_ID,
17
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
@@ -20,39 +30,91 @@ model = AutoModelForCausalLM.from_pretrained(
20
 
21
  print(f"✅ Loaded on {device.upper()}")
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  class SummaryRequest(BaseModel):
25
  old_memory: str = ""
26
  user_message: str
27
  assistant_message: str
28
 
29
-
30
- SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge EXISTING MEMORY and NEW CONVERSATION into one updated memory paragraph.
31
-
32
- Output rules:
33
- - Write in third-person past tense (e.g. "User built...", "User asked...", "Assistant suggested...")
34
- - One dense paragraph, no bullet points, no headers, no lists
35
- - Preserve ALL technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences
36
- - Add new information from the conversation into the existing memory
37
- - Never drop existing memory facts unless they are directly contradicted by new information
38
- - Never write as an assistant giving advice
39
- - Never use "you" or "I"
40
- - Never explain, never answer, never continue the conversation
41
- - Output ONLY the updated memory paragraph, nothing else
42
- """
43
 
44
  @app.post("/generate-summary")
45
  def generate_summary(req: SummaryRequest):
46
 
47
- user_content = f"""OLD MEMORY:
48
- {req.old_memory if req.old_memory else "(none)"}
49
-
50
- NEW USER MESSAGE:
51
- {req.user_message}
52
-
53
- NEW ASSISTANT RESPONSE:
54
- {req.assistant_message}
55
 
 
 
 
56
  UPDATED MEMORY:"""
57
 
58
  messages = [
@@ -60,13 +122,24 @@ UPDATED MEMORY:"""
60
  {"role": "user", "content": user_content},
61
  ]
62
 
 
 
 
 
63
  text = tokenizer.apply_chat_template(
64
  messages,
65
  tokenize=False,
66
  add_generation_prompt=True
67
  )
68
 
69
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
 
 
 
 
 
 
 
70
 
71
  output = model.generate(
72
  **inputs,
@@ -76,31 +149,64 @@ UPDATED MEMORY:"""
76
  eos_token_id=tokenizer.eos_token_id,
77
  )
78
 
 
 
 
 
79
  result = tokenizer.decode(
80
  output[0][inputs.input_ids.shape[1]:],
81
  skip_special_tokens=True
82
  ).strip()
83
 
84
- # Strip any leaked stop tokens or role prefixes
85
- for stop in ["<|im_end|>", "<|endoftext|>", "UPDATED MEMORY:", "User:", "Assistant:"]:
86
- if stop in result:
87
- result = result.split(stop)[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Deduplicate lines
90
  seen, lines = set(), []
 
91
  for line in result.splitlines():
92
  line = line.strip()
93
  if line and line not in seen:
94
  seen.add(line)
95
  lines.append(line)
96
 
97
- return {"memory": " ".join(lines)}
 
 
98
 
 
 
 
99
 
100
  @app.get("/")
101
  def root():
102
  return {"status": "Memory Summarizer Running 🚀"}
103
 
 
 
 
104
 
105
  if __name__ == "__main__":
106
- uvicorn.run("app:app", host="0.0.0.0", port=7860)
 
 
 
 
 
4
  import torch
5
  import uvicorn
6
 
7
+ # =========================
8
+ # APP
9
+ # =========================
10
+
11
  app = FastAPI()
12
 
13
+ # =========================
14
+ # MODEL
15
+ # =========================
16
+
17
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
18
 
19
  print("🚀 Loading Memory Summarizer...")
20
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
+
23
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
24
+
25
  model = AutoModelForCausalLM.from_pretrained(
26
  MODEL_ID,
27
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
 
30
 
31
  print(f"✅ Loaded on {device.upper()}")
32
 
33
+ # =========================
34
+ # SYSTEM PROMPT
35
+ # =========================
36
+
37
+ SYSTEM_PROMPT = """You are a memory compression engine.
38
+
39
+ EXAMPLE 1:
40
+ EXISTING MEMORY: User building a todo app with React and Firebase.
41
+ USER SAID: Can I add offline support?
42
+ ASSISTANT REPLIED: Use Firebase offline persistence by enabling it in the SDK config.
43
+ UPDATED MEMORY: User building todo app with React and Firebase. Offline persistence enabled via Firebase SDK config.
44
+
45
+ EXAMPLE 2:
46
+ EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. Using JWT for auth.
47
+ USER SAID: How do I add rate limiting?
48
+ ASSISTANT REPLIED: Use slowapi library with FastAPI. Attach the limiter to the app instance and decorate routes.
49
+ UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL using JWT auth and slowapi-based rate limiting on routes.
50
+
51
+ EXAMPLE 3:
52
+ EXISTING MEMORY: User building CLI tool in Python to rename files in bulk. Uses argparse and pathlib.
53
+ USER SAID: I want to add a dry-run mode that shows changes without applying them.
54
+ ASSISTANT REPLIED: Add a --dry-run flag via argparse. When set, print the rename operations instead of executing them.
55
+ UPDATED MEMORY: User building Python CLI bulk rename tool using argparse and pathlib. Supports dry-run mode via --dry-run flag that prints operations without executing.
56
+
57
+ EXAMPLE 4:
58
+ EXISTING MEMORY: (none)
59
+ USER SAID: I am building an e-commerce backend with Django and Stripe for payments.
60
+ ASSISTANT REPLIED: Use stripe-python SDK directly. Handle webhooks via a dedicated endpoint with signature verification.
61
+ UPDATED MEMORY: User building e-commerce backend with Django and Stripe. Payments via stripe-python SDK with webhook endpoint using signature verification.
62
+
63
+ EXAMPLE 5:
64
+ EXISTING MEMORY: User building AI chatbot with FastAPI and Supabase for storage. Supports streaming responses.
65
+ USER SAID: How do I add conversation branching?
66
+ ASSISTANT REPLIED: Store a parent_message_id on each message in Supabase. Query by branch to reconstruct any conversation path.
67
+ UPDATED MEMORY: User building AI chatbot with FastAPI and Supabase. Supports streaming responses and conversation branching via parent_message_id stored per message.
68
+
69
+ EXAMPLE 6:
70
+ EXISTING MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing.
71
+ USER SAID: I want to add face detection now.
72
+ ASSISTANT REPLIED: Use OpenCV Haar cascades or switch to mediapipe for better accuracy on varied lighting.
73
+ UPDATED MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing and face detection via Haar cascades or mediapipe for varied lighting.
74
+
75
+ EXAMPLE 7:
76
+ EXISTING MEMORY: (none)
77
+ USER SAID: I want to build a habit tracker mobile app using Flutter and SQLite.
78
+ ASSISTANT REPLIED: Use sqflite package for SQLite in Flutter. Store habits and daily completion records in separate tables.
79
+ UPDATED MEMORY: User building Flutter habit tracker app using sqflite for SQLite storage. Habits and daily completion records stored in separate tables.
80
+
81
+ EXAMPLE 8:
82
+ EXISTING MEMORY: User building a portfolio website with Next.js and Tailwind. Deployed on Vercel.
83
+ USER SAID: How do I add a blog section with markdown support?
84
+ ASSISTANT REPLIED: Use next-mdx-remote to parse and render markdown files. Store posts as .mdx files in a /content folder.
85
+ UPDATED MEMORY: User building portfolio website with Next.js and Tailwind deployed on Vercel. Blog section added using next-mdx-remote with .mdx files stored in /content folder.
86
+
87
+ Now do the same task.
88
+ Rules:
89
+ - Merge EXISTING MEMORY with the new conversation into one updated memory.
90
+ - Preserve all technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences.
91
+ - Never drop existing memory facts unless directly contradicted by new information.
92
+ - Write in third-person. No "you". No "I".
93
+ - Output ONLY the updated memory. No labels. No explanation. No bullet points. No extra text."""
94
+
95
+ # =========================
96
+ # REQUEST MODEL
97
+ # =========================
98
 
99
  class SummaryRequest(BaseModel):
100
  old_memory: str = ""
101
  user_message: str
102
  assistant_message: str
103
 
104
+ # =========================
105
+ # SUMMARY ENDPOINT
106
+ # =========================
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  @app.post("/generate-summary")
109
  def generate_summary(req: SummaryRequest):
110
 
111
+ old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
112
+ user_message = req.user_message.strip()
113
+ assistant_message = req.assistant_message.strip()[:600]
 
 
 
 
 
114
 
115
+ user_content = f"""EXISTING MEMORY: {old_memory}
116
+ USER SAID: {user_message}
117
+ ASSISTANT REPLIED: {assistant_message}
118
  UPDATED MEMORY:"""
119
 
120
  messages = [
 
122
  {"role": "user", "content": user_content},
123
  ]
124
 
125
+ # =========================
126
+ # FORMAT CHAT
127
+ # =========================
128
+
129
  text = tokenizer.apply_chat_template(
130
  messages,
131
  tokenize=False,
132
  add_generation_prompt=True
133
  )
134
 
135
+ inputs = tokenizer(
136
+ text,
137
+ return_tensors="pt"
138
+ ).to(model.device)
139
+
140
+ # =========================
141
+ # GENERATE
142
+ # =========================
143
 
144
  output = model.generate(
145
  **inputs,
 
149
  eos_token_id=tokenizer.eos_token_id,
150
  )
151
 
152
+ # =========================
153
+ # DECODE
154
+ # =========================
155
+
156
  result = tokenizer.decode(
157
  output[0][inputs.input_ids.shape[1]:],
158
  skip_special_tokens=True
159
  ).strip()
160
 
161
+ # =========================
162
+ # CLEAN OUTPUT
163
+ # =========================
164
+
165
+ stop_phrases = [
166
+ "<|im_end|>",
167
+ "<|endoftext|>",
168
+ "UPDATED MEMORY:",
169
+ "EXISTING MEMORY:",
170
+ "USER SAID:",
171
+ "ASSISTANT REPLIED:",
172
+ "EXAMPLE ",
173
+ "Now do the same",
174
+ "Assistant:",
175
+ "User:",
176
+ ]
177
+
178
+ for phrase in stop_phrases:
179
+ if phrase in result:
180
+ result = result.split(phrase)[0].strip()
181
 
182
  # Deduplicate lines
183
  seen, lines = set(), []
184
+
185
  for line in result.splitlines():
186
  line = line.strip()
187
  if line and line not in seen:
188
  seen.add(line)
189
  lines.append(line)
190
 
191
+ result = " ".join(lines).strip()
192
+
193
+ return {"memory": result}
194
 
195
+ # =========================
196
+ # HEALTH
197
+ # =========================
198
 
199
  @app.get("/")
200
  def root():
201
  return {"status": "Memory Summarizer Running 🚀"}
202
 
203
+ # =========================
204
+ # RUN
205
+ # =========================
206
 
207
  if __name__ == "__main__":
208
+ uvicorn.run(
209
+ "app:app",
210
+ host="0.0.0.0",
211
+ port=7860
212
+ )