Valtry committed on
Commit
c44f9a3
·
verified ·
1 Parent(s): 14f831d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -83
app.py CHANGED
@@ -3,6 +3,7 @@ from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
  import uvicorn
 
6
 
7
  # =========================
8
  # APP
@@ -16,7 +17,7 @@ app = FastAPI()
16
 
17
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
18
 
19
- print("🚀 Loading Memory Summarizer...")
20
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
 
@@ -31,7 +32,7 @@ model = AutoModelForCausalLM.from_pretrained(
31
  print(f"✅ Loaded on {device.upper()}")
32
 
33
  # =========================
34
- # REQUEST MODEL
35
  # =========================
36
 
37
  class SummaryRequest(BaseModel):
@@ -40,78 +41,157 @@ class SummaryRequest(BaseModel):
40
  assistant_message: str
41
 
42
  # =========================
43
- # SUMMARY ENDPOINT
44
  # =========================
45
 
46
- @app.post("/generate-summary")
47
- def generate_summary(req: SummaryRequest):
48
 
49
- prompt = f"""
50
- You are a memory compression engine.
 
 
 
 
 
 
 
 
 
51
 
52
- Your job:
53
- Merge OLD MEMORY with NEW CONVERSATION into ONE updated memory.
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- IMPORTANT RULES:
56
- - Preserve ALL important technical details
57
- - Preserve frameworks, APIs, models, tools, databases
58
- - Preserve architecture decisions
59
- - Preserve project goals
60
- - Preserve unfinished tasks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  - Preserve user preferences
62
- - Remove filler and repetition
 
 
 
 
 
 
 
 
 
 
 
 
63
  - Compress intelligently
 
 
 
64
  - NEVER answer the user
65
- - NEVER explain anything
66
- - NEVER act like assistant
67
- - ONLY return compressed memory
68
 
69
- MEMORY FORMAT:
70
- - Short
71
- - Dense
72
- - Informational
73
- - Technical
74
- - Third-person style
75
 
76
- EXAMPLE:
77
 
78
- OLD MEMORY:
79
- User building AI chatbot with FastAPI.
80
 
81
- NEW USER MESSAGE:
82
- How to add Supabase memory?
83
 
84
- NEW ASSISTANT RESPONSE:
85
- Use Supabase to store conversations and summaries.
86
 
87
- UPDATED MEMORY:
88
- User building AI chatbot using FastAPI and Supabase conversation storage.
 
89
 
90
- NOW DO THE TASK.
 
91
 
92
- OLD MEMORY:
 
 
 
 
 
93
  {req.old_memory}
94
 
95
- NEW USER MESSAGE:
96
  {req.user_message}
97
 
98
- NEW ASSISTANT RESPONSE:
99
  {req.assistant_message}
100
 
101
- UPDATED MEMORY:
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  """
103
 
 
 
 
 
104
  messages = [
 
 
 
 
105
  {
106
  "role": "user",
107
- "content": prompt
108
  }
109
  ]
110
 
111
- # =========================
112
- # FORMAT CHAT
113
- # =========================
114
-
115
  text = tokenizer.apply_chat_template(
116
  messages,
117
  tokenize=False,
@@ -120,7 +200,9 @@ UPDATED MEMORY:
120
 
121
  inputs = tokenizer(
122
  text,
123
- return_tensors="pt"
 
 
124
  ).to(model.device)
125
 
126
  # =========================
@@ -130,10 +212,10 @@ UPDATED MEMORY:
130
  output = model.generate(
131
  **inputs,
132
  max_new_tokens=180,
133
- do_sample=False,
134
- temperature=0.0,
135
- top_p=1.0,
136
- repetition_penalty=1.1,
137
  eos_token_id=tokenizer.eos_token_id
138
  )
139
 
@@ -147,41 +229,10 @@ UPDATED MEMORY:
147
  )
148
 
149
  # =========================
150
- # CLEAN OUTPUT
151
  # =========================
152
 
153
- stop_words = [
154
- "<|im_end|>",
155
- "<|endoftext|>",
156
- "UPDATED MEMORY:",
157
- "Assistant:",
158
- "User:"
159
- ]
160
-
161
- for w in stop_words:
162
- if w in result:
163
- result = result.split(w)[0]
164
-
165
- result = result.strip()
166
-
167
- # remove repeated lines
168
- lines = []
169
- seen = set()
170
-
171
- for line in result.split("\n"):
172
-
173
- line = line.strip()
174
-
175
- if not line:
176
- continue
177
-
178
- if line in seen:
179
- continue
180
-
181
- seen.add(line)
182
- lines.append(line)
183
-
184
- result = " ".join(lines)
185
 
186
  return {
187
  "memory": result
@@ -193,8 +244,9 @@ UPDATED MEMORY:
193
 
194
  @app.get("/")
195
  def root():
 
196
  return {
197
- "status": "Memory Summarizer Running 🚀"
198
  }
199
 
200
  # =========================
@@ -202,6 +254,7 @@ def root():
202
  # =========================
203
 
204
  if __name__ == "__main__":
 
205
  uvicorn.run(
206
  "app:app",
207
  host="0.0.0.0",
 
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
  import uvicorn
6
+ import re
7
 
8
  # =========================
9
  # APP
 
17
 
18
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
19
 
20
+ print("🚀 Loading Recursive Memory Summarizer...")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
 
 
32
  print(f"✅ Loaded on {device.upper()}")
33
 
34
  # =========================
35
+ # REQUEST
36
  # =========================
37
 
38
  class SummaryRequest(BaseModel):
 
41
  assistant_message: str
42
 
43
  # =========================
44
+ # CLEAN
45
  # =========================
46
 
47
+ def clean_output(text):
 
48
 
49
+ stop_words = [
50
+ "<|im_end|>",
51
+ "<|endoftext|>",
52
+ "<|eot_id|>",
53
+ "UPDATED_MEMORY:",
54
+ "MEMORY:",
55
+ "Assistant:",
56
+ "User:"
57
+ ]
58
+
59
+ for w in stop_words:
60
 
61
+ if w in text:
62
+ text = text.split(w)[0]
63
+
64
+ text = text.strip()
65
+
66
+ # remove duplicate lines
67
+ lines = []
68
+ seen = set()
69
+
70
+ for line in text.split("\n"):
71
+
72
+ line = line.strip()
73
+
74
+ if not line:
75
+ continue
76
 
77
+ if line in seen:
78
+ continue
79
+
80
+ seen.add(line)
81
+ lines.append(line)
82
+
83
+ text = "\n".join(lines)
84
+
85
+ # remove too many spaces
86
+ text = re.sub(r"\n+", "\n", text)
87
+
88
+ return text.strip()
89
+
90
+ # =========================
91
+ # SYSTEM PROMPT
92
+ # =========================
93
+
94
+ SYSTEM_PROMPT = """
95
+ You are a recursive AI memory summarization engine.
96
+
97
+ Your ONLY task:
98
+ Maintain long-term conversational memory.
99
+
100
+ IMPORTANT:
101
+ This memory is used by another AI model later.
102
+
103
+ GOALS:
104
+ - Preserve important discussion context
105
+ - Preserve coding discussions
106
+ - Preserve project details
107
+ - Preserve goals
108
+ - Preserve plans
109
+ - Preserve technical information
110
  - Preserve user preferences
111
+ - Preserve ongoing tasks
112
+ - Preserve implementation ideas
113
+ - Preserve important explanations
114
+
115
+ REMOVE:
116
+ - filler
117
+ - greetings
118
+ - repetition
119
+ - unnecessary wording
120
+ - casual conversation fluff
121
+
122
+ RULES:
123
+ - Merge old memory with new conversation
124
  - Compress intelligently
125
+ - Keep important meaning
126
+ - Keep memory compact
127
+ - Keep memory understandable for another AI
128
  - NEVER answer the user
129
+ - NEVER explain
130
+ - ONLY output updated memory
 
131
 
132
+ GOOD MEMORY STYLE:
 
 
 
 
 
133
 
134
+ User building local AI assistant using FastAPI and llama.cpp. Uses Supabase storage and streaming responses. Implementing recursive memory summarization and title generation using lightweight Qwen models.
135
 
136
+ BAD MEMORY STYLE:
 
137
 
138
+ The user asked this. The assistant replied this.
 
139
 
140
+ ONLY OUTPUT MEMORY.
141
+ """
142
 
143
+ # =========================
144
+ # SUMMARY ENDPOINT
145
+ # =========================
146
 
147
+ @app.post("/generate-summary")
148
+ def generate_summary(req: SummaryRequest):
149
 
150
+ # =========================
151
+ # USER PROMPT
152
+ # =========================
153
+
154
+ user_prompt = f"""
155
+ OLD_MEMORY:
156
  {req.old_memory}
157
 
158
+ NEW_USER_MESSAGE:
159
  {req.user_message}
160
 
161
+ NEW_ASSISTANT_MESSAGE:
162
  {req.assistant_message}
163
 
164
+ TASK:
165
+ Generate updated long-term memory summary.
166
+
167
+ IMPORTANT:
168
+ - Merge previous memory with new discussion
169
+ - Preserve technical/coding context
170
+ - Preserve important conversation flow
171
+ - Preserve ongoing project details
172
+ - Preserve implementation discussions
173
+ - Preserve future plans/goals
174
+ - Keep compact but meaningful
175
+ - Keep understandable for another AI model
176
+
177
+ UPDATED_MEMORY:
178
  """
179
 
180
+ # =========================
181
+ # CHAT FORMAT
182
+ # =========================
183
+
184
  messages = [
185
+ {
186
+ "role": "system",
187
+ "content": SYSTEM_PROMPT
188
+ },
189
  {
190
  "role": "user",
191
+ "content": user_prompt
192
  }
193
  ]
194
 
 
 
 
 
195
  text = tokenizer.apply_chat_template(
196
  messages,
197
  tokenize=False,
 
200
 
201
  inputs = tokenizer(
202
  text,
203
+ return_tensors="pt",
204
+ truncation=True,
205
+ max_length=4096
206
  ).to(model.device)
207
 
208
  # =========================
 
212
  output = model.generate(
213
  **inputs,
214
  max_new_tokens=180,
215
+ do_sample=True,
216
+ temperature=0.2,
217
+ top_p=0.9,
218
+ repetition_penalty=1.15,
219
  eos_token_id=tokenizer.eos_token_id
220
  )
221
 
 
229
  )
230
 
231
  # =========================
232
+ # CLEAN
233
  # =========================
234
 
235
+ result = clean_output(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
  return {
238
  "memory": result
 
244
 
245
  @app.get("/")
246
  def root():
247
+
248
  return {
249
+ "status": "Recursive Memory Summarizer Running 🚀"
250
  }
251
 
252
  # =========================
 
254
  # =========================
255
 
256
  if __name__ == "__main__":
257
+
258
  uvicorn.run(
259
  "app:app",
260
  host="0.0.0.0",