Valtry committed on
Commit
d795cbb
·
verified ·
1 Parent(s): d040ad0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -234
app.py CHANGED
@@ -3,211 +3,57 @@ from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
  import uvicorn
6
- import re
7
-
8
- # =========================
9
- # APP
10
- # =========================
11
 
12
  app = FastAPI()
13
 
14
- # =========================
15
- # MODEL
16
- # =========================
17
-
18
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
19
 
20
- print("🚀 Loading Recursive Memory Summarizer...")
21
-
22
- device = torch.device(
23
- "cuda" if torch.cuda.is_available() else "cpu"
24
- )
25
-
26
- # =========================
27
- # TOKENIZER
28
- # =========================
29
-
30
- tokenizer = AutoTokenizer.from_pretrained(
31
- MODEL_ID,
32
- trust_remote_code=True
33
- )
34
-
35
- # =========================
36
- # MODEL LOAD
37
- # =========================
38
 
 
 
39
  model = AutoModelForCausalLM.from_pretrained(
40
  MODEL_ID,
41
- trust_remote_code=True
 
42
  )
43
 
44
- model = model.to(device)
45
-
46
- print(f"✅ Loaded on {device}")
47
 
48
- # =========================
49
- # REQUEST
50
- # =========================
51
 
52
  class SummaryRequest(BaseModel):
53
  old_memory: str = ""
54
  user_message: str
55
  assistant_message: str
56
 
57
- # =========================
58
- # CLEAN OUTPUT
59
- # =========================
60
-
61
- def clean_output(text):
62
-
63
- stop_words = [
64
- "<|im_end|>",
65
- "<|endoftext|>",
66
- "<|eot_id|>",
67
- "UPDATED_MEMORY:",
68
- "MEMORY:",
69
- "Assistant:",
70
- "User:"
71
- ]
72
-
73
- for w in stop_words:
74
-
75
- if w in text:
76
- text = text.split(w)[0]
77
-
78
- text = text.strip()
79
-
80
- # remove repeated lines
81
- lines = []
82
- seen = set()
83
-
84
- for line in text.split("\n"):
85
-
86
- line = line.strip()
87
-
88
- if not line:
89
- continue
90
-
91
- if line in seen:
92
- continue
93
-
94
- seen.add(line)
95
- lines.append(line)
96
-
97
- text = "\n".join(lines)
98
-
99
- # remove extra spaces/newlines
100
- text = re.sub(r"\n+", "\n", text)
101
 
102
- return text.strip()
 
103
 
104
- # =========================
105
- # SYSTEM PROMPT
106
- # =========================
 
 
107
 
108
- SYSTEM_PROMPT = """
109
- You are a recursive AI memory summarization engine.
110
-
111
- Your ONLY task:
112
- Maintain long-term conversational memory.
113
-
114
- IMPORTANT:
115
- This memory is used later by another AI model.
116
-
117
- GOALS:
118
- - Preserve important discussion context
119
- - Preserve coding discussions
120
- - Preserve project details
121
- - Preserve technical information
122
- - Preserve implementation ideas
123
- - Preserve plans and goals
124
- - Preserve APIs/frameworks/models
125
- - Preserve architecture decisions
126
- - Preserve ongoing tasks
127
- - Preserve debugging context
128
-
129
- REMOVE:
130
- - filler
131
- - greetings
132
- - repeated information
133
- - unnecessary wording
134
- - casual conversation fluff
135
-
136
- RULES:
137
- - Merge old memory with new conversation
138
- - Compress intelligently
139
- - Keep memory compact
140
- - Keep memory understandable for another AI
141
- - NEVER answer the user
142
- - NEVER explain
143
- - ONLY output updated memory
144
-
145
- GOOD MEMORY STYLE:
146
- User building local AI assistant using FastAPI and llama.cpp. Uses Supabase storage and streaming responses. Implementing recursive memory summarization and title generation using lightweight Qwen models.
147
-
148
- BAD MEMORY STYLE:
149
- The user asked this. The assistant replied this.
150
-
151
- ONLY OUTPUT MEMORY.
152
- """
153
-
154
- # =========================
155
- # SUMMARY ENDPOINT
156
- # =========================
157
 
158
  @app.post("/generate-summary")
159
  def generate_summary(req: SummaryRequest):
160
 
161
- # =========================
162
- # TRUNCATE HUGE INPUTS
163
- # =========================
164
-
165
- old_memory = req.old_memory[-3000:]
166
- user_message = req.user_message[-1500:]
167
- assistant_message = req.assistant_message[-3000:]
168
-
169
- # =========================
170
- # USER PROMPT
171
- # =========================
172
-
173
- user_prompt = f"""
174
- OLD_MEMORY:
175
- {old_memory}
176
-
177
- NEW_USER_MESSAGE:
178
- {user_message}
179
 
180
- NEW_ASSISTANT_MESSAGE:
181
- {assistant_message}
182
 
183
- TASK:
184
- Generate updated long-term memory summary.
185
 
186
- IMPORTANT:
187
- - Merge previous memory with new discussion
188
- - Preserve coding/technical context
189
- - Preserve important conversation flow
190
- - Preserve implementation discussions
191
- - Preserve project goals/plans
192
- - Keep compact but meaningful
193
- - Keep understandable for another AI model
194
-
195
- UPDATED_MEMORY:
196
- """
197
-
198
- # =========================
199
- # CHAT FORMAT
200
- # =========================
201
 
202
  messages = [
203
- {
204
- "role": "system",
205
- "content": SYSTEM_PROMPT
206
- },
207
- {
208
- "role": "user",
209
- "content": user_prompt
210
- }
211
  ]
212
 
213
  text = tokenizer.apply_chat_template(
@@ -216,76 +62,41 @@ UPDATED_MEMORY:
216
  add_generation_prompt=True
217
  )
218
 
219
- # =========================
220
- # TOKENIZE
221
- # =========================
222
-
223
- inputs = tokenizer(
224
- text,
225
- return_tensors="pt",
226
- truncation=True,
227
- max_length=4096
228
- ).to(device)
229
-
230
- # =========================
231
- # GENERATE
232
- # =========================
233
-
234
- with torch.no_grad():
235
-
236
- output = model.generate(
237
- **inputs,
238
- max_new_tokens=120,
239
- do_sample=True,
240
- temperature=0.2,
241
- top_p=0.9,
242
- repetition_penalty=1.15,
243
- pad_token_id=tokenizer.eos_token_id,
244
- eos_token_id=tokenizer.eos_token_id
245
- )
246
 
247
- # =========================
248
- # DECODE
249
- # =========================
 
 
 
 
250
 
251
  result = tokenizer.decode(
252
  output[0][inputs.input_ids.shape[1]:],
253
  skip_special_tokens=True
254
- )
255
-
256
- # =========================
257
- # CLEAN
258
- # =========================
259
 
260
- result = clean_output(result)
 
 
 
261
 
262
- # =========================
263
- # RESPONSE
264
- # =========================
 
 
 
 
265
 
266
- return {
267
- "memory": result
268
- }
269
 
270
- # =========================
271
- # HEALTH
272
- # =========================
273
 
274
  @app.get("/")
275
  def root():
 
276
 
277
- return {
278
- "status": "Recursive Memory Summarizer Running 🚀"
279
- }
280
-
281
- # =========================
282
- # RUN
283
- # =========================
284
 
285
  if __name__ == "__main__":
286
-
287
- uvicorn.run(
288
- "app:app",
289
- host="0.0.0.0",
290
- port=7860
291
- )
 
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
  import uvicorn
 
 
 
 
 
6
 
7
  app = FastAPI()
8
 
 
 
 
 
9
  MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
10
 
11
+ print("🚀 Loading Memory Summarizer...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ device = "cuda" if torch.cuda.is_available() else "cpu"
14
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
15
  model = AutoModelForCausalLM.from_pretrained(
16
  MODEL_ID,
17
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
18
+ device_map="auto"
19
  )
20
 
21
+ print(f"✅ Loaded on {device.upper()}")
 
 
22
 
 
 
 
23
 
24
  class SummaryRequest(BaseModel):
25
  old_memory: str = ""
26
  user_message: str
27
  assistant_message: str
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ SYSTEM_PROMPT = """You are a memory compression engine.
31
+ Merge OLD MEMORY + NEW CONVERSATION into ONE updated memory blob.
32
 
33
+ Rules:
34
+ - Preserve: technical stack, frameworks, APIs, architecture decisions, project goals, unfinished tasks, user preferences
35
+ - Remove: filler, repetition, conversational fluff
36
+ - Output style: dense, third-person, bullet-free, technical
37
+ - Output ONLY the updated memory — no preamble, no explanation, no labels"""
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  @app.post("/generate-summary")
41
  def generate_summary(req: SummaryRequest):
42
 
43
+ user_content = f"""OLD MEMORY:
44
+ {req.old_memory if req.old_memory else "(none)"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ NEW USER MESSAGE:
47
+ {req.user_message}
48
 
49
+ NEW ASSISTANT RESPONSE:
50
+ {req.assistant_message}
51
 
52
+ UPDATED MEMORY:"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  messages = [
55
+ {"role": "system", "content": SYSTEM_PROMPT},
56
+ {"role": "user", "content": user_content},
 
 
 
 
 
 
57
  ]
58
 
59
  text = tokenizer.apply_chat_template(
 
62
  add_generation_prompt=True
63
  )
64
 
65
+ inputs = tokenizer(text, return_tensors="pt").to(model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ output = model.generate(
68
+ **inputs,
69
+ max_new_tokens=200,
70
+ do_sample=False,
71
+ repetition_penalty=1.15,
72
+ eos_token_id=tokenizer.eos_token_id,
73
+ )
74
 
75
  result = tokenizer.decode(
76
  output[0][inputs.input_ids.shape[1]:],
77
  skip_special_tokens=True
78
+ ).strip()
 
 
 
 
79
 
80
+ # Strip any leaked stop tokens or role prefixes
81
+ for stop in ["<|im_end|>", "<|endoftext|>", "UPDATED MEMORY:", "User:", "Assistant:"]:
82
+ if stop in result:
83
+ result = result.split(stop)[0].strip()
84
 
85
+ # Deduplicate lines
86
+ seen, lines = set(), []
87
+ for line in result.splitlines():
88
+ line = line.strip()
89
+ if line and line not in seen:
90
+ seen.add(line)
91
+ lines.append(line)
92
 
93
+ return {"memory": " ".join(lines)}
 
 
94
 
 
 
 
95
 
96
  @app.get("/")
97
  def root():
98
+ return {"status": "Memory Summarizer Running 🚀"}
99
 
 
 
 
 
 
 
 
100
 
101
  if __name__ == "__main__":
102
+ uvicorn.run("app:app", host="0.0.0.0", port=7860)