Valtry committed on
Commit
756a711
·
verified ·
1 Parent(s): 52b7f3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -62
app.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
  import uvicorn
6
 
7
  # =========================
@@ -11,86 +12,121 @@ import uvicorn
11
  app = FastAPI()
12
 
13
  # =========================
14
- # MODEL
15
  # =========================
16
 
17
- MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 
 
 
 
 
 
 
18
 
19
- print("🚀 Loading Memory Summarizer...")
 
 
 
 
 
 
 
 
 
 
20
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
 
23
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 
 
 
24
 
25
  model = AutoModelForCausalLM.from_pretrained(
26
  MODEL_ID,
27
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
28
- device_map="auto"
 
29
  )
30
 
31
- print(f"✅ Loaded on {device.upper()}")
32
 
33
  # =========================
34
  # SYSTEM PROMPT
35
  # =========================
36
 
37
- SYSTEM_PROMPT = """You are a memory compression engine.
38
 
39
  EXAMPLE 1:
40
- EXISTING MEMORY: User building a todo app with React and Firebase.
41
- USER SAID: Can I add offline support?
42
- ASSISTANT REPLIED: Use Firebase offline persistence by enabling it in the SDK config.
43
- UPDATED MEMORY: User building todo app with React and Firebase. Offline persistence enabled via Firebase SDK config.
44
 
45
  EXAMPLE 2:
46
- EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. Using JWT for auth.
47
- USER SAID: How do I add rate limiting?
48
- ASSISTANT REPLIED: Use slowapi library with FastAPI. Attach the limiter to the app instance and decorate routes.
49
- UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL using JWT auth and slowapi-based rate limiting on routes.
50
 
51
  EXAMPLE 3:
52
- EXISTING MEMORY: User building CLI tool in Python to rename files in bulk. Uses argparse and pathlib.
53
- USER SAID: I want to add a dry-run mode that shows changes without applying them.
54
- ASSISTANT REPLIED: Add a --dry-run flag via argparse. When set, print the rename operations instead of executing them.
55
- UPDATED MEMORY: User building Python CLI bulk rename tool using argparse and pathlib. Supports dry-run mode via --dry-run flag that prints operations without executing.
56
 
57
  EXAMPLE 4:
58
- EXISTING MEMORY: (none)
59
- USER SAID: I am building an e-commerce backend with Django and Stripe for payments.
60
- ASSISTANT REPLIED: Use stripe-python SDK directly. Handle webhooks via a dedicated endpoint with signature verification.
61
- UPDATED MEMORY: User building e-commerce backend with Django and Stripe. Payments via stripe-python SDK with webhook endpoint using signature verification.
62
 
63
  EXAMPLE 5:
64
- EXISTING MEMORY: User building AI chatbot with FastAPI and Supabase for storage. Supports streaming responses.
65
- USER SAID: How do I add conversation branching?
66
- ASSISTANT REPLIED: Store a parent_message_id on each message in Supabase. Query by branch to reconstruct any conversation path.
67
- UPDATED MEMORY: User building AI chatbot with FastAPI and Supabase. Supports streaming responses and conversation branching via parent_message_id stored per message.
68
 
69
  EXAMPLE 6:
70
- EXISTING MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing.
71
- USER SAID: I want to add face detection now.
72
- ASSISTANT REPLIED: Use OpenCV Haar cascades or switch to mediapipe for better accuracy on varied lighting.
73
- UPDATED MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing and face detection via Haar cascades or mediapipe for varied lighting.
74
 
75
  EXAMPLE 7:
76
- EXISTING MEMORY: (none)
77
- USER SAID: I want to build a habit tracker mobile app using Flutter and SQLite.
78
- ASSISTANT REPLIED: Use sqflite package for SQLite in Flutter. Store habits and daily completion records in separate tables.
79
- UPDATED MEMORY: User building Flutter habit tracker app using sqflite for SQLite storage. Habits and daily completion records stored in separate tables.
80
 
81
  EXAMPLE 8:
82
- EXISTING MEMORY: User building a portfolio website with Next.js and Tailwind. Deployed on Vercel.
83
- USER SAID: How do I add a blog section with markdown support?
84
- ASSISTANT REPLIED: Use next-mdx-remote to parse and render markdown files. Store posts as .mdx files in a /content folder.
85
- UPDATED MEMORY: User building portfolio website with Next.js and Tailwind deployed on Vercel. Blog section added using next-mdx-remote with .mdx files stored in /content folder.
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- Now do the same task.
88
- Rules:
89
- - Merge EXISTING MEMORY with the new conversation into one updated memory.
90
- - Preserve all technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences.
91
- - Never drop existing memory facts unless directly contradicted by new information.
92
- - Write in third-person. No "you". No "I".
93
- - Output ONLY the updated memory. No labels. No explanation. No bullet points. No extra text."""
 
94
 
95
  # =========================
96
  # REQUEST MODEL
@@ -108,9 +144,9 @@ class SummaryRequest(BaseModel):
108
  @app.post("/generate-summary")
109
  def generate_summary(req: SummaryRequest):
110
 
111
- old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
112
- user_message = req.user_message.strip()
113
- assistant_message = req.assistant_message.strip()[:600]
114
 
115
  user_content = f"""EXISTING MEMORY: {old_memory}
116
  USER SAID: {user_message}
@@ -159,27 +195,32 @@ UPDATED MEMORY:"""
159
  ).strip()
160
 
161
  # =========================
162
- # CLEAN OUTPUT
163
  # =========================
164
 
165
  stop_phrases = [
166
- "<|im_end|>",
167
- "<|endoftext|>",
168
- "UPDATED MEMORY:",
169
- "EXISTING MEMORY:",
170
- "USER SAID:",
171
- "ASSISTANT REPLIED:",
172
- "EXAMPLE ",
173
- "Now do the same",
174
- "Assistant:",
175
- "User:",
176
  ]
177
 
178
  for phrase in stop_phrases:
179
  if phrase in result:
180
  result = result.split(phrase)[0].strip()
181
 
182
- # Deduplicate lines
 
 
 
 
 
 
 
 
 
 
183
  seen, lines = set(), []
184
 
185
  for line in result.splitlines():
@@ -190,6 +231,12 @@ UPDATED MEMORY:"""
190
 
191
  result = " ".join(lines).strip()
192
 
 
 
 
 
 
 
193
  return {"memory": result}
194
 
195
  # =========================
@@ -198,7 +245,11 @@ UPDATED MEMORY:"""
198
 
199
  @app.get("/")
200
  def root():
201
- return {"status": "Memory Summarizer Running 🚀"}
 
 
 
 
202
 
203
  # =========================
204
  # RUN
 
2
  from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
5
+ import re
6
  import uvicorn
7
 
8
  # =========================
 
12
  app = FastAPI()
13
 
14
  # =========================
15
+ # MODEL CONFIG
16
  # =========================
17
 
18
+ # Swap this to upgrade intelligence:
19
+ # "Qwen/Qwen2.5-0.5B-Instruct" → lightest, weakest
20
+ # "Qwen/Qwen2.5-1.5B-Instruct" → recommended sweet spot
21
+ # "Qwen/Qwen2.5-3B-Instruct" → best Qwen quality, tight on free tier
22
+ # "HuggingFaceTB/SmolLM2-1.7B-Instruct" → good alternative
23
+ # "meta-llama/Llama-3.2-1B-Instruct" → good, needs HF token
24
+ # "meta-llama/Llama-3.2-3B-Instruct" → strong, needs HF token
25
+ # "google/gemma-2-2b-it" → solid, needs HF token
26
 
27
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
28
+
29
+ # Models that need a HuggingFace token (set HF_TOKEN in Space secrets)
30
+ GATED_MODELS = [
31
+ "meta-llama/Llama-3.2-1B-Instruct",
32
+ "meta-llama/Llama-3.2-3B-Instruct",
33
+ "google/gemma-2-2b-it",
34
+ "microsoft/Phi-3.5-mini-instruct",
35
+ ]
36
+
37
+ print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
38
 
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
 
41
+ import os
42
+ hf_token = os.environ.get("HF_TOKEN", None)
43
+ use_token = hf_token if any(m in MODEL_ID for m in GATED_MODELS) else None
44
+
45
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=use_token)
46
 
47
  model = AutoModelForCausalLM.from_pretrained(
48
  MODEL_ID,
49
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
50
+ device_map="auto",
51
+ token=use_token
52
  )
53
 
54
+ print(f"✅ Loaded {MODEL_ID} on {device.upper()}")
55
 
56
  # =========================
57
  # SYSTEM PROMPT
58
  # =========================
59
 
60
+ SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge facts.
61
 
62
  EXAMPLE 1:
63
+ EXISTING MEMORY: (none)
64
+ USER SAID: I am building a chat app with Node.js and MongoDB.
65
+ ASSISTANT REPLIED: Use Socket.io rooms. Store messages with roomId and timestamp.
66
+ UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
67
 
68
  EXAMPLE 2:
69
+ EXISTING MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
70
+ USER SAID: How do I add authentication?
71
+ ASSISTANT REPLIED: Use JWT. Verify token on every Socket.io connection via middleware.
72
+ UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp. Auth via JWT verified on Socket.io connections through middleware.
73
 
74
  EXAMPLE 3:
75
+ EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented.
76
+ USER SAID: How do I add rate limiting?
77
+ ASSISTANT REPLIED: Use slowapi. Attach limiter to app and decorate routes.
78
+ UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented. Rate limiting via slowapi on routes.
79
 
80
  EXAMPLE 4:
81
+ EXISTING MEMORY: User building SaaS dashboard with Next.js and FastAPI. PostgreSQL for database.
82
+ USER SAID: Should I use REST or GraphQL?
83
+ ASSISTANT REPLIED: Use REST for fixed data shapes. GraphQL for flexible querying.
84
+ UPDATED MEMORY: User building SaaS dashboard with Next.js and FastAPI using PostgreSQL. Chose REST over GraphQL due to fixed data shapes.
85
 
86
  EXAMPLE 5:
87
+ EXISTING MEMORY: User building Python scraper with BeautifulSoup. Stores results in CSV.
88
+ USER SAID: My scraper gets blocked after 50 requests.
89
+ ASSISTANT REPLIED: Add random delays, rotate user-agent headers, use proxy pool.
90
+ UPDATED MEMORY: User building Python scraper with BeautifulSoup storing results in CSV. Anti-blocking via random delays, rotating user-agent headers, and proxy pool.
91
 
92
  EXAMPLE 6:
93
+ EXISTING MEMORY: User building mobile app in React Native with Firebase.
94
+ USER SAID: I am switching from Firebase to Supabase.
95
+ ASSISTANT REPLIED: Replace Firebase Auth with Supabase Auth. Replace Firestore with Supabase PostgreSQL.
96
+ UPDATED MEMORY: User building mobile app in React Native. Switched from Firebase to Supabase. Auth via Supabase Auth, database via Supabase PostgreSQL.
97
 
98
  EXAMPLE 7:
99
+ EXISTING MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending.
100
+ USER SAID: How do I send order confirmation emails?
101
+ ASSISTANT REPLIED: Use Django send_mail or SendGrid. Trigger inside Stripe webhook on payment_intent.succeeded.
102
+ UPDATED MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending. Order confirmation emails via SendGrid triggered on payment_intent.succeeded inside Stripe webhook.
103
 
104
  EXAMPLE 8:
105
+ EXISTING MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations.
106
+ USER SAID: I want to add long-term memory to avoid token limit issues.
107
+ ASSISTANT REPLIED: Use Qwen2.5-0.5B to recursively summarize memory. Store in Supabase. Inject before recent chat history. Truncate large responses before summarizing.
108
+ UPDATED MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations. Long-term memory via Qwen2.5-0.5B recursive summarization stored in Supabase, injected before recent history. Large responses truncated before summarizing.
109
+
110
+ STRICT RULES:
111
+ - Output ONLY the updated memory. No labels. No preamble. No explanation.
112
+ - Keep ALL facts from EXISTING MEMORY unless directly contradicted.
113
+ - Add only new facts from USER SAID and ASSISTANT REPLIED.
114
+ - No filler: no "ensuring", "enhances", "maintaining", "this setup", "this approach".
115
+ - No questions. No advice. No "you". No "I".
116
+ - One short dense paragraph only."""
117
+
118
+ # =========================
119
+ # FILLER PHRASES TO STRIP
120
+ # =========================
121
 
122
+ FILLER_PATTERNS = [
123
+ r"This (setup|approach|system|solution|architecture|method|design)\b[^.]*\.",
124
+ r"ensuring\s[^.]*\.",
125
+ r"while maintaining\s[^.]*\.",
126
+ r"enhances\s[^.]*\.",
127
+ r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
128
+ r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
129
+ ]
130
 
131
  # =========================
132
  # REQUEST MODEL
 
144
  @app.post("/generate-summary")
145
  def generate_summary(req: SummaryRequest):
146
 
147
+ old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
148
+ user_message = req.user_message.strip()
149
+ assistant_message = req.assistant_message.strip()[:500]
150
 
151
  user_content = f"""EXISTING MEMORY: {old_memory}
152
  USER SAID: {user_message}
 
195
  ).strip()
196
 
197
  # =========================
198
+ # CLEAN — stop phrases
199
  # =========================
200
 
201
  stop_phrases = [
202
+ "<|im_end|>", "<|endoftext|>",
203
+ "UPDATED MEMORY:", "EXISTING MEMORY:",
204
+ "USER SAID:", "ASSISTANT REPLIED:",
205
+ "STRICT RULES:", "EXAMPLE ",
206
+ "Assistant:", "User:",
 
 
 
 
 
207
  ]
208
 
209
  for phrase in stop_phrases:
210
  if phrase in result:
211
  result = result.split(phrase)[0].strip()
212
 
213
+ # =========================
214
+ # CLEAN — strip filler
215
+ # =========================
216
+
217
+ for pattern in FILLER_PATTERNS:
218
+ result = re.sub(pattern, "", result, flags=re.IGNORECASE)
219
+
220
+ # =========================
221
+ # CLEAN — deduplicate lines
222
+ # =========================
223
+
224
  seen, lines = set(), []
225
 
226
  for line in result.splitlines():
 
231
 
232
  result = " ".join(lines).strip()
233
 
234
+ # =========================
235
+ # CLEAN — fix double spaces
236
+ # =========================
237
+
238
+ result = re.sub(r"\s{2,}", " ", result).strip()
239
+
240
  return {"memory": result}
241
 
242
  # =========================
 
245
 
246
  @app.get("/")
247
  def root():
248
+ return {
249
+ "status": "Memory Summarizer Running 🚀",
250
+ "model": MODEL_ID,
251
+ "device": device.upper()
252
+ }
253
 
254
  # =========================
255
  # RUN