Valtry committed on
Commit
0d2f40b
·
verified ·
1 Parent(s): 756a711

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -90
app.py CHANGED
@@ -12,111 +12,84 @@ import uvicorn
12
  app = FastAPI()
13
 
14
  # =========================
15
- # MODEL CONFIG
16
  # =========================
17
 
18
- # Swap this to upgrade intelligence:
19
- # "Qwen/Qwen2.5-0.5B-Instruct" → lightest, weakest
20
- # "Qwen/Qwen2.5-1.5B-Instruct" → recommended sweet spot
21
- # "Qwen/Qwen2.5-3B-Instruct" → best Qwen quality, tight on free tier
22
- # "HuggingFaceTB/SmolLM2-1.7B-Instruct" → good alternative
23
- # "meta-llama/Llama-3.2-1B-Instruct" → good, needs HF token
24
- # "meta-llama/Llama-3.2-3B-Instruct" → strong, needs HF token
25
- # "google/gemma-2-2b-it" → solid, needs HF token
26
-
27
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
28
 
29
- # Models that need a HuggingFace token (set HF_TOKEN in Space secrets)
30
- GATED_MODELS = [
31
- "meta-llama/Llama-3.2-1B-Instruct",
32
- "meta-llama/Llama-3.2-3B-Instruct",
33
- "google/gemma-2-2b-it",
34
- "microsoft/Phi-3.5-mini-instruct",
35
- ]
36
-
37
  print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
38
 
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
 
41
- import os
42
- hf_token = os.environ.get("HF_TOKEN", None)
43
- use_token = hf_token if any(m in MODEL_ID for m in GATED_MODELS) else None
44
-
45
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=use_token)
46
 
47
  model = AutoModelForCausalLM.from_pretrained(
48
  MODEL_ID,
49
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
50
- device_map="auto",
51
- token=use_token
52
  )
53
 
54
- print(f"✅ Loaded {MODEL_ID} on {device.upper()}")
55
 
56
  # =========================
57
  # SYSTEM PROMPT
58
  # =========================
59
 
60
- SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge facts.
61
 
62
  EXAMPLE 1:
63
  EXISTING MEMORY: (none)
64
- USER SAID: I am building a chat app with Node.js and MongoDB.
65
- ASSISTANT REPLIED: Use Socket.io rooms. Store messages with roomId and timestamp.
66
- UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
67
 
68
  EXAMPLE 2:
69
- EXISTING MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
70
- USER SAID: How do I add authentication?
71
- ASSISTANT REPLIED: Use JWT. Verify token on every Socket.io connection via middleware.
72
- UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp. Auth via JWT verified on Socket.io connections through middleware.
73
 
74
  EXAMPLE 3:
75
- EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented.
76
- USER SAID: How do I add rate limiting?
77
- ASSISTANT REPLIED: Use slowapi. Attach limiter to app and decorate routes.
78
- UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented. Rate limiting via slowapi on routes.
79
 
80
  EXAMPLE 4:
81
- EXISTING MEMORY: User building SaaS dashboard with Next.js and FastAPI. PostgreSQL for database.
82
- USER SAID: Should I use REST or GraphQL?
83
- ASSISTANT REPLIED: Use REST for fixed data shapes. GraphQL for flexible querying.
84
- UPDATED MEMORY: User building SaaS dashboard with Next.js and FastAPI using PostgreSQL. Chose REST over GraphQL due to fixed data shapes.
85
 
86
  EXAMPLE 5:
87
- EXISTING MEMORY: User building Python scraper with BeautifulSoup. Stores results in CSV.
88
- USER SAID: My scraper gets blocked after 50 requests.
89
- ASSISTANT REPLIED: Add random delays, rotate user-agent headers, use proxy pool.
90
- UPDATED MEMORY: User building Python scraper with BeautifulSoup storing results in CSV. Anti-blocking via random delays, rotating user-agent headers, and proxy pool.
91
 
92
  EXAMPLE 6:
93
- EXISTING MEMORY: User building mobile app in React Native with Firebase.
94
- USER SAID: I am switching from Firebase to Supabase.
95
- ASSISTANT REPLIED: Replace Firebase Auth with Supabase Auth. Replace Firestore with Supabase PostgreSQL.
96
- UPDATED MEMORY: User building mobile app in React Native. Switched from Firebase to Supabase. Auth via Supabase Auth, database via Supabase PostgreSQL.
97
 
98
  EXAMPLE 7:
99
- EXISTING MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending.
100
- USER SAID: How do I send order confirmation emails?
101
- ASSISTANT REPLIED: Use Django send_mail or SendGrid. Trigger inside Stripe webhook on payment_intent.succeeded.
102
- UPDATED MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending. Order confirmation emails via SendGrid triggered on payment_intent.succeeded inside Stripe webhook.
103
-
104
- EXAMPLE 8:
105
- EXISTING MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations.
106
- USER SAID: I want to add long-term memory to avoid token limit issues.
107
- ASSISTANT REPLIED: Use Qwen2.5-0.5B to recursively summarize memory. Store in Supabase. Inject before recent chat history. Truncate large responses before summarizing.
108
- UPDATED MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations. Long-term memory via Qwen2.5-0.5B recursive summarization stored in Supabase, injected before recent history. Large responses truncated before summarizing.
109
 
110
  STRICT RULES:
111
  - Output ONLY the updated memory. No labels. No preamble. No explanation.
112
- - Keep ALL facts from EXISTING MEMORY unless directly contradicted.
113
- - Add only new facts from USER SAID and ASSISTANT REPLIED.
114
- - No filler: no "ensuring", "enhances", "maintaining", "this setup", "this approach".
 
115
  - No questions. No advice. No "you". No "I".
116
- - One short dense paragraph only."""
117
 
118
  # =========================
119
- # FILLER PHRASES TO STRIP
120
  # =========================
121
 
122
  FILLER_PATTERNS = [
@@ -126,8 +99,66 @@ FILLER_PATTERNS = [
126
  r"enhances\s[^.]*\.",
127
  r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
128
  r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
 
 
129
  ]
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # =========================
132
  # REQUEST MODEL
133
  # =========================
@@ -146,7 +177,7 @@ def generate_summary(req: SummaryRequest):
146
 
147
  old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
148
  user_message = req.user_message.strip()
149
- assistant_message = req.assistant_message.strip()[:500]
150
 
151
  user_content = f"""EXISTING MEMORY: {old_memory}
152
  USER SAID: {user_message}
@@ -158,10 +189,6 @@ UPDATED MEMORY:"""
158
  {"role": "user", "content": user_content},
159
  ]
160
 
161
- # =========================
162
- # FORMAT CHAT
163
- # =========================
164
-
165
  text = tokenizer.apply_chat_template(
166
  messages,
167
  tokenize=False,
@@ -173,22 +200,14 @@ UPDATED MEMORY:"""
173
  return_tensors="pt"
174
  ).to(model.device)
175
 
176
- # =========================
177
- # GENERATE
178
- # =========================
179
-
180
  output = model.generate(
181
  **inputs,
182
- max_new_tokens=200,
183
  do_sample=False,
184
  repetition_penalty=1.15,
185
  eos_token_id=tokenizer.eos_token_id,
186
  )
187
 
188
- # =========================
189
- # DECODE
190
- # =========================
191
-
192
  result = tokenizer.decode(
193
  output[0][inputs.input_ids.shape[1]:],
194
  skip_special_tokens=True
@@ -230,12 +249,13 @@ UPDATED MEMORY:"""
230
  lines.append(line)
231
 
232
  result = " ".join(lines).strip()
 
233
 
234
  # =========================
235
- # CLEAN fix double spaces
236
  # =========================
237
 
238
- result = re.sub(r"\s{2,}", " ", result).strip()
239
 
240
  return {"memory": result}
241
 
@@ -251,13 +271,5 @@ def root():
251
  "device": device.upper()
252
  }
253
 
254
- # =========================
255
- # RUN
256
- # =========================
257
-
258
  if __name__ == "__main__":
259
- uvicorn.run(
260
- "app:app",
261
- host="0.0.0.0",
262
- port=7860
263
- )
 
12
  app = FastAPI()
13
 
14
  # =========================
15
+ # MODEL
16
  # =========================
17
 
 
 
 
 
 
 
 
 
 
18
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
19
 
 
 
 
 
 
 
 
 
20
  print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
 
24
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 
 
 
25
 
26
  model = AutoModelForCausalLM.from_pretrained(
27
  MODEL_ID,
28
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
29
+ device_map="auto"
 
30
  )
31
 
32
+ print(f"✅ Loaded on {device.upper()}")
33
 
34
  # =========================
35
  # SYSTEM PROMPT
36
  # =========================
37
 
38
+ SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into one short dense paragraph.
39
 
40
  EXAMPLE 1:
41
  EXISTING MEMORY: (none)
42
+ USER SAID: I am building a weather app using React and OpenWeatherMap API.
43
+ ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env.
44
+ UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env.
45
 
46
  EXAMPLE 2:
47
+ EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env.
48
+ USER SAID: How do I cache the weather data so I do not hit the API limit?
49
+ ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API.
50
+ UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios, cached in localStorage with 10-minute expiry to avoid API limit.
51
 
52
  EXAMPLE 3:
53
+ EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location.
54
+ USER SAID: How do job seekers apply for a job?
55
+ ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field, resume FileField in S3.
56
+ UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Custom user model with company/jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field, resume stored in S3.
57
 
58
  EXAMPLE 4:
59
+ EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Custom user model with company/jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field, resume stored in S3.
60
+ USER SAID: I want to add search and filters for title, location, and salary range.
61
+ ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params to job list endpoint.
62
+ UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Company/jobseeker roles. Job and Application models complete with S3 resumes. Job search via django-filter and Q objects on title, location, salary range.
63
 
64
  EXAMPLE 5:
65
+ EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Company/jobseeker roles. Job and Application models complete with S3 resumes. Job search via django-filter and Q objects on title, location, salary range.
66
+ USER SAID: How do I notify applicants when status changes?
67
+ ASSISTANT REPLIED: Use Django signals on Application post_save. Trigger SendGrid email via Celery async task.
68
+ UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status change notifications via Django signals and Celery tasks.
69
 
70
  EXAMPLE 6:
71
+ EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status change notifications via Django signals and Celery tasks.
72
+ USER SAID: How do I deploy this on a VPS?
73
+ ASSISTANT REPLIED: Docker Compose with Django, React, PostgreSQL, Redis, Celery services. Gunicorn behind nginx. Certbot for SSL.
74
+ UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status notifications via Django signals. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL.
75
 
76
  EXAMPLE 7:
77
+ EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status notifications via Django signals. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL.
78
+ USER SAID: What is still left to build?
79
+ ASSISTANT REPLIED: Admin panel, pagination, rate limiting, frontend loading states and error handling.
80
+ UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job, Application models with S3 resumes, django-filter search, Docker Compose deployment. Pending: admin panel, pagination, rate limiting, frontend loading states and error handling.
 
 
 
 
 
 
81
 
82
  STRICT RULES:
83
  - Output ONLY the updated memory. No labels. No preamble. No explanation.
84
+ - COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter.
85
+ - Keep ALL technical facts. Remove only filler words.
86
+ - Add new facts merged in, not appended as separate sentences.
87
+ - No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to".
88
  - No questions. No advice. No "you". No "I".
89
+ - One short dense paragraph. Maximum 3 sentences."""
90
 
91
  # =========================
92
+ # FILLER PATTERNS
93
  # =========================
94
 
95
  FILLER_PATTERNS = [
 
99
  r"enhances\s[^.]*\.",
100
  r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
101
  r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
102
+ r"in order to\s[^.]*\.",
103
+ r"To (enhance|improve|ensure|enable)\s[^.]*\.",
104
  ]
105
 
106
+ # =========================
107
+ # HELPERS
108
+ # =========================
109
+
110
def clean_assistant_message(text: str, max_chars: int = 500) -> str:
    """
    Reduce an assistant response to short prose suitable for summarizing.

    Fenced code blocks are removed, but the names they declare (via
    def/class/const/let/var/function) are collected first and appended as a
    trailing "Key identifiers: ..." sentence. Inline code keeps its text and
    loses only the backticks. Runs of whitespace are collapsed.

    Args:
        text: Raw assistant message, possibly containing Markdown code.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The cleaned message, at most ``max_chars`` characters long. When the
        result would be too long, the prose is shortened so that the
        identifier sentence survives instead of being cut off the end.
    """
    if max_chars <= 0:
        return ""

    fence = re.compile(r"```[\w]*\n?(.*?)```", re.DOTALL)
    # \b stops keywords from matching inside longer words such as "typedef"
    # or "subclass", which would otherwise yield bogus identifiers.
    declaration = re.compile(
        r"\b(?:def|class|const|let|var|function)\s+(\w+)"
    )

    # dict.fromkeys de-duplicates while keeping first-seen order.
    identifiers = list(dict.fromkeys(
        name
        for block in fence.findall(text)
        for name in declaration.findall(block)
    ))

    prose = fence.sub("", text)
    # Inline code: drop the backticks, keep the text.
    prose = re.sub(r"`([^`]+)`", r"\1", prose)
    prose = re.sub(r"\s{2,}", " ", prose).strip()

    def render(names):
        return "Key identifiers: " + ", ".join(names) + "."

    if identifiers:
        # The identifier sentence may use whatever the prose leaves free,
        # and never less than half the budget; drop trailing names until
        # it fits.
        prose_cost = len(prose) + 1 if prose else 0
        reserve = max(max_chars // 2, max_chars - prose_cost)
        while identifiers and len(render(identifiers)) > reserve:
            identifiers.pop()

    if not identifiers:
        return prose[:max_chars]

    suffix = render(identifiers)
    if not prose:
        return suffix

    # Truncate the prose, not the suffix, so the identifiers are preserved.
    room = max_chars - len(suffix) - 1
    return f"{prose[:room].rstrip()} {suffix}".strip()
141
+
142
+
143
def enforce_memory_limit(text: str, max_chars: int = 600) -> str:
    """
    Hard cap on memory length.

    Text within the limit is returned unchanged. Otherwise whole sentences
    (split after '.', '!' or '?' followed by whitespace) are kept, joined by
    single spaces, for as long as they fit.

    Args:
        text: The memory paragraph to cap.
        max_chars: Maximum length of the returned string.

    Returns:
        A string of at most ``max_chars`` characters. If even the first
        sentence is too long, the text is cut at the last word boundary
        inside the limit (or mid-word when there is none) rather than
        returning an empty memory.
    """
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        return ""

    kept = []
    used = 0
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        # A joining space is only needed once something has been kept.
        cost = len(sentence) + (1 if kept else 0)
        if used + cost > max_chars:
            break
        kept.append(sentence)
        used += cost

    if kept:
        return " ".join(kept).strip()

    # No complete sentence fits: truncate instead of discarding everything.
    clipped = text[:max_chars]
    head, _, _ = clipped.rpartition(" ")
    return head.strip() or clipped.strip()
161
+
162
  # =========================
163
  # REQUEST MODEL
164
  # =========================
 
177
 
178
  old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
179
  user_message = req.user_message.strip()
180
+ assistant_message = clean_assistant_message(req.assistant_message)
181
 
182
  user_content = f"""EXISTING MEMORY: {old_memory}
183
  USER SAID: {user_message}
 
189
  {"role": "user", "content": user_content},
190
  ]
191
 
 
 
 
 
192
  text = tokenizer.apply_chat_template(
193
  messages,
194
  tokenize=False,
 
200
  return_tensors="pt"
201
  ).to(model.device)
202
 
 
 
 
 
203
  output = model.generate(
204
  **inputs,
205
+ max_new_tokens=220,
206
  do_sample=False,
207
  repetition_penalty=1.15,
208
  eos_token_id=tokenizer.eos_token_id,
209
  )
210
 
 
 
 
 
211
  result = tokenizer.decode(
212
  output[0][inputs.input_ids.shape[1]:],
213
  skip_special_tokens=True
 
249
  lines.append(line)
250
 
251
  result = " ".join(lines).strip()
252
+ result = re.sub(r"\s{2,}", " ", result).strip()
253
 
254
  # =========================
255
+ # HARD MEMORY LENGTH CAP
256
  # =========================
257
 
258
+ result = enforce_memory_limit(result, max_chars=600)
259
 
260
  return {"memory": result}
261
 
 
271
  "device": device.upper()
272
  }
273
 
 
 
 
 
274
  if __name__ == "__main__":
275
+ uvicorn.run("app:app", host="0.0.0.0", port=7860)