from fastapi import FastAPI from pydantic import BaseModel from transformers import AutoTokenizer, AutoModelForCausalLM import torch import re import uvicorn # ========================= # APP # ========================= app = FastAPI() # ========================= # MODEL # ========================= MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct" print(f"🚀 Loading Memory Summarizer — {MODEL_ID}") device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, dtype=torch.float16 if device == "cuda" else torch.float32, ) model = model.to(device) print(f"✅ Loaded on {device.upper()}") # ========================= # SYSTEM PROMPT # ========================= SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into dense paragraphs. EXAMPLE 1: EXISTING MEMORY: (none) USER SAID: I am building a weather app using React and OpenWeatherMap API. ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env. UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env. EXAMPLE 2: EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env. USER SAID: How do I cache the weather data so I do not hit the API limit? ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API. UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key in .env. Responses cached in localStorage with 10-minute timestamp expiry to avoid API rate limit. EXAMPLE 3: EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. USER SAID: How do job seekers apply for a job? ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field (applied, reviewed, rejected, accepted), resume FileField stored in S3. UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3. EXAMPLE 4: EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3. USER SAID: I want to add search and filters for title, location, and salary range. ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params title, location, salary_min, salary_max to job list endpoint. UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params. EXAMPLE 5: EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params. USER SAID: How do I notify applicants when their application status changes? ASSISTANT REPLIED: Use Django signals. On Application post_save, detect status change and trigger email via Celery async task using SendGrid. UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid. EXAMPLE 6: EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid. USER SAID: How do I deploy this on a VPS? ASSISTANT REPLIED: Docker Compose with services for Django, React, PostgreSQL, Redis, Celery. Serve Django via Gunicorn behind nginx. Certbot for SSL. Secrets in .env file. UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env. EXAMPLE 7: EXISTING MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env. USER SAID: What is still left to build? ASSISTANT REPLIED: Admin panel for moderating posts, pagination on job listings, rate limiting on API, frontend loading states and error handling. UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL. Pending: admin moderation panel, pagination, API rate limiting, frontend loading states and error handling. STRICT RULES: - Output ONLY the updated memory. No labels. No preamble. No explanation. - COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter and denser. - Keep ALL technical facts: stack, frameworks, APIs, models, field names, architecture decisions, unfinished tasks, user preferences. - Add new facts merged in naturally, not appended as separate sentences. - No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to", "it is worth noting". - No questions. No advice. No "you". No "I". - Dense technical paragraph. Maximum 8 sentences.""" # ========================= # FILLER PATTERNS # ========================= FILLER_PATTERNS = [ r"This (setup|approach|system|solution|architecture|method|design)\b[^.]*\.", r"ensuring\s[^.]*\.", r"while maintaining\s[^.]*\.", r"enhances\s[^.]*\.", r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.", r"for (better|improved|efficient|effective|optimal)\s[^.]*\.", r"in order to\s[^.]*\.", r"To (enhance|improve|ensure|enable)\s[^.]*\.", r"It is worth noting that\s[^.]*\.", r"Additionally,\s*(it|this)\s[^.]*\.", ] # ========================= # MEMORY LIMIT CONFIG # ========================= MEMORY_SOFT_LIMIT = 1600 # ~400 tokens — compress aggressively beyond this MEMORY_HARD_LIMIT = 2000 # ~500 tokens — absolute cap, never exceed # ========================= # HELPERS # ========================= def clean_assistant_message(text: str) -> str: """ Strip code blocks and inline code backticks from assistant responses. Model picks up function/class names from prose naturally. Cap at 800 chars to give model more context from long responses. """ # Remove full code blocks entirely text = re.sub(r"```[\w]*\n?.*?```", "", text, flags=re.DOTALL) # Remove inline code backticks but keep the text inside text = re.sub(r"`([^`]+)`", r"\1", text) # Collapse whitespace text = re.sub(r"\s{2,}", " ", text).strip() return text[:800] def enforce_memory_limit(text: str) -> str: """ Three-stage memory length enforcement. Stage 1 — Under 1600 chars (~400 tokens): Memory is healthy. Return as-is. Stage 2 — Between 1600 and 2000 chars (soft limit): Memory is getting long. Keep complete sentences that fit within 2000 chars. Oldest appended facts may be trimmed; core stack in early sentences is preserved. Stage 3 — Over 2000 chars (hard limit): Force trim to last complete sentence before 2000 chars. Never cuts mid-sentence. """ # Stage 1 — healthy if len(text) <= MEMORY_SOFT_LIMIT: return text # Stage 2 — soft limit: trim to complete sentences within hard limit if len(text) <= MEMORY_HARD_LIMIT: sentences = re.split(r"(?<=[.!?])\s+", text) result = "" for sentence in sentences: candidate = (result + " " + sentence).strip() if len(candidate) <= MEMORY_HARD_LIMIT: result = candidate else: break return result.strip() # Stage 3 — hard limit: force trim at last period before 2000 chars trimmed = text[:MEMORY_HARD_LIMIT] last_period = trimmed.rfind(".") if last_period != -1: trimmed = trimmed[:last_period + 1] return trimmed.strip() def strip_backticks(text: str) -> str: """Remove any backtick formatting that leaks into memory output.""" return re.sub(r"`([^`]+)`", r"\1", text) # ========================= # REQUEST MODEL # ========================= class SummaryRequest(BaseModel): old_memory: str = "" user_message: str assistant_message: str # ========================= # SUMMARY ENDPOINT # ========================= @app.post("/generate-summary") def generate_summary(req: SummaryRequest): old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)" user_message = req.user_message.strip() assistant_message = clean_assistant_message(req.assistant_message) user_content = f"""EXISTING MEMORY: {old_memory} USER SAID: {user_message} ASSISTANT REPLIED: {assistant_message} UPDATED MEMORY:""" messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_content}, ] # ========================= # FORMAT CHAT # ========================= text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer( text, return_tensors="pt" ).to(model.device) # ========================= # GENERATE # ========================= output = model.generate( **inputs, max_new_tokens=400, do_sample=False, repetition_penalty=1.15, eos_token_id=tokenizer.eos_token_id, ) # ========================= # DECODE # ========================= result = tokenizer.decode( output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True ).strip() # ========================= # CLEAN — stop phrases # ========================= stop_phrases = [ "<|im_end|>", "<|endoftext|>", "UPDATED MEMORY:", "EXISTING MEMORY:", "USER SAID:", "ASSISTANT REPLIED:", "STRICT RULES:", "EXAMPLE ", "Assistant:", "User:", ] for phrase in stop_phrases: if phrase in result: result = result.split(phrase)[0].strip() # ========================= # CLEAN — strip filler # ========================= for pattern in FILLER_PATTERNS: result = re.sub(pattern, "", result, flags=re.IGNORECASE) # ========================= # CLEAN — strip backticks # ========================= result = strip_backticks(result) # ========================= # CLEAN — deduplicate lines # ========================= seen, lines = set(), [] for line in result.splitlines(): line = line.strip() if line and line not in seen: seen.add(line) lines.append(line) result = " ".join(lines).strip() result = re.sub(r"\s{2,}", " ", result).strip() # ========================= # HARD MEMORY LENGTH CAP # ========================= result = enforce_memory_limit(result) return {"memory": result} # ========================= # HEALTH # ========================= @app.get("/") def root(): return { "status": "Memory Summarizer Running 🚀", "model": MODEL_ID, "device": device.upper(), "memory_soft_limit": f"{MEMORY_SOFT_LIMIT} chars (~400 tokens)", "memory_hard_limit": f"{MEMORY_HARD_LIMIT} chars (~500 tokens)", } # ========================= # RUN # ========================= if __name__ == "__main__": uvicorn.run("app:app", host="0.0.0.0", port=7860)