# summarizer / app.py
# Valtry's picture
# Update app.py
# 6fd963b verified
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
import uvicorn
# =========================
# APP
# =========================
# ASGI application object. The route decorators further down register their
# handlers on it, and the __main__ guard serves it via the "app:app" string.
app = FastAPI()
# =========================
# MODEL
# =========================
# Hugging Face model id of the causal LM used to rewrite the memory.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# NOTE: the emoji and dash in the log lines below were stored as mojibake
# (UTF-8 bytes decoded with a legacy code page); they are restored here.
print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")

# Use the GPU when torch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # Half precision on GPU, full precision on CPU.
    dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = model.to(device)

print(f"✅ Loaded on {device.upper()}")
# =========================
# SYSTEM PROMPT
# =========================
# Few-shot system prompt sent as the "system" chat message on every request.
# The seven worked examples use the same EXISTING MEMORY / USER SAID /
# ASSISTANT REPLIED / UPDATED MEMORY layout that generate_summary() builds for
# the user message, followed by the output rules. The label strings used here
# are also the stop phrases generate_summary() cuts on, so keep them in sync.
SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into dense paragraphs.
EXAMPLE 1:
EXISTING MEMORY: (none)
USER SAID: I am building a weather app using React and OpenWeatherMap API.
ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.
EXAMPLE 2:
EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.
USER SAID: How do I cache the weather data so I do not hit the API limit?
ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key in .env. Responses cached in localStorage with 10-minute timestamp expiry to avoid API rate limit.
EXAMPLE 3:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location.
USER SAID: How do job seekers apply for a job?
ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field (applied, reviewed, rejected, accepted), resume FileField stored in S3.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.
EXAMPLE 4:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.
USER SAID: I want to add search and filters for title, location, and salary range.
ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params title, location, salary_min, salary_max to job list endpoint.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.
EXAMPLE 5:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.
USER SAID: How do I notify applicants when their application status changes?
ASSISTANT REPLIED: Use Django signals. On Application post_save, detect status change and trigger email via Celery async task using SendGrid.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.
EXAMPLE 6:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.
USER SAID: How do I deploy this on a VPS?
ASSISTANT REPLIED: Docker Compose with services for Django, React, PostgreSQL, Redis, Celery. Serve Django via Gunicorn behind nginx. Certbot for SSL. Secrets in .env file.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.
EXAMPLE 7:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.
USER SAID: What is still left to build?
ASSISTANT REPLIED: Admin panel for moderating posts, pagination on job listings, rate limiting on API, frontend loading states and error handling.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL. Pending: admin moderation panel, pagination, API rate limiting, frontend loading states and error handling.
STRICT RULES:
- Output ONLY the updated memory. No labels. No preamble. No explanation.
- COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter and denser.
- Keep ALL technical facts: stack, frameworks, APIs, models, field names, architecture decisions, unfinished tasks, user preferences.
- Add new facts merged in naturally, not appended as separate sentences.
- No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to", "it is worth noting".
- No questions. No advice. No "you". No "I".
- Dense technical paragraph. Maximum 8 sentences."""
# =========================
# FILLER PATTERNS
# =========================
# Regexes (applied case-insensitively by the endpoint) that delete a filler
# phrase together with the rest of its sentence.
#
# Tail shared by every pattern: consume up to the period that actually ENDS
# the sentence, i.e. a "." followed by whitespace or end of text. A bare
# `[^.]*\.` stops at the first dot of any kind, so filler mentioning ".env",
# "Node.js" or "3.11" was cut in half and left fragments such as "env files."
# behind in the stored memory.
_SENTENCE_TAIL = r"(?:[^.]|\.(?!\s|$))*\.(?=\s|$)"

# Each pattern starts with \b so it only fires on whole words.
FILLER_PATTERNS = [
    r"\bThis (setup|approach|system|solution|architecture|method|design)\b" + _SENTENCE_TAIL,
    r"\bensuring\s" + _SENTENCE_TAIL,
    r"\bwhile maintaining\s" + _SENTENCE_TAIL,
    r"\benhances\s" + _SENTENCE_TAIL,
    r"\bThis (ensures|allows|enables|provides|helps|makes|improves)\s" + _SENTENCE_TAIL,
    r"\bfor (better|improved|efficient|effective|optimal)\s" + _SENTENCE_TAIL,
    r"\bin order to\s" + _SENTENCE_TAIL,
    r"\bTo (enhance|improve|ensure|enable)\s" + _SENTENCE_TAIL,
    r"\bIt is worth noting that\s" + _SENTENCE_TAIL,
    r"\bAdditionally,\s*(it|this)\s" + _SENTENCE_TAIL,
]
# =========================
# MEMORY LIMIT CONFIG
# =========================
# Both limits are character counts consumed by enforce_memory_limit(). The
# token figures are rough estimates (they imply about 4 characters per token).
MEMORY_SOFT_LIMIT = 1600 # ~400 tokens — memory at or below this length is returned untouched
MEMORY_HARD_LIMIT = 2000 # ~500 tokens — absolute cap on the returned memory length
# =========================
# HELPERS
# =========================
def clean_assistant_message(text: str, max_chars: int = 800) -> str:
    """Reduce an assistant reply to short plain prose for the summarizer prompt.

    Fenced code blocks are removed entirely, inline-code backticks are dropped
    while the enclosed text is kept, whitespace runs are collapsed, and the
    result is capped at ``max_chars`` characters.

    Args:
        text: Raw assistant reply, possibly containing Markdown code.
        max_chars: Maximum length of the returned string (default 800).

    Returns:
        The cleaned, truncated reply.
    """
    # Remove complete fenced blocks. Substituting a space (not "") keeps the
    # words on either side of a fence from being glued together.
    text = re.sub(r"```[\w]*\n?.*?```", " ", text, flags=re.DOTALL)
    # A reply that was cut off mid-block has an opening fence with no closing
    # one; the pattern above cannot match it, so drop everything from that
    # fence onward instead of letting raw code eat the character budget.
    text = re.sub(r"```.*\Z", " ", text, flags=re.DOTALL)
    # Remove inline code backticks but keep the text inside.
    text = re.sub(r"`([^`]+)`", r"\1", text)
    # Collapse whitespace runs (including blank lines) to single spaces.
    text = re.sub(r"\s{2,}", " ", text).strip()
    # rstrip so a cut landing right after a space leaves no trailing blank.
    return text[:max_chars].rstrip()
def enforce_memory_limit(
    text: str,
    *,
    soft_limit: "int | None" = None,
    hard_limit: "int | None" = None,
) -> str:
    """Cap the memory length without cutting a sentence in half.

    Behaviour by length of ``text``:

    * At or below the soft limit: returned unchanged.
    * Above the soft limit: whole sentences are kept, in order, for as long
      as the running total stays within the hard limit. Text that already
      fits is therefore returned intact (whitespace between sentences is
      normalised to one space); longer text loses its trailing sentences.
    * If even the first sentence exceeds the hard limit, the text is cut at
      the hard limit as a last resort.

    Sentence boundaries are ``.``, ``!`` or ``?`` followed by whitespace. The
    previous implementation looked for the last ``"."`` of any kind, so dots
    inside tokens such as ``.env`` or ``3.11`` were mistaken for sentence
    ends and the memory could finish on a fragment like ``"secrets in ."``.

    Args:
        text: Memory paragraph to bound.
        soft_limit: Length up to which text is returned untouched. Defaults
            to the module-level ``MEMORY_SOFT_LIMIT``.
        hard_limit: Maximum length of the returned text. Defaults to the
            module-level ``MEMORY_HARD_LIMIT``.

    Returns:
        ``text`` or a sentence-aligned prefix of it, never longer than the
        hard limit once the soft limit has been exceeded.
    """
    # Resolve defaults lazily so the module constants stay the single source
    # of truth for callers that do not override the limits.
    if soft_limit is None:
        soft_limit = MEMORY_SOFT_LIMIT
    if hard_limit is None:
        hard_limit = MEMORY_HARD_LIMIT

    # Healthy memory: nothing to do.
    if len(text) <= soft_limit:
        return text

    # Accumulate whole sentences until the next one would break the cap.
    kept = ""
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        candidate = f"{kept} {sentence}".strip()
        if len(candidate) > hard_limit:
            break
        kept = candidate
    if kept:
        return kept

    # No complete sentence fits: a hard cut is the only way to honour the cap.
    return text[:hard_limit].strip()
def strip_backticks(text: str) -> str:
    """Drop inline-code backtick pairs from ``text``, keeping what they wrap."""
    inline_code = re.compile(r"`([^`]+)`")
    unwrapped = inline_code.sub(r"\1", text)
    return unwrapped
# =========================
# REQUEST MODEL
# =========================
class SummaryRequest(BaseModel):
    """Request body accepted by POST /generate-summary."""

    # Previously stored memory. Optional; a blank value is sent to the model
    # as the placeholder "(none)".
    old_memory: str = ""
    # Latest user turn (required); surrounding whitespace is stripped.
    user_message: str
    # Latest assistant turn (required); code is stripped and the text is
    # shortened by clean_assistant_message() before it reaches the model.
    assistant_message: str
# =========================
# SUMMARY ENDPOINT
# =========================
def _clean_generated_memory(raw: str) -> str:
    """Post-process raw model output into one clean, bounded memory paragraph."""
    result = raw.strip()

    # The prompt ends with "UPDATED MEMORY:" and the model sometimes echoes
    # that label before its answer. Cutting at the first stop phrase would
    # then keep only the empty text in front of the label and throw the whole
    # memory away, so peel a leading label off first.
    leading_label = "UPDATED MEMORY:"
    while result.startswith(leading_label):
        result = result[len(leading_label):].lstrip()

    # Anything from one of these markers onward is the model continuing the
    # few-shot transcript rather than answering; keep only the text before it.
    stop_phrases = [
        "<|im_end|>", "<|endoftext|>",
        "UPDATED MEMORY:", "EXISTING MEMORY:",
        "USER SAID:", "ASSISTANT REPLIED:",
        "STRICT RULES:", "EXAMPLE ",
        "Assistant:", "User:",
    ]
    for phrase in stop_phrases:
        if phrase in result:
            result = result.split(phrase)[0].strip()

    # Strip filler sentences the prompt forbids but the model may still emit.
    for pattern in FILLER_PATTERNS:
        result = re.sub(pattern, "", result, flags=re.IGNORECASE)

    # Strip backtick formatting that leaked into the output.
    result = strip_backticks(result)

    # Drop blank and repeated lines (first occurrence wins), then flatten the
    # remainder into a single paragraph.
    seen, lines = set(), []
    for line in result.splitlines():
        line = line.strip()
        if line and line not in seen:
            seen.add(line)
            lines.append(line)
    result = " ".join(lines).strip()
    result = re.sub(r"\s{2,}", " ", result).strip()

    # Final length cap.
    return enforce_memory_limit(result)


@app.post("/generate-summary")
def generate_summary(req: SummaryRequest):
    """Merge the latest exchange into the stored memory and return the result.

    Builds a chat prompt from the system prompt plus the existing memory and
    the new user/assistant turn, generates greedily, and cleans the output.

    Args:
        req: Existing memory (may be blank) and the latest user and assistant
            messages.

    Returns:
        ``{"memory": <updated memory paragraph>}``. If cleaning leaves nothing
        usable, the previous memory is returned instead of an empty string so
        a caller that stores the response does not wipe what it already had.
    """
    previous_memory = req.old_memory.strip()
    old_memory = previous_memory or "(none)"
    user_message = req.user_message.strip()
    assistant_message = clean_assistant_message(req.assistant_message)

    # Same field layout as the worked examples in SYSTEM_PROMPT.
    user_content = f"""EXISTING MEMORY: {old_memory}
USER SAID: {user_message}
ASSISTANT REPLIED: {assistant_message}
UPDATED MEMORY:"""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    # Render the chat template to text, ending with the assistant-turn opener.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(
        text,
        return_tensors="pt"
    ).to(model.device)

    # Greedy decoding with a mild repetition penalty.
    output = model.generate(
        **inputs,
        max_new_tokens=400,
        do_sample=False,
        repetition_penalty=1.15,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The output holds prompt + continuation; decode only the new tokens.
    prompt_length = inputs.input_ids.shape[1]
    raw = tokenizer.decode(
        output[0][prompt_length:],
        skip_special_tokens=True
    )

    result = _clean_generated_memory(raw)
    if not result:
        # Nothing survived cleaning: keep the memory the caller already has.
        result = previous_memory
    return {"memory": result}
# =========================
# HEALTH
# =========================
@app.get("/")
def root():
    """Health check: report that the service is up and how it is configured.

    Returns:
        A dict with a status line, the model id, the device in upper case,
        and the soft/hard memory limits as human-readable strings.
    """
    # Token figures are derived from the limits (about 4 characters per
    # token, matching the estimates beside the constants) so they cannot
    # drift if the limits are changed. The emoji was stored as mojibake and
    # is restored here.
    return {
        "status": "Memory Summarizer Running 🚀",
        "model": MODEL_ID,
        "device": device.upper(),
        "memory_soft_limit": f"{MEMORY_SOFT_LIMIT} chars (~{MEMORY_SOFT_LIMIT // 4} tokens)",
        "memory_hard_limit": f"{MEMORY_HARD_LIMIT} chars (~{MEMORY_HARD_LIMIT // 4} tokens)",
    }
# =========================
# RUN
# =========================
# Serve the app with uvicorn when this file is executed directly.
# "app:app" is an import string (module "app", attribute "app"), so it
# resolves only when this module is importable under the name "app".
if __name__ == "__main__":
    # 0.0.0.0 listens on every network interface; the port is fixed at 7860.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)