from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
import uvicorn

# =========================
# APP
# =========================
app = FastAPI()

# =========================
# MODEL
# =========================
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
print(f"Loading Memory Summarizer: {MODEL_ID}")

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = model.to(device)
print(f"Loaded on {device.upper()}")

# =========================
# SYSTEM PROMPT
# =========================
SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into dense paragraphs.

EXAMPLE 1:
EXISTING MEMORY: (none)
USER SAID: I am building a weather app using React and OpenWeatherMap API.
ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.

EXAMPLE 2:
EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.
USER SAID: How do I cache the weather data so I do not hit the API limit?
ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key in .env. Responses cached in localStorage with 10-minute timestamp expiry to avoid API rate limit.

EXAMPLE 3:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location.
USER SAID: How do job seekers apply for a job?
ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field (applied, reviewed, rejected, accepted), resume FileField stored in S3.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.

EXAMPLE 4:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.
USER SAID: I want to add search and filters for title, location, and salary range.
ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params title, location, salary_min, salary_max to job list endpoint.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.

EXAMPLE 5:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.
USER SAID: How do I notify applicants when their application status changes?
ASSISTANT REPLIED: Use Django signals. On Application post_save, detect status change and trigger email via Celery async task using SendGrid.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.

EXAMPLE 6:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.
USER SAID: How do I deploy this on a VPS?
ASSISTANT REPLIED: Docker Compose with services for Django, React, PostgreSQL, Redis, Celery. Serve Django via Gunicorn behind nginx. Certbot for SSL. Secrets in .env file.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.

EXAMPLE 7:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.
USER SAID: What is still left to build?
ASSISTANT REPLIED: Admin panel for moderating posts, pagination on job listings, rate limiting on API, frontend loading states and error handling.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL. Pending: admin moderation panel, pagination, API rate limiting, frontend loading states and error handling.

STRICT RULES:
- Output ONLY the updated memory. No labels. No preamble. No explanation.
- COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter and denser.
- Keep ALL technical facts: stack, frameworks, APIs, models, field names, architecture decisions, unfinished tasks, user preferences.
- Add new facts merged in naturally, not appended as separate sentences.
- No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to", "it is worth noting".
- No questions. No advice. No "you". No "I".
- Dense technical paragraph. Maximum 8 sentences."""

# =========================
# FILLER PATTERNS
# =========================
FILLER_PATTERNS = [
    r"This (setup|approach|system|solution|architecture|method|design)\b[^.]*\.",
    r"ensuring\s[^.]*\.",
    r"while maintaining\s[^.]*\.",
    r"enhances\s[^.]*\.",
    r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
    r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
    r"in order to\s[^.]*\.",
    r"To (enhance|improve|ensure|enable)\s[^.]*\.",
    r"It is worth noting that\s[^.]*\.",
    r"Additionally,\s*(it|this)\s[^.]*\.",
]

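# A minimal sketch of how these patterns behave (hypothetical input; each
# pattern deletes the whole matched filler sentence):
#
#   text = "Cache responses in Redis. This approach ensures low latency."
#   for p in FILLER_PATTERNS:
#       text = re.sub(p, "", text, flags=re.IGNORECASE)
#   # -> "Cache responses in Redis. "
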
# =========================
# MEMORY LIMIT CONFIG
# =========================
MEMORY_SOFT_LIMIT = 1600  # ~400 tokens; compress aggressively beyond this
MEMORY_HARD_LIMIT = 2000  # ~500 tokens; absolute cap, never exceed

# =========================
# HELPERS
# =========================
def clean_assistant_message(text: str) -> str:
    """
    Strip code blocks and inline code backticks from assistant responses.
    Model picks up function/class names from prose naturally.
    Cap at 800 chars to give model more context from long responses.
    """
    # Remove full code blocks entirely
    text = re.sub(r"```[\w]*\n?.*?```", "", text, flags=re.DOTALL)
    # Remove inline code backticks but keep the text inside
    text = re.sub(r"`([^`]+)`", r"\1", text)
    # Collapse whitespace
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text[:800]

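# Example of the cleaning behavior (illustrative input):
#
#   raw = "Use `axios.get` here:\n```js\nawait axios.get(url)\n```\nThen cache it."
#   clean_assistant_message(raw)
#   # -> "Use axios.get here: Then cache it."
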
def enforce_memory_limit(text: str) -> str:
    """
    Three-stage memory length enforcement.

    Stage 1 - under 1600 chars (~400 tokens):
        Memory is healthy. Return as-is.

    Stage 2 - between 1600 and 2000 chars (soft limit):
        Memory is getting long. Keep complete sentences
        that fit within 2000 chars. Oldest appended facts
        may be trimmed; core stack in early sentences is preserved.

    Stage 3 - over 2000 chars (hard limit):
        Force trim to last complete sentence before 2000 chars.
        Never cuts mid-sentence.
    """
    # Stage 1: healthy
    if len(text) <= MEMORY_SOFT_LIMIT:
        return text

    # Stage 2: soft limit; trim to complete sentences within the hard limit
    if len(text) <= MEMORY_HARD_LIMIT:
        sentences = re.split(r"(?<=[.!?])\s+", text)
        result = ""
        for sentence in sentences:
            candidate = (result + " " + sentence).strip()
            if len(candidate) <= MEMORY_HARD_LIMIT:
                result = candidate
            else:
                break
        return result.strip()

    # Stage 3: hard limit; force trim at the last period before 2000 chars
    trimmed = text[:MEMORY_HARD_LIMIT]
    last_period = trimmed.rfind(".")
    if last_period != -1:
        trimmed = trimmed[:last_period + 1]
    return trimmed.strip()

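# Quick property check (illustrative): the output never exceeds the hard
# cap and never ends mid-sentence.
#
#   memory = "User prefers PostgreSQL. " * 100   # ~2500 chars, over the hard limit
#   trimmed = enforce_memory_limit(memory)
#   assert len(trimmed) <= MEMORY_HARD_LIMIT and trimmed.endswith(".")
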
def strip_backticks(text: str) -> str:
    """Remove any backtick formatting that leaks into memory output."""
    return re.sub(r"`([^`]+)`", r"\1", text)

# =========================
# REQUEST MODEL
# =========================
class SummaryRequest(BaseModel):
    old_memory: str = ""
    user_message: str
    assistant_message: str

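# Example request body this schema accepts (values are illustrative):
#
#   {
#       "old_memory": "User building React weather app.",
#       "user_message": "How do I cache responses?",
#       "assistant_message": "Use localStorage with a timestamp."
#   }
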
# =========================
# SUMMARY ENDPOINT
# =========================
@app.post("/summarize")  # route path is an assumption; match it to the caller
def generate_summary(req: SummaryRequest):
    old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
    user_message = req.user_message.strip()
    assistant_message = clean_assistant_message(req.assistant_message)

    user_content = f"""EXISTING MEMORY: {old_memory}
USER SAID: {user_message}
ASSISTANT REPLIED: {assistant_message}
UPDATED MEMORY:"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    # =========================
    # FORMAT CHAT
    # =========================
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(
        text,
        return_tensors="pt",
    ).to(model.device)

    # =========================
    # GENERATE
    # =========================
    output = model.generate(
        **inputs,
        max_new_tokens=400,
        do_sample=False,
        repetition_penalty=1.15,
        eos_token_id=tokenizer.eos_token_id,
    )

    # =========================
    # DECODE
    # =========================
    result = tokenizer.decode(
        output[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    ).strip()

    # =========================
    # CLEAN: stop phrases
    # =========================
    stop_phrases = [
        "<|im_end|>", "<|endoftext|>",
        "UPDATED MEMORY:", "EXISTING MEMORY:",
        "USER SAID:", "ASSISTANT REPLIED:",
        "STRICT RULES:", "EXAMPLE ",
        "Assistant:", "User:",
    ]
    for phrase in stop_phrases:
        if phrase in result:
            result = result.split(phrase)[0].strip()

    # =========================
    # CLEAN: strip filler
    # =========================
    for pattern in FILLER_PATTERNS:
        result = re.sub(pattern, "", result, flags=re.IGNORECASE)

    # =========================
    # CLEAN: strip backticks
    # =========================
    result = strip_backticks(result)

    # =========================
    # CLEAN: deduplicate lines
    # =========================
    seen, lines = set(), []
    for line in result.splitlines():
        line = line.strip()
        if line and line not in seen:
            seen.add(line)
            lines.append(line)
    result = " ".join(lines).strip()
    result = re.sub(r"\s{2,}", " ", result).strip()

    # =========================
    # HARD MEMORY LENGTH CAP
    # =========================
    result = enforce_memory_limit(result)
    return {"memory": result}

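# Example call (illustrative; assumes the /summarize path used in the
# decorator above and the default port configured below):
#
#   curl -X POST http://localhost:7860/summarize \
#     -H "Content-Type: application/json" \
#     -d '{"old_memory": "", "user_message": "...", "assistant_message": "..."}'
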
# =========================
# HEALTH
# =========================
@app.get("/")  # health-check route; path is an assumption
def root():
    return {
        "status": "Memory Summarizer Running",
        "model": MODEL_ID,
        "device": device.upper(),
        "memory_soft_limit": f"{MEMORY_SOFT_LIMIT} chars (~400 tokens)",
        "memory_hard_limit": f"{MEMORY_HARD_LIMIT} chars (~500 tokens)",
    }

# =========================
# RUN
# =========================
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)