Spaces:

Valtry
/

summarizer

Sleeping

App Files Files Community

Valtry committed 13 days ago

Commit

0d2f40b

verified ·

1 Parent(s): 756a711

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -90

app.py CHANGED Viewed

@@ -12,111 +12,84 @@ import uvicorn
 app = FastAPI()
 # =========================
-# MODEL CONFIG
 # =========================
-# Swap this to upgrade intelligence:
-# "Qwen/Qwen2.5-0.5B-Instruct"    → lightest, weakest
-# "Qwen/Qwen2.5-1.5B-Instruct"    → recommended sweet spot
-# "Qwen/Qwen2.5-3B-Instruct"      → best Qwen quality, tight on free tier
-# "HuggingFaceTB/SmolLM2-1.7B-Instruct" → good alternative
-# "meta-llama/Llama-3.2-1B-Instruct"    → good, needs HF token
-# "meta-llama/Llama-3.2-3B-Instruct"    → strong, needs HF token
-# "google/gemma-2-2b-it"                → solid, needs HF token
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
-# Models that need a HuggingFace token (set HF_TOKEN in Space secrets)
-GATED_MODELS = [
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "meta-llama/Llama-3.2-3B-Instruct",
-    "google/gemma-2-2b-it",
-    "microsoft/Phi-3.5-mini-instruct",
-]
 print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-import os
-hf_token = os.environ.get("HF_TOKEN", None)
-use_token = hf_token if any(m in MODEL_ID for m in GATED_MODELS) else None
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=use_token)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    device_map="auto",
-    token=use_token
 )
-print(f"✅ Loaded {MODEL_ID} on {device.upper()}")
 # =========================
 # SYSTEM PROMPT
 # =========================
-SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge facts.
 EXAMPLE 1:
 EXISTING MEMORY: (none)
-USER SAID: I am building a chat app with Node.js and MongoDB.
-ASSISTANT REPLIED: Use Socket.io rooms. Store messages with roomId and timestamp.
-UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
 EXAMPLE 2:
-EXISTING MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
-USER SAID: How do I add authentication?
-ASSISTANT REPLIED: Use JWT. Verify token on every Socket.io connection via middleware.
-UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp. Auth via JWT verified on Socket.io connections through middleware.
 EXAMPLE 3:
-EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented.
-USER SAID: How do I add rate limiting?
-ASSISTANT REPLIED: Use slowapi. Attach limiter to app and decorate routes.
-UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented. Rate limiting via slowapi on routes.
 EXAMPLE 4:
-EXISTING MEMORY: User building SaaS dashboard with Next.js and FastAPI. PostgreSQL for database.
-USER SAID: Should I use REST or GraphQL?
-ASSISTANT REPLIED: Use REST for fixed data shapes. GraphQL for flexible querying.
-UPDATED MEMORY: User building SaaS dashboard with Next.js and FastAPI using PostgreSQL. Chose REST over GraphQL due to fixed data shapes.
 EXAMPLE 5:
-EXISTING MEMORY: User building Python scraper with BeautifulSoup. Stores results in CSV.
-USER SAID: My scraper gets blocked after 50 requests.
-ASSISTANT REPLIED: Add random delays, rotate user-agent headers, use proxy pool.
-UPDATED MEMORY: User building Python scraper with BeautifulSoup storing results in CSV. Anti-blocking via random delays, rotating user-agent headers, and proxy pool.
 EXAMPLE 6:
-EXISTING MEMORY: User building mobile app in React Native with Firebase.
-USER SAID: I am switching from Firebase to Supabase.
-ASSISTANT REPLIED: Replace Firebase Auth with Supabase Auth. Replace Firestore with Supabase PostgreSQL.
-UPDATED MEMORY: User building mobile app in React Native. Switched from Firebase to Supabase. Auth via Supabase Auth, database via Supabase PostgreSQL.
 EXAMPLE 7:
-EXISTING MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending.
-USER SAID: How do I send order confirmation emails?
-ASSISTANT REPLIED: Use Django send_mail or SendGrid. Trigger inside Stripe webhook on payment_intent.succeeded.
-UPDATED MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending. Order confirmation emails via SendGrid triggered on payment_intent.succeeded inside Stripe webhook.
-EXAMPLE 8:
-EXISTING MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations.
-USER SAID: I want to add long-term memory to avoid token limit issues.
-ASSISTANT REPLIED: Use Qwen2.5-0.5B to recursively summarize memory. Store in Supabase. Inject before recent chat history. Truncate large responses before summarizing.
-UPDATED MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations. Long-term memory via Qwen2.5-0.5B recursive summarization stored in Supabase, injected before recent history. Large responses truncated before summarizing.
 STRICT RULES:
 - Output ONLY the updated memory. No labels. No preamble. No explanation.
-- Keep ALL facts from EXISTING MEMORY unless directly contradicted.
-- Add only new facts from USER SAID and ASSISTANT REPLIED.
-- No filler: no "ensuring", "enhances", "maintaining", "this setup", "this approach".
 - No questions. No advice. No "you". No "I".
-- One short dense paragraph only."""
 # =========================
-# FILLER PHRASES TO STRIP
 # =========================
 FILLER_PATTERNS = [
@@ -126,8 +99,66 @@ FILLER_PATTERNS = [
     r"enhances\s[^.]*\.",
     r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
     r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
 ]
 # =========================
 # REQUEST MODEL
 # =========================
@@ -146,7 +177,7 @@ def generate_summary(req: SummaryRequest):
     old_memory        = req.old_memory.strip() if req.old_memory.strip() else "(none)"
     user_message      = req.user_message.strip()
-    assistant_message = req.assistant_message.strip()[:500]
     user_content = f"""EXISTING MEMORY: {old_memory}
 USER SAID: {user_message}
@@ -158,10 +189,6 @@ UPDATED MEMORY:"""
         {"role": "user",   "content": user_content},
     ]
-    # =========================
-    # FORMAT CHAT
-    # =========================
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
@@ -173,22 +200,14 @@ UPDATED MEMORY:"""
         return_tensors="pt"
     ).to(model.device)
-    # =========================
-    # GENERATE
-    # =========================
     output = model.generate(
         **inputs,
-        max_new_tokens=200,
         do_sample=False,
         repetition_penalty=1.15,
         eos_token_id=tokenizer.eos_token_id,
     )
-    # =========================
-    # DECODE
-    # =========================
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
@@ -230,12 +249,13 @@ UPDATED MEMORY:"""
             lines.append(line)
     result = " ".join(lines).strip()
     # =========================
-    # CLEAN — fix double spaces
     # =========================
-    result = re.sub(r"\s{2,}", " ", result).strip()
     return {"memory": result}
@@ -251,13 +271,5 @@ def root():
         "device": device.upper()
     }
-# =========================
-# RUN
-# =========================
 if __name__ == "__main__":
-    uvicorn.run(
-        "app:app",
-        host="0.0.0.0",
-        port=7860
-    )

 app = FastAPI()
 # =========================
+# MODEL
 # =========================
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
 device = "cuda" if torch.cuda.is_available() else "cpu"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    device_map="auto"
 )
+print(f"✅ Loaded on {device.upper()}")
 # =========================
 # SYSTEM PROMPT
 # =========================
+SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into one short dense paragraph.
 EXAMPLE 1:
 EXISTING MEMORY: (none)
+USER SAID: I am building a weather app using React and OpenWeatherMap API.
+ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env.
+UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env.
 EXAMPLE 2:
+EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env.
+USER SAID: How do I cache the weather data so I do not hit the API limit?
+ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API.
+UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios, cached in localStorage with 10-minute expiry to avoid API limit.
 EXAMPLE 3:
+EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location.
+USER SAID: How do job seekers apply for a job?
+ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field, resume FileField in S3.
+UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Custom user model with company/jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field, resume stored in S3.
 EXAMPLE 4:
+EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Custom user model with company/jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field, resume stored in S3.
+USER SAID: I want to add search and filters for title, location, and salary range.
+ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params to job list endpoint.
+UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Company/jobseeker roles. Job and Application models complete with S3 resumes. Job search via django-filter and Q objects on title, location, salary range.
 EXAMPLE 5:
+EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth. Company/jobseeker roles. Job and Application models complete with S3 resumes. Job search via django-filter and Q objects on title, location, salary range.
+USER SAID: How do I notify applicants when status changes?
+ASSISTANT REPLIED: Use Django signals on Application post_save. Trigger SendGrid email via Celery async task.
+UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status change notifications via Django signals and Celery tasks.
 EXAMPLE 6:
+EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status change notifications via Django signals and Celery tasks.
+USER SAID: How do I deploy this on a VPS?
+ASSISTANT REPLIED: Docker Compose with Django, React, PostgreSQL, Redis, Celery services. Gunicorn behind nginx. Certbot for SSL.
+UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status notifications via Django signals. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL.
 EXAMPLE 7:
+EXISTING MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job and Application models with S3 resumes and django-filter search. Status notifications via Django signals. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL.
+USER SAID: What is still left to build?
+ASSISTANT REPLIED: Admin panel, pagination, rate limiting, frontend loading states and error handling.
+UPDATED MEMORY: User building job board with Django, React, PostgreSQL, JWT auth, Celery, SendGrid, Redis. Company/jobseeker roles. Job, Application models with S3 resumes, django-filter search, Docker Compose deployment. Pending: admin panel, pagination, rate limiting, frontend loading states and error handling.
 STRICT RULES:
 - Output ONLY the updated memory. No labels. No preamble. No explanation.
+- COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter.
+- Keep ALL technical facts. Remove only filler words.
+- Add new facts merged in, not appended as separate sentences.
+- No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to".
 - No questions. No advice. No "you". No "I".
+- One short dense paragraph. Maximum 3 sentences."""
 # =========================
+# FILLER PATTERNS
 # =========================
 FILLER_PATTERNS = [
     r"enhances\s[^.]*\.",
     r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
     r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
+    r"in order to\s[^.]*\.",
+    r"To (enhance|improve|ensure|enable)\s[^.]*\.",
 ]
+# =========================
+# HELPERS
+# =========================
def clean_assistant_message(text: str, max_chars: int = 500) -> str:
    """
    Reduce an assistant reply to its prose plus the names defined in its code.

    Fenced code blocks are removed, inline-code backticks are dropped (the
    text inside them is kept), and runs of two or more whitespace characters
    are collapsed to a single space. Names introduced by ``def``, ``class``,
    ``const``, ``let``, ``var`` or ``function`` inside the removed code blocks
    are appended as a ``Key identifiers: a, b.`` suffix.

    Args:
        text: Raw assistant reply, possibly containing Markdown code.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The cleaned text, never longer than ``max_chars``. When the result
        would overflow, the prose is shortened first so that the identifier
        suffix is not cut off.
    """
    if max_chars <= 0:
        return ""

    fence_pattern = r"```[\w]*\n?(.*?)```"

    # Collect declared names before the code is discarded. The leading \b
    # stops keywords from matching inside longer words (e.g. "typedef"),
    # and the membership test keeps each name only once, in first-seen order.
    identifiers = []
    for block in re.findall(fence_pattern, text, re.DOTALL):
        for name in re.findall(
            r"\b(?:def|class|const|let|var|function)\s+(\w+)", block
        ):
            if name not in identifiers:
                identifiers.append(name)

    prose = re.sub(fence_pattern, "", text, flags=re.DOTALL)
    # Unwrap inline code but keep its text.
    prose = re.sub(r"`([^`]+)`", r"\1", prose)
    prose = re.sub(r"\s{2,}", " ", prose).strip()

    if not identifiers:
        return prose[:max_chars]

    def build_suffix(names):
        return "Key identifiers: " + ", ".join(names) + "."

    suffix = build_suffix(identifiers)
    combined = f"{prose} {suffix}" if prose else suffix
    if len(combined) <= max_chars:
        return combined

    # Overflow: the suffix may use whatever the prose leaves free, and at
    # least half of the budget, so neither part can crowd out the other.
    suffix_budget = max(max_chars // 2, max_chars - len(prose) - 1)
    kept = list(identifiers)
    while kept and len(build_suffix(kept)) > suffix_budget:
        kept.pop()
    if not kept:
        return prose[:max_chars]

    suffix = build_suffix(kept)
    head = prose[: max_chars - len(suffix) - 1].rstrip()
    return f"{head} {suffix}" if head else suffix
def enforce_memory_limit(text: str, max_chars: int = 600) -> str:
    """
    Cap the memory text at ``max_chars`` characters.

    Text that already fits is returned unchanged. Otherwise the text is split
    after ``.``, ``!`` or ``?`` followed by whitespace, and as many leading
    sentences as fit are kept, joined by single spaces. If not even the first
    sentence fits, the text is cut at the last space before the limit (or
    hard-cut when there is none) instead of returning an empty string.

    Args:
        text: Memory paragraph to limit.
        max_chars: Maximum number of characters in the result.

    Returns:
        A string no longer than ``max_chars``.
    """
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text

    kept = []
    used = 0
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        # The joining space is only needed from the second sentence on.
        needed = len(sentence) + (1 if kept else 0)
        if used + needed > max_chars:
            break
        kept.append(sentence)
        used += needed

    if kept:
        return " ".join(kept).strip()

    # No whole sentence fits: keep a truncated prefix so the memory is not
    # lost entirely, preferring a word boundary over a mid-word cut.
    clipped = text[:max_chars]
    if not text[max_chars].isspace():
        head, sep, _ = clipped.rpartition(" ")
        if sep and head.strip():
            clipped = head
    return clipped.strip()
 # =========================
 # REQUEST MODEL
 # =========================
     old_memory        = req.old_memory.strip() if req.old_memory.strip() else "(none)"
     user_message      = req.user_message.strip()
+    assistant_message = clean_assistant_message(req.assistant_message)
     user_content = f"""EXISTING MEMORY: {old_memory}
 USER SAID: {user_message}
         {"role": "user",   "content": user_content},
     ]
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         return_tensors="pt"
     ).to(model.device)
     output = model.generate(
         **inputs,
+        max_new_tokens=220,
         do_sample=False,
         repetition_penalty=1.15,
         eos_token_id=tokenizer.eos_token_id,
     )
     result = tokenizer.decode(
         output[0][inputs.input_ids.shape[1]:],
         skip_special_tokens=True
             lines.append(line)
     result = " ".join(lines).strip()
+    result = re.sub(r"\s{2,}", " ", result).strip()
     # =========================
+    # HARD MEMORY LENGTH CAP
     # =========================
+    result = enforce_memory_limit(result, max_chars=600)
     return {"memory": result}
         "device": device.upper()
     }
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)