Spaces:

Valtry
/

summarizer

Sleeping

App Files Files Community

Valtry committed about 1 month ago

Commit

756a711

verified ·

1 Parent(s): 52b7f3e

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -62

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import uvicorn
 # =========================
@@ -11,86 +12,121 @@ import uvicorn
 app = FastAPI()
 # =========================
-# MODEL
 # =========================
-MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
-print("🚀 Loading Memory Summarizer...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    device_map="auto"
 )
-print(f"✅ Loaded on {device.upper()}")
 # =========================
 # SYSTEM PROMPT
 # =========================
-SYSTEM_PROMPT = """You are a memory compression engine.
 EXAMPLE 1:
-EXISTING MEMORY: User building a todo app with React and Firebase.
-USER SAID: Can I add offline support?
-ASSISTANT REPLIED: Use Firebase offline persistence by enabling it in the SDK config.
-UPDATED MEMORY: User building todo app with React and Firebase. Offline persistence enabled via Firebase SDK config.
 EXAMPLE 2:
-EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. Using JWT for auth.
-USER SAID: How do I add rate limiting?
-ASSISTANT REPLIED: Use slowapi library with FastAPI. Attach the limiter to the app instance and decorate routes.
-UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL using JWT auth and slowapi-based rate limiting on routes.
 EXAMPLE 3:
-EXISTING MEMORY: User building CLI tool in Python to rename files in bulk. Uses argparse and pathlib.
-USER SAID: I want to add a dry-run mode that shows changes without applying them.
-ASSISTANT REPLIED: Add a --dry-run flag via argparse. When set, print the rename operations instead of executing them.
-UPDATED MEMORY: User building Python CLI bulk rename tool using argparse and pathlib. Supports dry-run mode via --dry-run flag that prints operations without executing.
 EXAMPLE 4:
-EXISTING MEMORY: (none)
-USER SAID: I am building an e-commerce backend with Django and Stripe for payments.
-ASSISTANT REPLIED: Use stripe-python SDK directly. Handle webhooks via a dedicated endpoint with signature verification.
-UPDATED MEMORY: User building e-commerce backend with Django and Stripe. Payments via stripe-python SDK with webhook endpoint using signature verification.
 EXAMPLE 5:
-EXISTING MEMORY: User building AI chatbot with FastAPI and Supabase for storage. Supports streaming responses.
-USER SAID: How do I add conversation branching?
-ASSISTANT REPLIED: Store a parent_message_id on each message in Supabase. Query by branch to reconstruct any conversation path.
-UPDATED MEMORY: User building AI chatbot with FastAPI and Supabase. Supports streaming responses and conversation branching via parent_message_id stored per message.
 EXAMPLE 6:
-EXISTING MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing.
-USER SAID: I want to add face detection now.
-ASSISTANT REPLIED: Use OpenCV Haar cascades or switch to mediapipe for better accuracy on varied lighting.
-UPDATED MEMORY: User building image processing pipeline in Python using OpenCV. Handles batch resizing and face detection via Haar cascades or mediapipe for varied lighting.
 EXAMPLE 7:
-EXISTING MEMORY: (none)
-USER SAID: I want to build a habit tracker mobile app using Flutter and SQLite.
-ASSISTANT REPLIED: Use sqflite package for SQLite in Flutter. Store habits and daily completion records in separate tables.
-UPDATED MEMORY: User building Flutter habit tracker app using sqflite for SQLite storage. Habits and daily completion records stored in separate tables.
 EXAMPLE 8:
-EXISTING MEMORY: User building a portfolio website with Next.js and Tailwind. Deployed on Vercel.
-USER SAID: How do I add a blog section with markdown support?
-ASSISTANT REPLIED: Use next-mdx-remote to parse and render markdown files. Store posts as .mdx files in a /content folder.
-UPDATED MEMORY: User building portfolio website with Next.js and Tailwind deployed on Vercel. Blog section added using next-mdx-remote with .mdx files stored in /content folder.
-Now do the same task.
-Rules:
-- Merge EXISTING MEMORY with the new conversation into one updated memory.
-- Preserve all technical details: stack, frameworks, APIs, models, tools, databases, architecture decisions, unfinished tasks, user preferences.
-- Never drop existing memory facts unless directly contradicted by new information.
-- Write in third-person. No "you". No "I".
-- Output ONLY the updated memory. No labels. No explanation. No bullet points. No extra text."""
 # =========================
 # REQUEST MODEL
@@ -108,9 +144,9 @@ class SummaryRequest(BaseModel):
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
-    old_memory = req.old_memory.strip() if req.old_memory.strip() else "(none)"
-    user_message = req.user_message.strip()
-    assistant_message = req.assistant_message.strip()[:600]
     user_content = f"""EXISTING MEMORY: {old_memory}
 USER SAID: {user_message}
@@ -159,27 +195,32 @@ UPDATED MEMORY:"""
     ).strip()
     # =========================
-    # CLEAN OUTPUT
     # =========================
     stop_phrases = [
-        "<|im_end|>",
-        "<|endoftext|>",
-        "UPDATED MEMORY:",
-        "EXISTING MEMORY:",
-        "USER SAID:",
-        "ASSISTANT REPLIED:",
-        "EXAMPLE ",
-        "Now do the same",
-        "Assistant:",
-        "User:",
     ]
     for phrase in stop_phrases:
         if phrase in result:
             result = result.split(phrase)[0].strip()
-    # Deduplicate lines
     seen, lines = set(), []
     for line in result.splitlines():
@@ -190,6 +231,12 @@ UPDATED MEMORY:"""
     result = " ".join(lines).strip()
     return {"memory": result}
 # =========================
@@ -198,7 +245,11 @@ UPDATED MEMORY:"""
 @app.get("/")
 def root():
-    return {"status": "Memory Summarizer Running 🚀"}
 # =========================
 # RUN

 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import re
 import uvicorn
 # =========================
 app = FastAPI()
 # =========================
+# MODEL CONFIG
 # =========================
+# Set MODEL_ID below to one of these to trade memory footprint for summary quality:
+# "Qwen/Qwen2.5-0.5B-Instruct"    → lightest, weakest
+# "Qwen/Qwen2.5-1.5B-Instruct"    → recommended sweet spot
+# "Qwen/Qwen2.5-3B-Instruct"      → best Qwen quality, tight on free tier
+# "HuggingFaceTB/SmolLM2-1.7B-Instruct" → good alternative
+# "meta-llama/Llama-3.2-1B-Instruct"    → good, needs HF token
+# "meta-llama/Llama-3.2-3B-Instruct"    → strong, needs HF token
+# "google/gemma-2-2b-it"                → solid, needs HF token
+MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
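+# Gated models that require a Hugging Face access token (set HF_TOKEN in the Space secrets).
+# HF_TOKEN is only passed to from_pretrained when MODEL_ID matches one of these entries.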
+GATED_MODELS = [
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
+    "google/gemma-2-2b-it",
+    "microsoft/Phi-3.5-mini-instruct",
+]
+print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")
 device = "cuda" if torch.cuda.is_available() else "cpu"
+import os
+hf_token = os.environ.get("HF_TOKEN", None)
+use_token = hf_token if any(m in MODEL_ID for m in GATED_MODELS) else None
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=use_token)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    device_map="auto",
+    token=use_token
 )
+print(f"✅ Loaded {MODEL_ID} on {device.upper()}")
 # =========================
 # SYSTEM PROMPT
 # =========================
+SYSTEM_PROMPT = """You are a memory compression engine. Your only job is to merge facts.
 EXAMPLE 1:
+EXISTING MEMORY: (none)
+USER SAID: I am building a chat app with Node.js and MongoDB.
+ASSISTANT REPLIED: Use Socket.io rooms. Store messages with roomId and timestamp.
+UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
 EXAMPLE 2:
+EXISTING MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp.
+USER SAID: How do I add authentication?
+ASSISTANT REPLIED: Use JWT. Verify token on every Socket.io connection via middleware.
+UPDATED MEMORY: User building chat app with Node.js, MongoDB, and Socket.io. Messages stored with roomId and timestamp. Auth via JWT verified on Socket.io connections through middleware.
 EXAMPLE 3:
+EXISTING MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented.
+USER SAID: How do I add rate limiting?
+ASSISTANT REPLIED: Use slowapi. Attach limiter to app and decorate routes.
+UPDATED MEMORY: User building REST API with FastAPI and PostgreSQL. JWT auth implemented. Rate limiting via slowapi on routes.
 EXAMPLE 4:
+EXISTING MEMORY: User building SaaS dashboard with Next.js and FastAPI. PostgreSQL for database.
+USER SAID: Should I use REST or GraphQL?
+ASSISTANT REPLIED: Use REST for fixed data shapes. GraphQL for flexible querying.
+UPDATED MEMORY: User building SaaS dashboard with Next.js and FastAPI using PostgreSQL. Chose REST over GraphQL due to fixed data shapes.
 EXAMPLE 5:
+EXISTING MEMORY: User building Python scraper with BeautifulSoup. Stores results in CSV.
+USER SAID: My scraper gets blocked after 50 requests.
+ASSISTANT REPLIED: Add random delays, rotate user-agent headers, use proxy pool.
+UPDATED MEMORY: User building Python scraper with BeautifulSoup storing results in CSV. Anti-blocking via random delays, rotating user-agent headers, and proxy pool.
 EXAMPLE 6:
+EXISTING MEMORY: User building mobile app in React Native with Firebase.
+USER SAID: I am switching from Firebase to Supabase.
+ASSISTANT REPLIED: Replace Firebase Auth with Supabase Auth. Replace Firestore with Supabase PostgreSQL.
+UPDATED MEMORY: User building mobile app in React Native. Switched from Firebase to Supabase. Auth via Supabase Auth, database via Supabase PostgreSQL.
 EXAMPLE 7:
+EXISTING MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending.
+USER SAID: How do I send order confirmation emails?
+ASSISTANT REPLIED: Use Django send_mail or SendGrid. Trigger inside Stripe webhook on payment_intent.succeeded.
+UPDATED MEMORY: User building e-commerce site with Django and Stripe. Cart and product pages done. Checkout pending. Order confirmation emails via SendGrid triggered on payment_intent.succeeded inside Stripe webhook.
 EXAMPLE 8:
+EXISTING MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations.
+USER SAID: I want to add long-term memory to avoid token limit issues.
+ASSISTANT REPLIED: Use Qwen2.5-0.5B to recursively summarize memory. Store in Supabase. Inject before recent chat history. Truncate large responses before summarizing.
+UPDATED MEMORY: User building local AI assistant with FastAPI and llama.cpp. Supports streaming and branching conversations. Long-term memory via Qwen2.5-0.5B recursive summarization stored in Supabase, injected before recent history. Large responses truncated before summarizing.
+STRICT RULES:
+- Output ONLY the updated memory. No labels. No preamble. No explanation.
+- Keep ALL facts from EXISTING MEMORY unless directly contradicted.
+- Add only new facts from USER SAID and ASSISTANT REPLIED.
+- No filler: no "ensuring", "enhances", "maintaining", "this setup", "this approach".
+- No questions. No advice. No "you". No "I".
+- One short dense paragraph only."""
+# =========================
+# FILLER PATTERNS TO STRIP (regexes; each match runs from the filler word through the next period)
+# =========================
+FILLER_PATTERNS = [
+    r"This (setup|approach|system|solution|architecture|method|design)\b[^.]*\.",
+    r"ensuring\s[^.]*\.",
+    r"while maintaining\s[^.]*\.",
+    r"enhances\s[^.]*\.",
+    r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
+    r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
+]
 # =========================
 # REQUEST MODEL
 @app.post("/generate-summary")
 def generate_summary(req: SummaryRequest):
+    old_memory        = req.old_memory.strip() if req.old_memory.strip() else "(none)"
+    user_message      = req.user_message.strip()
+    assistant_message = req.assistant_message.strip()[:500]
     user_content = f"""EXISTING MEMORY: {old_memory}
 USER SAID: {user_message}
     ).strip()
     # =========================
+    # CLEAN — stop phrases
     # =========================
     stop_phrases = [
+        "<|im_end|>", "<|endoftext|>",
+        "UPDATED MEMORY:", "EXISTING MEMORY:",
+        "USER SAID:", "ASSISTANT REPLIED:",
+        "STRICT RULES:", "EXAMPLE ",
+        "Assistant:", "User:",
     ]
     for phrase in stop_phrases:
         if phrase in result:
             result = result.split(phrase)[0].strip()
+    # =========================
+    # CLEAN — strip filler
+    # =========================
+    for pattern in FILLER_PATTERNS:
+        result = re.sub(pattern, "", result, flags=re.IGNORECASE)
+    # =========================
+    # CLEAN — deduplicate lines
+    # =========================
     seen, lines = set(), []
     for line in result.splitlines():
     result = " ".join(lines).strip()
+    # =========================
+    # CLEAN — collapse whitespace runs (spaces, tabs, newlines) into a single space
+    # =========================
+    result = re.sub(r"\s{2,}", " ", result).strip()
     return {"memory": result}
 # =========================
 @app.get("/")
 def root():
+    return {
+        "status": "Memory Summarizer Running 🚀",
+        "model": MODEL_ID,
+        "device": device.upper()
+    }
 # =========================
 # RUN