from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
import uvicorn

# =========================
# APP
# =========================

app = FastAPI()

# =========================
# MODEL
# =========================

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

print(f"🚀 Loading Memory Summarizer — {MODEL_ID}")

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = model.to(device)

print(f"✅ Loaded on {device.upper()}")

# =========================
# SYSTEM PROMPT
# =========================

SYSTEM_PROMPT = """You are a memory compression engine. Compress and merge facts into dense paragraphs.

EXAMPLE 1:
EXISTING MEMORY: (none)
USER SAID: I am building a weather app using React and OpenWeatherMap API.
ASSISTANT REPLIED: Fetch data with axios. Store API key in .env via process.env.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.

EXAMPLE 2:
EXISTING MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key stored in .env via process.env.
USER SAID: How do I cache the weather data so I do not hit the API limit?
ASSISTANT REPLIED: Use localStorage to cache responses with a timestamp. If cache is under 10 minutes old, return it instead of calling the API.
UPDATED MEMORY: User building React weather app using OpenWeatherMap API. Data fetched via axios. API key in .env. Responses cached in localStorage with 10-minute timestamp expiry to avoid API rate limit.

EXAMPLE 3:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location.
USER SAID: How do job seekers apply for a job?
ASSISTANT REPLIED: Create Application model with ForeignKey to Job and User, status field (applied, reviewed, rejected, accepted), resume FileField stored in S3.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.

EXAMPLE 4:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Custom user model with company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has ForeignKey to Job and User, status field (applied/reviewed/rejected/accepted), resume FileField stored in S3.
USER SAID: I want to add search and filters for title, location, and salary range.
ASSISTANT REPLIED: Use Django Q objects and django-filter. Add query params title, location, salary_min, salary_max to job list endpoint.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.

EXAMPLE 5:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search and filtering via django-filter and Q objects on title, location, salary_min, salary_max query params.
USER SAID: How do I notify applicants when their application status changes?
ASSISTANT REPLIED: Use Django signals. On Application post_save, detect status change and trigger email via Celery async task using SendGrid.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.

EXAMPLE 6:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field (applied/reviewed/rejected/accepted) and S3 resume. Job search via django-filter and Q objects. Status change notifications triggered via Django signals on Application post_save, sending emails via Celery tasks and SendGrid.
USER SAID: How do I deploy this on a VPS?
ASSISTANT REPLIED: Docker Compose with services for Django, React, PostgreSQL, Redis, Celery. Serve Django via Gunicorn behind nginx. Certbot for SSL. Secrets in .env file.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.

EXAMPLE 7:
EXISTING MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx reverse proxy, Certbot SSL, secrets in .env.
USER SAID: What is still left to build?
ASSISTANT REPLIED: Admin panel for moderating posts, pagination on job listings, rate limiting on API, frontend loading states and error handling.
UPDATED MEMORY: User building job board with Django, React, PostgreSQL, Redis, Celery, SendGrid. JWT auth via djangorestframework-simplejwt. Company and jobseeker roles. Job model has title, description, skills, salary range, location. Application model has status field and S3 resume. Job search via django-filter and Q objects. Status change emails via Django signals and Celery tasks. Deployed via Docker Compose with Gunicorn, nginx, Certbot SSL. Pending: admin moderation panel, pagination, API rate limiting, frontend loading states and error handling.

STRICT RULES:
- Output ONLY the updated memory. No labels. No preamble. No explanation.
- COMPRESS the existing memory. Do not copy it verbatim. Rewrite it shorter and denser.
- Keep ALL technical facts: stack, frameworks, APIs, models, field names, architecture decisions, unfinished tasks, user preferences.
- Add new facts merged in naturally, not appended as separate sentences.
- No filler: no "ensuring", "enhances", "this setup", "this approach", "in order to", "it is worth noting".
- No questions. No advice. No "you". No "I".
- Dense technical paragraph. Maximum 8 sentences."""

# =========================
# FILLER PATTERNS
# =========================

FILLER_PATTERNS = [
    r"This (setup|approach|system|solution|architecture|method|design)\b[^.]*\.",
    r"ensuring\s[^.]*\.",
    r"while maintaining\s[^.]*\.",
    r"enhances\s[^.]*\.",
    r"This (ensures|allows|enables|provides|helps|makes|improves)\s[^.]*\.",
    r"for (better|improved|efficient|effective|optimal)\s[^.]*\.",
    r"in order to\s[^.]*\.",
    r"To (enhance|improve|ensure|enable)\s[^.]*\.",
    r"It is worth noting that\s[^.]*\.",
    r"Additionally,\s*(it|this)\s[^.]*\.",
]

# =========================
# MEMORY LIMIT CONFIG
# =========================

MEMORY_SOFT_LIMIT = 1600   # ~400 tokens — compress aggressively beyond this
MEMORY_HARD_LIMIT = 2000   # ~500 tokens — absolute cap, never exceed


# =========================
# HELPERS
# =========================

def clean_assistant_message(text: str) -> str:
    """
    Strip code blocks and inline code backticks from assistant responses.
    Model picks up function/class names from prose naturally.
    Cap at 800 chars to give model more context from long responses.
    """
    # Remove full code blocks entirely
    text = re.sub(r"```[\w]*\n?.*?```", "", text, flags=re.DOTALL)

    # Remove inline code backticks but keep the text inside
    text = re.sub(r"`([^`]+)`", r"\1", text)

    # Collapse whitespace
    text = re.sub(r"\s{2,}", " ", text).strip()

    return text[:800]


def enforce_memory_limit(text: str) -> str:
    """
    Three-stage memory length enforcement.

    Stage 1 — Under 1600 chars (~400 tokens):
        Memory is healthy. Return as-is.

    Stage 2 — Between 1600 and 2000 chars (soft limit):
        Memory is getting long. Keep complete sentences
        that fit within 2000 chars. Oldest appended facts
        may be trimmed; core stack in early sentences is preserved.

    Stage 3 — Over 2000 chars (hard limit):
        Force trim to last complete sentence before 2000 chars.
        Never cuts mid-sentence.
    """
    # Stage 1 — healthy
    if len(text) <= MEMORY_SOFT_LIMIT:
        return text

    # Stage 2 — soft limit: trim to complete sentences within hard limit
    if len(text) <= MEMORY_HARD_LIMIT:
        sentences = re.split(r"(?<=[.!?])\s+", text)
        result = ""
        for sentence in sentences:
            candidate = (result + " " + sentence).strip()
            if len(candidate) <= MEMORY_HARD_LIMIT:
                result = candidate
            else:
                break
        return result.strip()

    # Stage 3 — hard limit: force trim at last period before 2000 chars
    trimmed = text[:MEMORY_HARD_LIMIT]
    last_period = trimmed.rfind(".")
    if last_period != -1:
        trimmed = trimmed[:last_period + 1]

    return trimmed.strip()


def strip_backticks(text: str) -> str:
    """Remove any backtick formatting that leaks into memory output."""
    return re.sub(r"`([^`]+)`", r"\1", text)

# =========================
# REQUEST MODEL
# =========================

class SummaryRequest(BaseModel):
    old_memory: str = ""
    user_message: str
    assistant_message: str

# =========================
# SUMMARY ENDPOINT
# =========================

@app.post("/generate-summary")
def generate_summary(req: SummaryRequest):

    old_memory        = req.old_memory.strip() if req.old_memory.strip() else "(none)"
    user_message      = req.user_message.strip()
    assistant_message = clean_assistant_message(req.assistant_message)

    user_content = f"""EXISTING MEMORY: {old_memory}
USER SAID: {user_message}
ASSISTANT REPLIED: {assistant_message}
UPDATED MEMORY:"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user",   "content": user_content},
    ]

    # =========================
    # FORMAT CHAT
    # =========================

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(
        text,
        return_tensors="pt"
    ).to(model.device)

    # =========================
    # GENERATE
    # =========================

    output = model.generate(
        **inputs,
        max_new_tokens=400,
        do_sample=False,
        repetition_penalty=1.15,
        eos_token_id=tokenizer.eos_token_id,
    )

    # =========================
    # DECODE
    # =========================

    result = tokenizer.decode(
        output[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    ).strip()

    # =========================
    # CLEAN — stop phrases
    # =========================

    stop_phrases = [
        "<|im_end|>", "<|endoftext|>",
        "UPDATED MEMORY:", "EXISTING MEMORY:",
        "USER SAID:", "ASSISTANT REPLIED:",
        "STRICT RULES:", "EXAMPLE ",
        "Assistant:", "User:",
    ]

    for phrase in stop_phrases:
        if phrase in result:
            result = result.split(phrase)[0].strip()

    # =========================
    # CLEAN — strip filler
    # =========================

    for pattern in FILLER_PATTERNS:
        result = re.sub(pattern, "", result, flags=re.IGNORECASE)

    # =========================
    # CLEAN — strip backticks
    # =========================

    result = strip_backticks(result)

    # =========================
    # CLEAN — deduplicate lines
    # =========================

    seen, lines = set(), []

    for line in result.splitlines():
        line = line.strip()
        if line and line not in seen:
            seen.add(line)
            lines.append(line)

    result = " ".join(lines).strip()
    result = re.sub(r"\s{2,}", " ", result).strip()

    # =========================
    # HARD MEMORY LENGTH CAP
    # =========================

    result = enforce_memory_limit(result)

    return {"memory": result}

# =========================
# HEALTH
# =========================

@app.get("/")
def root():
    return {
        "status": "Memory Summarizer Running 🚀",
        "model": MODEL_ID,
        "device": device.upper(),
        "memory_soft_limit": f"{MEMORY_SOFT_LIMIT} chars (~400 tokens)",
        "memory_hard_limit": f"{MEMORY_HARD_LIMIT} chars (~500 tokens)",
    }

# =========================
# RUN
# =========================

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)